-rw-r--r-- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7
-rw-r--r-- Documentation/admin-guide/mm/damon/usage.rst | 11
-rw-r--r-- Documentation/admin-guide/mm/transhuge.rst | 36
-rw-r--r-- Documentation/filesystems/proc.rst | 5
-rw-r--r-- Documentation/mm/damon/design.rst | 18
-rw-r--r-- MAINTAINERS | 1
-rw-r--r-- arch/arc/include/asm/arcregs.h | 3
-rw-r--r-- arch/arc/mm/cache.c | 8
-rw-r--r-- arch/arc/mm/tlb.c | 2
-rw-r--r-- arch/arm/include/asm/hugetlb.h | 2
-rw-r--r-- arch/arm/mm/copypage-v4mc.c | 2
-rw-r--r-- arch/arm/mm/copypage-v6.c | 2
-rw-r--r-- arch/arm/mm/copypage-xscale.c | 2
-rw-r--r-- arch/arm/mm/dma-mapping.c | 2
-rw-r--r-- arch/arm/mm/fault-armv.c | 2
-rw-r--r-- arch/arm/mm/flush.c | 10
-rw-r--r-- arch/arm64/include/asm/hugetlb.h | 6
-rw-r--r-- arch/arm64/include/asm/mte.h | 16
-rw-r--r-- arch/arm64/mm/flush.c | 8
-rw-r--r-- arch/csky/abiv1/cacheflush.c | 6
-rw-r--r-- arch/mips/include/asm/cacheflush.h | 6
-rw-r--r-- arch/nios2/mm/cacheflush.c | 6
-rw-r--r-- arch/openrisc/include/asm/cacheflush.h | 2
-rw-r--r-- arch/openrisc/mm/cache.c | 2
-rw-r--r-- arch/parisc/kernel/cache.c | 6
-rw-r--r-- arch/powerpc/include/asm/cacheflush.h | 4
-rw-r--r-- arch/powerpc/include/asm/kvm_ppc.h | 4
-rw-r--r-- arch/powerpc/mm/book3s64/hash_utils.c | 4
-rw-r--r-- arch/powerpc/mm/pgtable.c | 12
-rw-r--r-- arch/powerpc/mm/ptdump/8xx.c | 2
-rw-r--r-- arch/powerpc/mm/ptdump/book3s64.c | 2
-rw-r--r-- arch/powerpc/mm/ptdump/ptdump.h | 4
-rw-r--r-- arch/powerpc/mm/ptdump/shared.c | 2
-rw-r--r-- arch/powerpc/platforms/pseries/cmm.c | 2
-rw-r--r-- arch/riscv/include/asm/cacheflush.h | 4
-rw-r--r-- arch/riscv/include/asm/hugetlb.h | 2
-rw-r--r-- arch/riscv/include/asm/pgtable.h | 11
-rw-r--r-- arch/riscv/mm/cacheflush.c | 4
-rw-r--r-- arch/s390/include/asm/hugetlb.h | 2
-rw-r--r-- arch/s390/kernel/uv.c | 12
-rw-r--r-- arch/s390/mm/gmap.c | 2
-rw-r--r-- arch/s390/mm/hugetlbpage.c | 2
-rw-r--r-- arch/s390/mm/mmap.c | 4
-rw-r--r-- arch/sh/include/asm/hugetlb.h | 2
-rw-r--r-- arch/sh/mm/cache-sh4.c | 2
-rw-r--r-- arch/sh/mm/cache-sh7705.c | 2
-rw-r--r-- arch/sh/mm/cache.c | 14
-rw-r--r-- arch/sh/mm/kmap.c | 2
-rw-r--r-- arch/sparc/kernel/sys_sparc_64.c | 4
-rw-r--r-- arch/sparc/mm/init_64.c | 10
-rw-r--r-- arch/x86/mm/init.c | 1
-rw-r--r-- arch/x86/mm/mmap.c | 4
-rw-r--r-- arch/x86/mm/pat/memtype.c | 6
-rw-r--r-- arch/xtensa/mm/cache.c | 12
-rw-r--r-- block/blk-lib.c | 15
-rw-r--r-- drivers/base/node.c | 5
-rw-r--r-- drivers/block/zram/zram_drv.c | 23
-rw-r--r-- drivers/char/mem.c | 21
-rw-r--r-- drivers/misc/vmw_balloon.c | 4
-rw-r--r-- drivers/virtio/virtio_balloon.c | 2
-rw-r--r-- drivers/xen/Kconfig | 1
-rw-r--r-- drivers/xen/gntdev.c | 5
-rw-r--r-- fs/Kconfig | 1
-rw-r--r-- fs/aio.c | 2
-rw-r--r-- fs/bcachefs/darray.c | 2
-rw-r--r-- fs/bcachefs/fs-io-buffered.c | 13
-rw-r--r-- fs/bcachefs/util.h | 2
-rw-r--r-- fs/btrfs/disk-io.c | 1
-rw-r--r-- fs/btrfs/inode.c | 4
-rw-r--r-- fs/coredump.c | 4
-rw-r--r-- fs/dax.c | 47
-rw-r--r-- fs/exec.c | 2
-rw-r--r-- fs/fuse/dev.c | 2
-rw-r--r-- fs/gfs2/glops.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 4
-rw-r--r-- fs/jffs2/file.c | 4
-rw-r--r-- fs/jfs/jfs_metapage.c | 8
-rw-r--r-- fs/nilfs2/page.c | 2
-rw-r--r-- fs/ntfs3/inode.c | 15
-rw-r--r-- fs/pidfs.c | 7
-rw-r--r-- fs/proc/array.c | 2
-rw-r--r-- fs/proc/base.c | 12
-rw-r--r-- fs/proc/internal.h | 15
-rw-r--r-- fs/proc/page.c | 4
-rw-r--r-- fs/proc/task_mmu.c | 190
-rw-r--r-- fs/proc/task_nommu.c | 14
-rw-r--r-- fs/ubifs/file.c | 6
-rw-r--r-- include/asm-generic/memory_model.h | 2
-rw-r--r-- include/linux/blkdev.h | 2
-rw-r--r-- include/linux/bpfptr.h | 2
-rw-r--r-- include/linux/damon.h | 17
-rw-r--r-- include/linux/huge_mm.h | 100
-rw-r--r-- include/linux/khugepaged.h | 6
-rw-r--r-- include/linux/ksm.h | 6
-rw-r--r-- include/linux/maple_tree.h | 16
-rw-r--r-- include/linux/memcontrol.h | 10
-rw-r--r-- include/linux/mempool.h | 2
-rw-r--r-- include/linux/memremap.h | 39
-rw-r--r-- include/linux/migrate.h | 11
-rw-r--r-- include/linux/mm.h | 116
-rw-r--r-- include/linux/mm_inline.h | 12
-rw-r--r-- include/linux/mm_types.h | 103
-rw-r--r-- include/linux/mman.h | 2
-rw-r--r-- include/linux/mmap_lock.h | 85
-rw-r--r-- include/linux/mmzone.h | 47
-rw-r--r-- include/linux/netfs.h | 2
-rw-r--r-- include/linux/oom.h | 2
-rw-r--r-- include/linux/page-flags.h | 40
-rw-r--r-- include/linux/pageblock-flags.h | 12
-rw-r--r-- include/linux/pagemap.h | 2
-rw-r--r-- include/linux/pagevec.h | 4
-rw-r--r-- include/linux/pgalloc_tag.h | 7
-rw-r--r-- include/linux/pgtable.h | 26
-rw-r--r-- include/linux/rmap.h | 62
-rw-r--r-- include/linux/sched/coredump.h | 18
-rw-r--r-- include/linux/slab.h | 39
-rw-r--r-- include/linux/swap.h | 8
-rw-r--r-- include/linux/vmalloc.h | 12
-rw-r--r-- include/linux/writeback.h | 6
-rw-r--r-- include/trace/events/cma.h | 19
-rw-r--r-- include/trace/events/kmem.h | 5
-rw-r--r-- include/trace/events/page_ref.h | 4
-rw-r--r-- include/uapi/linux/mempolicy.h | 12
-rw-r--r-- include/uapi/linux/prctl.h | 10
-rw-r--r-- kernel/events/uprobes.c | 32
-rw-r--r-- kernel/fork.c | 9
-rw-r--r-- kernel/kexec_handover.c | 11
-rw-r--r-- kernel/rcu/rcuscale.c | 2
-rw-r--r-- kernel/sched/fair.c | 10
-rw-r--r-- kernel/sys.c | 69
-rw-r--r-- lib/maple_tree.c | 12
-rw-r--r-- lib/rhashtable.c | 4
-rw-r--r-- lib/test_hmm.c | 2
-rw-r--r-- lib/test_kho.c | 52
-rw-r--r-- lib/test_maple_tree.c | 2
-rw-r--r-- lib/xarray.c | 2
-rw-r--r-- mm/Kconfig | 18
-rw-r--r-- mm/backing-dev.c | 2
-rw-r--r-- mm/cma.c | 2
-rw-r--r-- mm/damon/Kconfig | 2
-rw-r--r-- mm/damon/core.c | 95
-rw-r--r-- mm/damon/ops-common.c | 11
-rw-r--r-- mm/damon/ops-common.h | 2
-rw-r--r-- mm/damon/paddr.c | 130
-rw-r--r-- mm/damon/sysfs.c | 41
-rw-r--r-- mm/damon/tests/core-kunit.h | 38
-rw-r--r-- mm/damon/tests/vaddr-kunit.h | 2
-rw-r--r-- mm/damon/vaddr.c | 105
-rw-r--r-- mm/debug.c | 4
-rw-r--r-- mm/filemap.c | 47
-rw-r--r-- mm/gup.c | 12
-rw-r--r-- mm/huge_memory.c | 224
-rw-r--r-- mm/hugetlb.c | 8
-rw-r--r-- mm/internal.h | 5
-rw-r--r-- mm/kasan/init.c | 4
-rw-r--r-- mm/kasan/kasan_test_c.c | 40
-rw-r--r-- mm/khugepaged.c | 34
-rw-r--r-- mm/ksm.c | 32
-rw-r--r-- mm/memcontrol.c | 7
-rw-r--r-- mm/memory-failure.c | 12
-rw-r--r-- mm/memory.c | 352
-rw-r--r-- mm/migrate.c | 85
-rw-r--r-- mm/migrate_device.c | 2
-rw-r--r-- mm/mincore.c | 71
-rw-r--r-- mm/mmap.c | 8
-rw-r--r-- mm/mmap_lock.c | 109
-rw-r--r-- mm/mmu_gather.c | 4
-rw-r--r-- mm/mmzone.c | 4
-rw-r--r-- mm/nommu.c | 17
-rw-r--r-- mm/oom_kill.c | 26
-rw-r--r-- mm/page-writeback.c | 46
-rw-r--r-- mm/page_alloc.c | 47
-rw-r--r-- mm/pagewalk.c | 20
-rw-r--r-- mm/rmap.c | 128
-rw-r--r-- mm/shmem.c | 16
-rw-r--r-- mm/slab.h | 6
-rw-r--r-- mm/slub.c | 77
-rw-r--r-- mm/sparse.c | 6
-rw-r--r-- mm/swap.c | 10
-rw-r--r-- mm/swap.h | 10
-rw-r--r-- mm/swap_state.c | 38
-rw-r--r-- mm/swapfile.c | 114
-rw-r--r-- mm/userfaultfd.c | 222
-rw-r--r-- mm/util.c | 6
-rw-r--r-- mm/vma.h | 2
-rw-r--r-- mm/vma_init.c | 2
-rw-r--r-- mm/vmalloc.c | 31
-rw-r--r-- mm/vmscan.c | 31
-rw-r--r-- mm/vmstat.c | 2
-rw-r--r-- mm/workingset.c | 2
-rw-r--r-- mm/zsmalloc.c | 4
-rw-r--r-- mm/zswap.c | 62
-rw-r--r-- rust/helpers/slab.c | 10
-rw-r--r-- rust/helpers/vmalloc.c | 5
-rw-r--r-- rust/kernel/alloc.rs | 54
-rw-r--r-- rust/kernel/alloc/allocator.rs | 105
-rw-r--r-- rust/kernel/alloc/allocator_test.rs | 3
-rw-r--r-- rust/kernel/alloc/kbox.rs | 4
-rw-r--r-- rust/kernel/alloc/kvec.rs | 11
-rw-r--r-- rust/kernel/mm.rs | 3
-rw-r--r-- rust/kernel/mm/mmput_async.rs | 2
-rw-r--r-- tools/include/linux/atomic.h | 22
-rw-r--r-- tools/testing/radix-tree/maple.c | 10
-rw-r--r-- tools/testing/selftests/damon/Makefile | 2
-rw-r--r-- tools/testing/selftests/damon/access_memory_even.c | 1
-rwxr-xr-x tools/testing/selftests/damon/sysfs_no_op_commit_break.py | 72
-rw-r--r-- tools/testing/selftests/kho/init.c | 13
-rwxr-xr-x tools/testing/selftests/kho/vmtest.sh | 28
-rw-r--r-- tools/testing/selftests/kselftest.h | 4
-rw-r--r-- tools/testing/selftests/landlock/audit.h | 6
-rw-r--r-- tools/testing/selftests/landlock/common.h | 4
-rw-r--r-- tools/testing/selftests/mm/.gitignore | 2
-rw-r--r-- tools/testing/selftests/mm/Makefile | 4
-rw-r--r-- tools/testing/selftests/mm/cow.c | 13
-rw-r--r-- tools/testing/selftests/mm/hugepage-mremap.c | 16
-rw-r--r-- tools/testing/selftests/mm/ksm_functional_tests.c | 156
-rw-r--r-- tools/testing/selftests/mm/mremap_test.c | 6
-rw-r--r-- tools/testing/selftests/mm/pagemap_ioctl.c | 24
-rw-r--r-- tools/testing/selftests/mm/pfnmap.c | 48
-rw-r--r-- tools/testing/selftests/mm/pkey-helpers.h | 3
-rw-r--r-- tools/testing/selftests/mm/prctl_thp_disable.c | 291
-rw-r--r-- tools/testing/selftests/mm/rmap.c | 433
-rwxr-xr-x tools/testing/selftests/mm/run_vmtests.sh | 14
-rw-r--r-- tools/testing/selftests/mm/split_huge_page_test.c | 353
-rwxr-xr-x tools/testing/selftests/mm/test_vmalloc.sh | 6
-rw-r--r-- tools/testing/selftests/mm/thp_settings.c | 9
-rw-r--r-- tools/testing/selftests/mm/thp_settings.h | 1
-rw-r--r-- tools/testing/selftests/mm/thuge-gen.c | 11
-rw-r--r-- tools/testing/selftests/mm/uffd-stress.c | 19
-rw-r--r-- tools/testing/selftests/mm/uffd-wp-mremap.c | 9
-rw-r--r-- tools/testing/selftests/mm/virtual_address_range.c | 13
-rw-r--r-- tools/testing/selftests/mm/vm_util.c | 150
-rw-r--r-- tools/testing/selftests/mm/vm_util.h | 17
-rw-r--r-- tools/testing/selftests/net/psock_lib.h | 4
-rw-r--r-- tools/testing/selftests/perf_events/watermark_signal.c | 2
-rw-r--r-- tools/testing/selftests/proc/proc-maps-race.c | 65
-rw-r--r-- tools/testing/selftests/ublk/utils.h | 2
-rw-r--r-- tools/testing/shared/linux/maple_tree.h | 6
-rw-r--r-- tools/testing/shared/shared.mk | 6
-rw-r--r-- tools/testing/vma/linux/atomic.h | 17
-rw-r--r-- tools/testing/vma/vma_internal.h | 49
241 files changed, 4398 insertions, 1972 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index 6791d879759e..b6b71db36ca7 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -77,6 +77,13 @@ Description: Writing a keyword for a monitoring operations set ('vaddr' for
Note that only the operations sets that listed in
'avail_operations' file are valid inputs.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/addr_unit
+Date: Aug 2025
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing an integer to this file sets the 'address unit'
+ parameter of the given operations set of the context. Reading
+ the file returns the last-written 'address unit' value.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/sample_us
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
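
For illustration, a minimal user-space sketch of driving the new ``addr_unit``
file described by the ABI entry above. This is a sketch only, not part of the
patch; it assumes kdamond 0 / context 0 exist and that 1024 is merely an
example value for the target setup.

/* Write an example 'address unit' of 1024 and rely on a later read
 * returning the last-written value, per the ABI text above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/addr_unit";
	char buf[32];
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	snprintf(buf, sizeof(buf), "%d\n", 1024);	/* example value */
	if (write(fd, buf, strlen(buf)) < 0)
		perror("write");
	close(fd);
	return 0;
}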
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index ff3a2dda1f02..2cae60b6f3ca 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -61,7 +61,7 @@ comma (",").
│ :ref:`kdamonds <sysfs_kdamonds>`/nr_kdamonds
│ │ :ref:`0 <sysfs_kdamond>`/state,pid,refresh_ms
│ │ │ :ref:`contexts <sysfs_contexts>`/nr_contexts
- │ │ │ │ :ref:`0 <sysfs_context>`/avail_operations,operations
+ │ │ │ │ :ref:`0 <sysfs_context>`/avail_operations,operations,addr_unit
│ │ │ │ │ :ref:`monitoring_attrs <sysfs_monitoring_attrs>`/
│ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
│ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us
@@ -188,9 +188,9 @@ details). At the moment, only one context per kdamond is supported, so only
contexts/<N>/
-------------
-In each context directory, two files (``avail_operations`` and ``operations``)
-and three directories (``monitoring_attrs``, ``targets``, and ``schemes``)
-exist.
+In each context directory, three files (``avail_operations``, ``operations``
+and ``addr_unit``) and three directories (``monitoring_attrs``, ``targets``,
+and ``schemes``) exist.
DAMON supports multiple types of :ref:`monitoring operations
<damon_design_configurable_operations_set>`, including those for virtual address
@@ -205,6 +205,9 @@ You can set and get what type of monitoring operations DAMON will use for the
context by writing one of the keywords listed in ``avail_operations`` file and
reading from the ``operations`` file.
+``addr_unit`` file is for setting and getting the :ref:`address unit
+<damon_design_addr_unit>` parameter of the operations set.
+
.. _sysfs_monitoring_attrs:
contexts/<N>/monitoring_attrs/
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 370fba113460..a16a04841b96 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -225,6 +225,42 @@ to "always" or "madvise"), and it'll be automatically shutdown when
PMD-sized THP is disabled (when both the per-size anon control and the
top-level control are "never")
+process THP controls
+--------------------
+
+A process can control its own THP behaviour using the ``PR_SET_THP_DISABLE``
+and ``PR_GET_THP_DISABLE`` pair of prctl(2) calls. The THP behaviour set using
+``PR_SET_THP_DISABLE`` is inherited across fork(2) and execve(2). These calls
+support the following arguments::
+
+ prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0):
+ This will disable THPs completely for the process, irrespective
+ of global THP controls or madvise(..., MADV_COLLAPSE) being used.
+
+ prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0):
+ This will disable THPs for the process except when the usage of THPs is
+ advised. Consequently, THPs will only be used when:
+ - Global THP controls are set to "always" or "madvise" and
+ madvise(..., MADV_HUGEPAGE) or madvise(..., MADV_COLLAPSE) is used.
+ - Global THP controls are set to "never" and madvise(..., MADV_COLLAPSE)
+    is used. This is the same behavior as if THPs were not disabled at
+    the process level.
+ Note that MADV_COLLAPSE is currently always rejected if
+ madvise(..., MADV_NOHUGEPAGE) is set on an area.
+
+ prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0):
+ This will re-enable THPs for the process, as if they were never disabled.
+ Whether THPs will actually be used depends on global THP controls and
+ madvise() calls.
+
+ prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0):
+ This returns a value whose bits indicate how THP-disable is configured:
+    Bit 1  Bit 0  Value  Description
+      0      0      0    No THP-disable behaviour specified.
+      0      1      1    THP is entirely disabled for this process.
+      1      1      3    THP-except-advised mode is set for this process.
+
Khugepaged controls
-------------------
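
A rough user-space illustration of the prctl pair documented above. This is a
sketch, not part of the patch; it assumes a kernel carrying this series, whose
<linux/prctl.h> (also updated by this commit) provides PR_SET_THP_DISABLE,
PR_GET_THP_DISABLE and PR_THP_DISABLE_EXCEPT_ADVISED.

#include <linux/prctl.h>
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Disable THP except where madvise() explicitly advises it. */
	if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0))
		perror("PR_SET_THP_DISABLE");

	/* Per the bit table above, expect 3 (disabled + except-advised). */
	printf("THP disable state: %d\n",
	       prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));

	/* Re-enable THP, as if it had never been disabled. */
	if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0))
		perror("PR_SET_THP_DISABLE");
	return 0;
}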
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 2971551b7235..915a3e44bc12 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -291,8 +291,9 @@ It's slow but very precise.
HugetlbPages size of hugetlb memory portions
CoreDumping process's memory is currently being dumped
(killing the process may lead to a corrupted core)
- THP_enabled process is allowed to use THP (returns 0 when
- PR_SET_THP_DISABLE is set on the process
+ THP_enabled process is allowed to use THP (returns 0 when
+ PR_SET_THP_DISABLE is set on the process to disable
+ THP completely, not just partially)
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
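
A small sketch of checking the THP_enabled field from user space (illustrative
only; it assumes the field name exactly as documented above):

/* Print the THP_enabled line from /proc/self/status, if present. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "THP_enabled:", 12))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}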
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 03f8137256f5..80354f4f42ba 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -67,7 +67,7 @@ processes, NUMA nodes, files, and backing memory devices would be supportable.
Also, if some architectures or devices support special optimized access check
features, those will be easily configurable.
-DAMON currently provides below three operation sets. Below two subsections
+DAMON currently provides below three operation sets. Below three subsections
describe how those work.
- vaddr: Monitor virtual address spaces of specific processes
@@ -135,6 +135,20 @@ the interference is the responsibility of sysadmins. However, it solves the
conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
as Idle page tracking does.
+.. _damon_design_addr_unit:
+
+Address Unit
+------------
+
+DAMON core layer uses the ``unsigned long`` type for monitoring target
+address ranges. In some cases, the address space for a given operations set
+could be too large to be handled with that type. ARM (32-bit) with the large
+physical address extension is an example. For such cases, a per-operations
+set parameter called ``address unit`` is provided. It represents the scale
+factor by which the core layer's addresses must be multiplied to calculate
+real addresses in the given address space. Support for the ``address unit``
+parameter is up to each operations set implementation; ``paddr`` is currently
+the only operations set implementation that supports it.
.. _damon_core_logic:
@@ -689,7 +703,7 @@ DAMOS accounts below statistics for each scheme, from the beginning of the
scheme's execution.
- ``nr_tried``: Total number of regions that the scheme is tried to be applied.
-- ``sz_trtied``: Total size of regions that the scheme is tried to be applied.
+- ``sz_tried``: Total size of regions that the scheme is tried to be applied.
- ``sz_ops_filter_passed``: Total bytes that passed operations set
layer-handled DAMOS filters.
- ``nr_applied``: Total number of regions that the scheme is applied.
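
To make the ``address unit`` scaling concrete, a small sketch of the
arithmetic the design text above describes. Names here are hypothetical
illustrations, not the in-tree helpers.

/*
 * The core layer tracks addresses as unsigned long; an operations set
 * multiplies by addr_unit to recover the real address. With addr_unit == 16,
 * a 32-bit unsigned long can cover a 36-bit (64 GiB) physical address
 * space, as on ARM with LPAE.
 */
#include <stdio.h>

typedef unsigned long long real_addr_t;	/* real address on the device */

static real_addr_t core_to_real_addr(unsigned long core_addr,
				     unsigned long addr_unit)
{
	return (real_addr_t)core_addr * addr_unit;
}

int main(void)
{
	unsigned long core_addr = 0x80000000UL;	/* DAMON-internal address */
	unsigned long addr_unit = 16;		/* hypothetical scale factor */

	printf("real address: %#llx\n",
	       core_to_real_addr(core_addr, addr_unit));
	return 0;
}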
diff --git a/MAINTAINERS b/MAINTAINERS
index 03b433441836..9344c33c52e1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16201,6 +16201,7 @@ S: Maintained
F: include/linux/rmap.h
F: mm/page_vma_mapped.c
F: mm/rmap.c
+F: tools/testing/selftests/mm/rmap.c
MEMORY MANAGEMENT - SECRETMEM
M: Andrew Morton <akpm@linux-foundation.org>
diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index a31bbf5c8bbc..d84908a177bd 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -151,9 +151,6 @@
/* Helpers */
#define TO_KB(bytes) ((bytes) >> 10)
#define TO_MB(bytes) (TO_KB(bytes) >> 10)
-#define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10))
-#define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10)
-
/*
***************************************************************
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 9106ceac323c..7d2f93dc1e91 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -704,7 +704,7 @@ static inline void arc_slc_enable(void)
void flush_dcache_folio(struct folio *folio)
{
- clear_bit(PG_dc_clean, &folio->flags);
+ clear_bit(PG_dc_clean, &folio->flags.f);
return;
}
EXPORT_SYMBOL(flush_dcache_folio);
@@ -889,8 +889,8 @@ void copy_user_highpage(struct page *to, struct page *from,
copy_page(kto, kfrom);
- clear_bit(PG_dc_clean, &dst->flags);
- clear_bit(PG_dc_clean, &src->flags);
+ clear_bit(PG_dc_clean, &dst->flags.f);
+ clear_bit(PG_dc_clean, &src->flags.f);
kunmap_atomic(kto);
kunmap_atomic(kfrom);
@@ -900,7 +900,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
{
struct folio *folio = page_folio(page);
clear_page(to);
- clear_bit(PG_dc_clean, &folio->flags);
+ clear_bit(PG_dc_clean, &folio->flags.f);
}
EXPORT_SYMBOL(clear_user_page);
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index cae4a7aae0ed..ed6915ba76ec 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -488,7 +488,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
*/
if (vma->vm_flags & VM_EXEC) {
struct folio *folio = page_folio(page);
- int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags);
+ int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f);
if (dirty) {
unsigned long offset = offset_in_folio(folio, paddr);
nr = folio_nr_pages(folio);
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index b766c4b373f6..700055b1ccb3 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -17,7 +17,7 @@
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
index 7ddd82b9fe8b..ed843bb22020 100644
--- a/arch/arm/mm/copypage-v4mc.c
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -67,7 +67,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from,
struct folio *src = page_folio(from);
void *kto = kmap_atomic(to);
- if (!test_and_set_bit(PG_dcache_clean, &src->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &src->flags.f))
__flush_dcache_folio(folio_flush_mapping(src), src);
raw_spin_lock(&minicache_lock);
diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c
index a1a71f36d850..0710dba5c0bf 100644
--- a/arch/arm/mm/copypage-v6.c
+++ b/arch/arm/mm/copypage-v6.c
@@ -73,7 +73,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to,
unsigned int offset = CACHE_COLOUR(vaddr);
unsigned long kfrom, kto;
- if (!test_and_set_bit(PG_dcache_clean, &src->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &src->flags.f))
__flush_dcache_folio(folio_flush_mapping(src), src);
/* FIXME: not highmem safe */
diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c
index f1e29d3e8193..e16af68d709f 100644
--- a/arch/arm/mm/copypage-xscale.c
+++ b/arch/arm/mm/copypage-xscale.c
@@ -87,7 +87,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from,
struct folio *src = page_folio(from);
void *kto = kmap_atomic(to);
- if (!test_and_set_bit(PG_dcache_clean, &src->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &src->flags.f))
__flush_dcache_folio(folio_flush_mapping(src), src);
raw_spin_lock(&minicache_lock);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 88c2d68a69c9..08641a936394 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -718,7 +718,7 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
if (size < sz)
break;
if (!offset)
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
offset = 0;
size -= sz;
if (!size)
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 39fd5df73317..91e488767783 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -203,7 +203,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
folio = page_folio(pfn_to_page(pfn));
mapping = folio_flush_mapping(folio);
- if (!test_and_set_bit(PG_dcache_clean, &folio->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f))
__flush_dcache_folio(mapping, folio);
if (mapping) {
if (cache_is_vivt())
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 5219158d54cf..19470d938b23 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -304,7 +304,7 @@ void __sync_icache_dcache(pte_t pteval)
else
mapping = NULL;
- if (!test_and_set_bit(PG_dcache_clean, &folio->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f))
__flush_dcache_folio(mapping, folio);
if (pte_exec(pteval))
@@ -343,8 +343,8 @@ void flush_dcache_folio(struct folio *folio)
return;
if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) {
- if (test_bit(PG_dcache_clean, &folio->flags))
- clear_bit(PG_dcache_clean, &folio->flags);
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
+ clear_bit(PG_dcache_clean, &folio->flags.f);
return;
}
@@ -352,14 +352,14 @@ void flush_dcache_folio(struct folio *folio)
if (!cache_ops_need_broadcast() &&
mapping && !folio_mapped(folio))
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
else {
__flush_dcache_folio(mapping, folio);
if (mapping && cache_is_vivt())
__flush_dcache_aliases(mapping, folio);
else if (mapping)
__flush_icache_all();
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
EXPORT_SYMBOL(flush_dcache_folio);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2a8155c4a882..44c1f757bfcf 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -21,12 +21,12 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h);
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
#ifdef CONFIG_ARM64_MTE
if (system_supports_mte()) {
- clear_bit(PG_mte_tagged, &folio->flags);
- clear_bit(PG_mte_lock, &folio->flags);
+ clear_bit(PG_mte_tagged, &folio->flags.f);
+ clear_bit(PG_mte_lock, &folio->flags.f);
}
#endif
}
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 6567df8ec8ca..3b5069f4683d 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -48,12 +48,12 @@ static inline void set_page_mte_tagged(struct page *page)
* before the page flags update.
*/
smp_wmb();
- set_bit(PG_mte_tagged, &page->flags);
+ set_bit(PG_mte_tagged, &page->flags.f);
}
static inline bool page_mte_tagged(struct page *page)
{
- bool ret = test_bit(PG_mte_tagged, &page->flags);
+ bool ret = test_bit(PG_mte_tagged, &page->flags.f);
VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
@@ -82,7 +82,7 @@ static inline bool try_page_mte_tagging(struct page *page)
{
VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
- if (!test_and_set_bit(PG_mte_lock, &page->flags))
+ if (!test_and_set_bit(PG_mte_lock, &page->flags.f))
return true;
/*
@@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page)
* already. Check if the PG_mte_tagged flag has been set or wait
* otherwise.
*/
- smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged));
+ smp_cond_load_acquire(&page->flags.f, VAL & (1UL << PG_mte_tagged));
return false;
}
@@ -173,13 +173,13 @@ static inline void folio_set_hugetlb_mte_tagged(struct folio *folio)
* before the folio flags update.
*/
smp_wmb();
- set_bit(PG_mte_tagged, &folio->flags);
+ set_bit(PG_mte_tagged, &folio->flags.f);
}
static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
{
- bool ret = test_bit(PG_mte_tagged, &folio->flags);
+ bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
@@ -196,7 +196,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
{
VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
- if (!test_and_set_bit(PG_mte_lock, &folio->flags))
+ if (!test_and_set_bit(PG_mte_lock, &folio->flags.f))
return true;
/*
@@ -204,7 +204,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
* already. Check if the PG_mte_tagged flag has been set or wait
* otherwise.
*/
- smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged));
+ smp_cond_load_acquire(&folio->flags.f, VAL & (1UL << PG_mte_tagged));
return false;
}
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 013eead9b695..fbf08b543c3f 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte)
{
struct folio *folio = page_folio(pte_page(pte));
- if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
sync_icache_aliases((unsigned long)folio_address(folio),
(unsigned long)folio_address(folio) +
folio_size(folio));
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);
@@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
*/
void flush_dcache_folio(struct folio *folio)
{
- if (test_bit(PG_dcache_clean, &folio->flags))
- clear_bit(PG_dcache_clean, &folio->flags);
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
EXPORT_SYMBOL(flush_dcache_folio);
diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c
index 171e8fb32285..4bc0aad3cf8a 100644
--- a/arch/csky/abiv1/cacheflush.c
+++ b/arch/csky/abiv1/cacheflush.c
@@ -25,12 +25,12 @@ void flush_dcache_folio(struct folio *folio)
mapping = folio_flush_mapping(folio);
if (mapping && !folio_mapped(folio))
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
else {
dcache_wbinv_all();
if (mapping)
icache_inv_all();
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
EXPORT_SYMBOL(flush_dcache_folio);
@@ -56,7 +56,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
return;
folio = page_folio(pfn_to_page(pfn));
- if (!test_and_set_bit(PG_dcache_clean, &folio->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f))
dcache_wbinv_all();
if (folio_flush_mapping(folio)) {
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index 1f14132b3fc9..5d283ef89d90 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -37,11 +37,11 @@
#define PG_dcache_dirty PG_arch_1
#define folio_test_dcache_dirty(folio) \
- test_bit(PG_dcache_dirty, &(folio)->flags)
+ test_bit(PG_dcache_dirty, &(folio)->flags.f)
#define folio_set_dcache_dirty(folio) \
- set_bit(PG_dcache_dirty, &(folio)->flags)
+ set_bit(PG_dcache_dirty, &(folio)->flags.f)
#define folio_clear_dcache_dirty(folio) \
- clear_bit(PG_dcache_dirty, &(folio)->flags)
+ clear_bit(PG_dcache_dirty, &(folio)->flags.f)
extern void (*flush_cache_all)(void);
extern void (*__flush_cache_all)(void);
diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c
index 0ee9c5f02e08..8321182eb927 100644
--- a/arch/nios2/mm/cacheflush.c
+++ b/arch/nios2/mm/cacheflush.c
@@ -187,7 +187,7 @@ void flush_dcache_folio(struct folio *folio)
/* Flush this page if there are aliases. */
if (mapping && !mapping_mapped(mapping)) {
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
} else {
__flush_dcache_folio(folio);
if (mapping) {
@@ -195,7 +195,7 @@ void flush_dcache_folio(struct folio *folio)
flush_aliases(mapping, folio);
flush_icache_range(start, start + folio_size(folio));
}
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
EXPORT_SYMBOL(flush_dcache_folio);
@@ -227,7 +227,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
return;
folio = page_folio(pfn_to_page(pfn));
- if (!test_and_set_bit(PG_dcache_clean, &folio->flags))
+ if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f))
__flush_dcache_folio(folio);
mapping = folio_flush_mapping(folio);
diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h
index 0e60af486ec1..cd8f971c0fec 100644
--- a/arch/openrisc/include/asm/cacheflush.h
+++ b/arch/openrisc/include/asm/cacheflush.h
@@ -75,7 +75,7 @@ static inline void sync_icache_dcache(struct page *page)
static inline void flush_dcache_folio(struct folio *folio)
{
- clear_bit(PG_dc_clean, &folio->flags);
+ clear_bit(PG_dc_clean, &folio->flags.f);
}
#define flush_dcache_folio flush_dcache_folio
diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c
index 0f265b8e73ec..f33df46dae4e 100644
--- a/arch/openrisc/mm/cache.c
+++ b/arch/openrisc/mm/cache.c
@@ -83,7 +83,7 @@ void update_cache(struct vm_area_struct *vma, unsigned long address,
{
unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT;
struct folio *folio = page_folio(pfn_to_page(pfn));
- int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags);
+ int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f);
/*
* Since icaches do not snoop for updated data on OpenRISC, we
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 37ca484cc495..4c5240d3a3c7 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -122,10 +122,10 @@ void __update_cache(pte_t pte)
pfn = folio_pfn(folio);
nr = folio_nr_pages(folio);
if (folio_flush_mapping(folio) &&
- test_bit(PG_dcache_dirty, &folio->flags)) {
+ test_bit(PG_dcache_dirty, &folio->flags.f)) {
while (nr--)
flush_kernel_dcache_page_addr(pfn_va(pfn + nr));
- clear_bit(PG_dcache_dirty, &folio->flags);
+ clear_bit(PG_dcache_dirty, &folio->flags.f);
} else if (parisc_requires_coherency())
while (nr--)
flush_kernel_dcache_page_addr(pfn_va(pfn + nr));
@@ -481,7 +481,7 @@ void flush_dcache_folio(struct folio *folio)
pgoff_t pgoff;
if (mapping && !mapping_mapped(mapping)) {
- set_bit(PG_dcache_dirty, &folio->flags);
+ set_bit(PG_dcache_dirty, &folio->flags.f);
return;
}
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index f2656774aaa9..1fea42928f64 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -40,8 +40,8 @@ static inline void flush_dcache_folio(struct folio *folio)
if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
return;
/* avoid an atomic op if possible */
- if (test_bit(PG_dcache_clean, &folio->flags))
- clear_bit(PG_dcache_clean, &folio->flags);
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
#define flush_dcache_folio flush_dcache_folio
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ca3829d47ab7..0953f2daa466 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -939,9 +939,9 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn)
/* Clear i-cache for new pages */
folio = page_folio(pfn_to_page(pfn));
- if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
flush_dcache_icache_folio(folio);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 4693c464fc5a..3aee3af614af 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1562,11 +1562,11 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_dcache_clean, &folio->flags) &&
+ if (!test_bit(PG_dcache_clean, &folio->flags.f) &&
!folio_test_reserved(folio)) {
if (trap == INTERRUPT_INST_STORAGE) {
flush_dcache_icache_folio(folio);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
} else
pp |= HPTE_R_N;
}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index dfaa9fd86f7e..56d7e8960e77 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -87,9 +87,9 @@ static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr)
struct folio *folio = maybe_pte_to_folio(pte);
if (!folio)
return pte;
- if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
flush_dcache_icache_folio(folio);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
return pte;
@@ -127,13 +127,13 @@ static inline pte_t set_pte_filter(pte_t pte, unsigned long addr)
return pte;
/* If the page clean, we move on */
- if (test_bit(PG_dcache_clean, &folio->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
flush_dcache_icache_folio(folio);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
return pte;
}
@@ -175,12 +175,12 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
goto bail;
/* If the page is already clean, we move on */
- if (test_bit(PG_dcache_clean, &folio->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
goto bail;
/* Clean the page and set PG_dcache_clean */
flush_dcache_icache_folio(folio);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
bail:
return pte_mkexec(pte);
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index b5c79b11ea3c..4ca9cf7a90c9 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -69,7 +69,7 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
+struct ptdump_pg_level pg_level[5] = {
{ /* pgd */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index 5ad92d9dc5d1..6b2da9241d4c 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -102,7 +102,7 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
+struct ptdump_pg_level pg_level[5] = {
{ /* pgd */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
index 154efae96ae0..4232aa4b57ea 100644
--- a/arch/powerpc/mm/ptdump/ptdump.h
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -11,12 +11,12 @@ struct flag_info {
int shift;
};
-struct pgtable_level {
+struct ptdump_pg_level {
const struct flag_info *flag;
size_t num;
u64 mask;
};
-extern struct pgtable_level pg_level[5];
+extern struct ptdump_pg_level pg_level[5];
void pt_dump_size(struct seq_file *m, unsigned long delta);
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index 39c30c62b7ea..58998960eb9a 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -67,7 +67,7 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
+struct ptdump_pg_level pg_level[5] = {
{ /* pgd */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 5e0a718d1be7..0823fa2da151 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -545,7 +545,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
/* balloon page list reference */
put_page(page);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void cmm_balloon_compaction_init(void)
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 6086b38d5427..0092513c3376 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -23,8 +23,8 @@ static inline void local_flush_icache_range(unsigned long start,
static inline void flush_dcache_folio(struct folio *folio)
{
- if (test_bit(PG_dcache_clean, &folio->flags))
- clear_bit(PG_dcache_clean, &folio->flags);
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
#define flush_dcache_folio flush_dcache_folio
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 446126497768..0872d43fc0c0 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -7,7 +7,7 @@
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 91697fbf1f90..e69346307e78 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -942,6 +942,17 @@ static inline int pudp_test_and_clear_young(struct vm_area_struct *vma,
return ptep_test_and_clear_young(vma, address, (pte_t *)pudp);
}
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pud_t *pudp)
+{
+ pud_t pud = __pud(atomic_long_xchg((atomic_long_t *)pudp, 0));
+
+ page_table_check_pud_clear(mm, pud);
+
+ return pud;
+}
+
static inline int pud_young(pud_t pud)
{
return pte_young(pud_pte(pud));
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 4ca5aafce22e..d83a612464f6 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -101,9 +101,9 @@ void flush_icache_pte(struct mm_struct *mm, pte_t pte)
{
struct folio *folio = page_folio(pte_page(pte));
- if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
flush_icache_mm(mm, false);
- set_bit(PG_dcache_clean, &folio->flags);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
#endif /* CONFIG_MMU */
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 931fcc413598..69131736daaa 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -39,7 +39,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 47f574cd1728..93b2a01bae40 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -144,7 +144,7 @@ int uv_destroy_folio(struct folio *folio)
folio_get(folio);
rc = uv_destroy(folio_to_phys(folio));
if (!rc)
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
folio_put(folio);
return rc;
}
@@ -193,7 +193,7 @@ int uv_convert_from_secure_folio(struct folio *folio)
folio_get(folio);
rc = uv_convert_from_secure(folio_to_phys(folio));
if (!rc)
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
folio_put(folio);
return rc;
}
@@ -289,7 +289,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
expected = expected_folio_refs(folio) + 1;
if (!folio_ref_freeze(folio, expected))
return -EBUSY;
- set_bit(PG_arch_1, &folio->flags);
+ set_bit(PG_arch_1, &folio->flags.f);
/*
* If the UVC does not succeed or fail immediately, we don't want to
* loop for long, or we might get stall notifications.
@@ -483,18 +483,18 @@ int arch_make_folio_accessible(struct folio *folio)
* convert_to_secure.
* As secure pages are never large folios, both variants can co-exists.
*/
- if (!test_bit(PG_arch_1, &folio->flags))
+ if (!test_bit(PG_arch_1, &folio->flags.f))
return 0;
rc = uv_pin_shared(folio_to_phys(folio));
if (!rc) {
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
return 0;
}
rc = uv_convert_from_secure(folio_to_phys(folio));
if (!rc) {
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
return 0;
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index c7defe4ed1f6..8ff6bba107e8 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2272,7 +2272,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
start = pmd_val(*pmd) & HPAGE_MASK;
end = start + HPAGE_SIZE;
__storage_key_init_range(start, end);
- set_bit(PG_arch_1, &folio->flags);
+ set_bit(PG_arch_1, &folio->flags.f);
cond_resched();
return 0;
}
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index e88c02c9e642..72e8fa136af5 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -155,7 +155,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
paddr = rste & PMD_MASK;
}
- if (!test_and_set_bit(PG_arch_1, &folio->flags))
+ if (!test_and_set_bit(PG_arch_1, &folio->flags.f))
__storage_key_init_range(paddr, paddr + size);
}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 40a526d28184..547104ccc22a 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -182,10 +182,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
*/
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = mmap_base_legacy(random_factor);
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 4a92e6e4d627..974512f359f0 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -14,7 +14,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 46393b00137e..83fb34b39ca7 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -114,7 +114,7 @@ static void sh4_flush_dcache_folio(void *arg)
struct address_space *mapping = folio_flush_mapping(folio);
if (mapping && !mapping_mapped(mapping))
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
else
#endif
{
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index b509a407588f..71f8be9fc8e0 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -138,7 +138,7 @@ static void sh7705_flush_dcache_folio(void *arg)
struct address_space *mapping = folio_flush_mapping(folio);
if (mapping && !mapping_mapped(mapping))
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
else {
unsigned long pfn = folio_pfn(folio);
unsigned int i, nr = folio_nr_pages(folio);
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index 6ebdeaff3021..c3f028bed049 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -64,14 +64,14 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
struct folio *folio = page_folio(page);
if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) &&
- test_bit(PG_dcache_clean, &folio->flags)) {
+ test_bit(PG_dcache_clean, &folio->flags.f)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
kunmap_coherent(vto);
} else {
memcpy(dst, src, len);
if (boot_cpu_data.dcache.n_aliases)
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
if (vma->vm_flags & VM_EXEC)
@@ -85,14 +85,14 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
struct folio *folio = page_folio(page);
if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) &&
- test_bit(PG_dcache_clean, &folio->flags)) {
+ test_bit(PG_dcache_clean, &folio->flags.f)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
kunmap_coherent(vfrom);
} else {
memcpy(dst, src, len);
if (boot_cpu_data.dcache.n_aliases)
- clear_bit(PG_dcache_clean, &folio->flags);
+ clear_bit(PG_dcache_clean, &folio->flags.f);
}
}
@@ -105,7 +105,7 @@ void copy_user_highpage(struct page *to, struct page *from,
vto = kmap_atomic(to);
if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) &&
- test_bit(PG_dcache_clean, &src->flags)) {
+ test_bit(PG_dcache_clean, &src->flags.f)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
kunmap_coherent(vfrom);
@@ -148,7 +148,7 @@ void __update_cache(struct vm_area_struct *vma,
if (pfn_valid(pfn)) {
struct folio *folio = page_folio(pfn_to_page(pfn));
- int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags);
+ int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags.f);
if (dirty)
__flush_purge_region(folio_address(folio),
folio_size(folio));
@@ -162,7 +162,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
if (pages_do_alias(addr, vmaddr)) {
if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) &&
- test_bit(PG_dcache_clean, &folio->flags)) {
+ test_bit(PG_dcache_clean, &folio->flags.f)) {
void *kaddr;
kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c
index fa50e8f6e7a9..c9f32d5a54b8 100644
--- a/arch/sh/mm/kmap.c
+++ b/arch/sh/mm/kmap.c
@@ -31,7 +31,7 @@ void *kmap_coherent(struct page *page, unsigned long addr)
enum fixed_addresses idx;
unsigned long vaddr;
- BUG_ON(!test_bit(PG_dcache_clean, &folio->flags));
+ BUG_ON(!test_bit(PG_dcache_clean, &folio->flags.f));
preempt_disable();
pagefault_disable();
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index c5a284df7b41..785e9909340f 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -309,7 +309,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
gap == RLIM_INFINITY ||
sysctl_legacy_va_layout) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
/* We know it's 32-bit */
unsigned long task_size = STACK_TOP32;
@@ -320,7 +320,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
gap = (task_size / 6 * 5);
mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 7ed58bf3aaca..df9f7c444c39 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -224,7 +224,7 @@ inline void flush_dcache_folio_impl(struct folio *folio)
((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
#define dcache_dirty_cpu(folio) \
- (((folio)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
+ (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
static inline void set_dcache_dirty(struct folio *folio, int this_cpu)
{
@@ -243,7 +243,7 @@ static inline void set_dcache_dirty(struct folio *folio, int this_cpu)
"bne,pn %%xcc, 1b\n\t"
" nop"
: /* no outputs */
- : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags)
+ : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f)
: "g1", "g7");
}
@@ -265,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu
" nop\n"
"2:"
: /* no outputs */
- : "r" (cpu), "r" (mask), "r" (&folio->flags),
+ : "r" (cpu), "r" (mask), "r" (&folio->flags.f),
"i" (PG_dcache_cpu_mask),
"i" (PG_dcache_cpu_shift)
: "g1", "g7");
@@ -292,7 +292,7 @@ static void flush_dcache(unsigned long pfn)
struct folio *folio = page_folio(page);
unsigned long pg_flags;
- pg_flags = folio->flags;
+ pg_flags = folio->flags.f;
if (pg_flags & (1UL << PG_dcache_dirty)) {
int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
PG_dcache_cpu_mask);
@@ -480,7 +480,7 @@ void flush_dcache_folio(struct folio *folio)
mapping = folio_flush_mapping(folio);
if (mapping && !mapping_mapped(mapping)) {
- bool dirty = test_bit(PG_dcache_dirty, &folio->flags);
+ bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f);
if (dirty) {
int dirty_cpu = dcache_dirty_cpu(folio);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bb57e93b4caf..8bf6ad4b9400 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -34,6 +34,7 @@
* We need to define the tracepoints somewhere, and tlb.c
* is only compiled when SMP=y.
*/
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include "mm_internal.h"
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 5ed2109211da..708f85dc9380 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -122,9 +122,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
else
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c09284302dd3..b68200a0e0c6 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup);
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
- unsigned long pg_flags = pg->flags & _PGMT_MASK;
+ unsigned long pg_flags = pg->flags.f & _PGMT_MASK;
if (pg_flags == _PGMT_WB)
return _PAGE_CACHE_MODE_WB;
@@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg,
break;
}
- old_flags = READ_ONCE(pg->flags);
+ old_flags = READ_ONCE(pg->flags.f);
do {
new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
- } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 23be0e7516ce..5354df52d61f 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -134,8 +134,8 @@ void flush_dcache_folio(struct folio *folio)
*/
if (mapping && !mapping_mapped(mapping)) {
- if (!test_bit(PG_arch_1, &folio->flags))
- set_bit(PG_arch_1, &folio->flags);
+ if (!test_bit(PG_arch_1, &folio->flags.f))
+ set_bit(PG_arch_1, &folio->flags.f);
return;
} else {
@@ -232,7 +232,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
#if (DCACHE_WAY_SIZE > PAGE_SIZE)
- if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) {
+ if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags.f)) {
unsigned long phys = folio_pfn(folio) * PAGE_SIZE;
unsigned long tmp;
@@ -247,10 +247,10 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
}
preempt_enable();
- clear_bit(PG_arch_1, &folio->flags);
+ clear_bit(PG_arch_1, &folio->flags.f);
}
#else
- if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags)
+ if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags.f)
&& (vma->vm_flags & VM_EXEC) != 0) {
for (i = 0; i < nr; i++) {
void *paddr = kmap_local_folio(folio, i * PAGE_SIZE);
@@ -258,7 +258,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
__invalidate_icache_page((unsigned long)paddr);
kunmap_local(paddr);
}
- set_bit(PG_arch_1, &folio->flags);
+ set_bit(PG_arch_1, &folio->flags.f);
}
#endif
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..3030a772d3aa 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned int flags)
{
+ struct folio *zero_folio = largest_zero_folio();
+
while (nr_sects) {
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
struct bio *bio;
@@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
break;
do {
- unsigned int len, added;
+ unsigned int len;
- len = min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
- if (added < len)
+ len = min_t(sector_t, folio_size(zero_folio),
+ nr_sects << SECTOR_SHIFT);
+ if (!bio_add_folio(bio, zero_folio, len, 0))
break;
- nr_sects -= added >> SECTOR_SHIFT;
- sector += added >> SECTOR_SHIFT;
+ nr_sects -= len >> SECTOR_SHIFT;
+ sector += len >> SECTOR_SHIFT;
} while (nr_sects);
*biop = bio_chain_and_submit(*biop, bio);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 3399594136b2..45d512939c40 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -885,6 +885,11 @@ int register_one_node(int nid)
node_devices[nid] = node;
error = register_node(node_devices[nid], nid);
+ if (error) {
+ node_devices[nid] = NULL;
+ kfree(node);
+ return error;
+ }
/* link cpu under this node */
for_each_present_cpu(cpu) {
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f31652085adc..78b56cd7698e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1225,18 +1225,6 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
zram->comp_algs[prio] = alg;
}
-static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio,
- char *buf, ssize_t at)
-{
- ssize_t sz;
-
- down_read(&zram->init_lock);
- sz = zcomp_available_show(zram->comp_algs[prio], buf, at);
- up_read(&zram->init_lock);
-
- return sz;
-}
-
static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
{
char *compressor;
@@ -1387,8 +1375,12 @@ static ssize_t comp_algorithm_show(struct device *dev,
char *buf)
{
struct zram *zram = dev_to_zram(dev);
+ ssize_t sz;
- return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0);
+ down_read(&zram->init_lock);
+ sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0);
+ up_read(&zram->init_lock);
+ return sz;
}
static ssize_t comp_algorithm_store(struct device *dev,
@@ -1412,14 +1404,15 @@ static ssize_t recomp_algorithm_show(struct device *dev,
ssize_t sz = 0;
u32 prio;
+ down_read(&zram->init_lock);
for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
if (!zram->comp_algs[prio])
continue;
sz += sysfs_emit_at(buf, sz, "#%d: ", prio);
- sz += __comp_algorithm_show(zram, prio, buf, sz);
+ sz += zcomp_available_show(zram->comp_algs[prio], buf, sz);
}
-
+ up_read(&zram->init_lock);
return sz;
}
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 48839958b0b1..34b815901b20 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -512,11 +512,18 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma)
return 0;
}
+#ifndef CONFIG_MMU
+static unsigned long get_unmapped_area_zero(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return -ENOSYS;
+}
+#else
static unsigned long get_unmapped_area_zero(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
-#ifdef CONFIG_MMU
if (flags & MAP_SHARED) {
/*
* mmap_zero() will call shmem_zero_setup() to create a file,
@@ -527,12 +534,18 @@ static unsigned long get_unmapped_area_zero(struct file *file,
return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags);
}
- /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */
- return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+ /*
+ * Otherwise flags & MAP_PRIVATE: with no shmem object beneath it,
+	 * try to align the mapping to the huge page size if possible;
+	 * otherwise fall back to system page size mappings.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return thp_get_unmapped_area(file, addr, len, pgoff, flags);
#else
- return -ENOSYS;
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
#endif
}
+#endif /* CONFIG_MMU */
static ssize_t write_full(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
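From userspace, the visible effect is that a large MAP_PRIVATE mapping of /dev/zero can come back aligned for THP. A hedged sketch of how one might observe the alignment; it assumes THP is configured and uses the common x86-64 PMD size of 2 MiB:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MiB, comfortably > PMD size */
	int fd = open("/dev/zero", O_RDWR);
	void *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* With the patch, p is typically 2 MiB aligned. */
	printf("2MiB-aligned: %d\n",
	       ((unsigned long)p & ((2UL << 20) - 1)) == 0);
	return munmap(p, len);
}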
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 6653fc53c951..6df51ee8db62 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1806,7 +1806,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info,
* the list after acquiring the lock.
*/
get_page(newpage);
- ret = MIGRATEPAGE_SUCCESS;
+ ret = 0;
}
/* Update the balloon list under the @pages_lock */
@@ -1817,7 +1817,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info,
* If we succeed just insert it to the list and update the statistics
* under the lock.
*/
- if (ret == MIGRATEPAGE_SUCCESS) {
+ if (!ret) {
balloon_page_insert(&b->b_dev_info, newpage);
__count_vm_event(BALLOON_MIGRATE);
}
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index e299e18346a3..eae65136cdfb 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -875,7 +875,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
balloon_page_finalize(page);
put_page(page); /* balloon reference */
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#endif /* CONFIG_BALLOON_COMPACTION */
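Both balloon hunks track the removal of MIGRATEPAGE_SUCCESS (see the include/linux/migrate.h hunk below): migration callbacks now return 0 on success and a negative errno on failure. A generic sketch of the updated convention; example_migrate_folio() is illustrative, not a real driver callback:

static int example_migrate_folio(struct address_space *mapping,
				 struct folio *dst, struct folio *src,
				 enum migrate_mode mode)
{
	int rc = filemap_migrate_folio(mapping, dst, src, mode);

	if (rc)		/* was: rc != MIGRATEPAGE_SUCCESS */
		return rc;
	/* ... transfer any private state from src to dst here ... */
	return 0;	/* was: return MIGRATEPAGE_SUCCESS */
}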
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 24f485827e03..f9a35ed266ec 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -138,6 +138,7 @@ config XEN_GNTDEV
depends on XEN
default m
select MMU_NOTIFIER
+ select FIND_NORMAL_PAGE
help
Allows userspace processes to use grants.
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1f2160765618..26f13b37c78e 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -321,6 +321,7 @@ static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
BUG_ON(pgnr >= map->count);
pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+ /* Note: this will perform a pte_mkspecial() through the hypercall. */
gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
map->grants[pgnr].ref,
map->grants[pgnr].domid);
@@ -528,7 +529,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
gntdev_put_map(priv, map);
}
-static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
+static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma,
unsigned long addr)
{
struct gntdev_grant_map *map = vma->vm_private_data;
@@ -539,7 +540,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
static const struct vm_operations_struct gntdev_vmops = {
.open = gntdev_vma_open,
.close = gntdev_vma_close,
- .find_special_page = gntdev_vma_find_special_page,
+ .find_normal_page = gntdev_vma_find_normal_page,
};
/* ------------------------------------------------------------------ */
diff --git a/fs/Kconfig b/fs/Kconfig
index c654a3642897..187a75440aca 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,7 +250,6 @@ config ARCH_SUPPORTS_HUGETLBFS
menuconfig HUGETLBFS
bool "HugeTLB file system support"
depends on ARCH_SUPPORTS_HUGETLBFS
- depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
select PADATA if SMP
help
diff --git a/fs/aio.c b/fs/aio.c
index 7fc7b6221312..059e03cfa088 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -445,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
folio_get(dst);
rc = folio_migrate_mapping(mapping, dst, src, 1);
- if (rc != MIGRATEPAGE_SUCCESS) {
+ if (rc) {
folio_put(dst);
goto out_unlock;
}
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index e86d36d23e9e..928e83a1ce42 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -21,7 +21,7 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
return -ENOMEM;
void *data = likely(bytes < INT_MAX)
- ? kvmalloc_noprof(bytes, gfp)
+ ? kvmalloc_node_align_noprof(bytes, 1, gfp, NUMA_NO_NODE)
: vmalloc_noprof(bytes);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 1c54b9b5bd69..fdeaa25189f2 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -655,6 +655,17 @@ do_io:
return 0;
}
+static int bch2_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data)
+{
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = __bch2_writepage(folio, wbc, data);
+ return error;
+}
+
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
@@ -663,7 +674,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
blk_start_plug(&w->plug);
- int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ int ret = bch2_write_cache_pages(mapping, wbc, w);
if (w->io)
bch2_writepage_do_io(w);
blk_finish_plug(&w->plug);
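The open-coded loop above is the writeback_iter() replacement for write_cache_pages(); the ntfs3 hunk below uses the same shape. A generic sketch, with example_writepage() standing in for the filesystem's per-folio writeout (an assumed helper, not a real function):

static int example_writepage(struct folio *folio, struct writeback_control *wbc);

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	/* writeback_iter() hands back locked dirty folios one at a time;
	 * passing the previous folio back in advances the iteration. */
	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = example_writepage(folio, wbc);
	return error;
}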
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 6488f098d140..7112fd40ee21 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -61,7 +61,7 @@ static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
{
void *p = unlikely(n >= INT_MAX)
? vmalloc_noprof(n)
- : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
+ : kvmalloc_node_align_noprof(n, 1, flags & ~__GFP_ZERO, NUMA_NO_NODE);
if (p && (flags & __GFP_ZERO))
memset(p, 0, n);
return p;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 70fc4e7cc5a0..7fab5057cf8e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1930,6 +1930,7 @@ static int btrfs_init_btree_inode(struct super_block *sb)
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
+ set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags);
fs_info->btree_inode = inode;
return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dd82dcc7b2b7..0bb604dbd673 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7421,7 +7421,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
{
int ret = filemap_migrate_folio(mapping, dst, src, mode);
- if (ret != MIGRATEPAGE_SUCCESS)
+ if (ret)
return ret;
if (folio_test_ordered(src)) {
@@ -7429,7 +7429,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
folio_set_ordered(dst);
}
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#else
#define btrfs_migrate_folio NULL
diff --git a/fs/coredump.c b/fs/coredump.c
index 5dce257c67fc..f9d82ffc4b88 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1103,8 +1103,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo)
* We must use the same mm->flags while dumping core to avoid
* inconsistency of bit flags, since this flag is not protected
* by any locks.
+ *
+ * Note that we only care about MMF_DUMP* flags.
*/
- .mm_flags = mm->flags,
+ .mm_flags = __mm_flags_get_dumpable(mm),
.vma_meta = NULL,
.cpu = raw_smp_processor_id(),
};
diff --git a/fs/dax.c b/fs/dax.c
index 20ecf652c129..89f071ba7b10 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
struct folio *zero_folio;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- unsigned long pfn;
+ vm_fault_t ret;
zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
- if (unlikely(!zero_folio))
- goto fallback;
-
- pfn = page_to_pfn(&zero_folio->page);
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE);
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
+ if (unlikely(!zero_folio)) {
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
+ return VM_FAULT_FALLBACK;
}
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
- return VM_FAULT_NOPAGE;
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
+ DAX_PMD | DAX_ZERO_PAGE);
-fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
- return VM_FAULT_FALLBACK;
+ ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
+ if (ret == VM_FAULT_NOPAGE)
+ trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
+ return ret;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
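The rewritten dax_pmd_load_hole() delegates the pgtable deposit, PMD lock, and set_pmd_at() to vmf_insert_folio_pmd(). A minimal sketch of the resulting control flow; example_pmd_zero_fault() is illustrative, not the DAX code:

static vm_fault_t example_pmd_zero_fault(struct vm_fault *vmf)
{
	struct folio *zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

	if (unlikely(!zero_folio))
		return VM_FAULT_FALLBACK;	/* fall back to PTE mapping */

	/* write=false: the shared zero folio is only ever mapped read-only */
	return vmf_insert_folio_pmd(vmf, zero_folio, false);
}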
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..dbac0e84cc3e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1999,7 +1999,7 @@ void set_dumpable(struct mm_struct *mm, int value)
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
+ __mm_flags_set_mask_dumpable(mm, value);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..8a89f0aa1d4d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -935,7 +935,7 @@ static int fuse_check_folio(struct folio *folio)
{
if (folio_mapped(folio) ||
folio->mapping != NULL ||
- (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_lru |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fe0faad4892f..0c0a80b3baca 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -40,7 +40,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
"AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
"state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
- bh->b_folio->mapping, bh->b_folio->flags);
+ bh->b_folio->mapping, bh->b_folio->flags.f);
fs_err(sdp, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 09d4baef29cf..34d496a2b7de 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1052,7 +1052,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
int rc;
rc = migrate_huge_page_move_mapping(mapping, dst, src);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (hugetlb_folio_subpool(src)) {
@@ -1063,7 +1063,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#else
#define hugetlbfs_migrate_folio NULL
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index dd3dff95cb24..b697f3c259ef 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -230,7 +230,7 @@ static int jffs2_write_begin(const struct kiocb *iocb,
goto release_sem;
}
}
- jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags);
+ jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags.f);
release_sem:
mutex_unlock(&c->alloc_sem);
@@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb,
jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n",
__func__, inode->i_ino, folio_pos(folio),
- start, end, folio->flags);
+ start, end, folio->flags.f);
/* We need to avoid deadlock with page_cache_read() in
jffs2_garbage_collect_pass(). So the folio must be
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index b98cf3bb6c1f..871cf4fb3636 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -169,7 +169,7 @@ static int __metapage_migrate_folio(struct address_space *mapping,
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
for (i = 0; i < MPS_PER_PAGE; i++) {
@@ -199,7 +199,7 @@ static int __metapage_migrate_folio(struct address_space *mapping,
}
}
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#endif /* CONFIG_MIGRATION */
@@ -242,7 +242,7 @@ static int __metapage_migrate_folio(struct address_space *mapping,
return -EAGAIN;
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (unlikely(insert_metapage(dst, mp)))
@@ -253,7 +253,7 @@ static int __metapage_migrate_folio(struct address_space *mapping,
mp->folio = dst;
remove_metapage(src, mp);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#endif /* CONFIG_MIGRATION */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 806b056d2260..56c4da417b6a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -167,7 +167,7 @@ void nilfs_folio_bug(struct folio *folio)
printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
folio, folio_ref_count(folio),
- (unsigned long long)folio->index, folio->flags, m, ino);
+ (unsigned long long)folio->index, folio->flags.f, m, ino);
head = folio_buffers(folio);
if (head) {
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 37cbbee7fa58..48b4f73a93ee 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -871,9 +871,9 @@ out:
}
static int ntfs_resident_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+ struct writeback_control *wbc)
{
- struct address_space *mapping = data;
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
int ret;
@@ -907,9 +907,14 @@ static int ntfs_writepages(struct address_space *mapping,
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
- if (is_resident(ntfs_i(inode)))
- return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
- mapping);
+ if (is_resident(ntfs_i(inode))) {
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = ntfs_resident_writepage(folio, wbc);
+ return error;
+ }
return mpage_writepages(mapping, wbc, ntfs_get_block);
}
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 108e7527f837..9913c5268fef 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -357,8 +357,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
task_lock(task);
- if (task->mm)
- kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
+ if (task->mm) {
+ unsigned long flags = __mm_flags_get_dumpable(task->mm);
+
+ kinfo.coredump_mask = pidfs_coredump_mask(flags);
+ }
task_unlock(task);
}
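Here, as in the fs/coredump.c and fs/proc/base.c hunks, direct reads of mm->flags are replaced by __mm_flags_get_dumpable(), which snapshots just the MMF_DUMP* bits. A usage sketch; example_coredump_mask() is a hypothetical helper:

static unsigned int example_coredump_mask(struct mm_struct *mm)
{
	/* One snapshot of the MMF_DUMP* bits, no direct mm->flags access. */
	unsigned long flags = __mm_flags_get_dumpable(mm);

	return (flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT;
}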
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d6a0369caa93..d84b291dd1ed 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
if (thp_enabled)
- thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+ thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 62d35631ba8c..b997ceef9135 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1163,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
+ if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
mm = p->mm;
mmgrab(mm);
}
@@ -2962,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
ret = 0;
mm = get_task_mm(task);
if (mm) {
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
len = snprintf(buffer, sizeof(buffer), "%08lx\n",
- ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+ ((flags & MMF_DUMP_FILTER_MASK) >>
MMF_DUMP_FILTER_SHIFT));
mmput(mm);
ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -3002,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
- set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm);
else
- clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm);
}
mmput(mm);
@@ -3274,7 +3276,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
seq_printf(m, "ksm_merge_any: %s\n",
- test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no");
+ mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no");
ret = mmap_read_lock_killable(mm);
if (ret) {
mmput(mm);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e737401d7383..d1598576506c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -378,16 +378,21 @@ extern void proc_self_init(void);
* task_[no]mmu.c
*/
struct mem_size_stats;
-struct proc_maps_private {
- struct inode *inode;
- struct task_struct *task;
+
+struct proc_maps_locking_ctx {
struct mm_struct *mm;
- struct vma_iterator iter;
- loff_t last_pos;
#ifdef CONFIG_PER_VMA_LOCK
bool mmap_locked;
struct vm_area_struct *locked_vma;
#endif
+};
+
+struct proc_maps_private {
+ struct inode *inode;
+ struct task_struct *task;
+ struct vma_iterator iter;
+ loff_t last_pos;
+ struct proc_maps_locking_ctx lock_ctx;
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index ba3568e97fd1..771e0b6bc630 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -163,7 +163,7 @@ u64 stable_page_flags(const struct page *page)
snapshot_page(&ps, page);
folio = &ps.folio_snapshot;
- k = folio->flags;
+ k = folio->flags.f;
mapping = (unsigned long)folio->mapping;
is_anon = mapping & FOLIO_MAPPING_ANON;
@@ -238,7 +238,7 @@ u64 stable_page_flags(const struct page *page)
if (u & (1 << KPF_HUGE))
u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
else
- u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison);
+ u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 29cca0e6d0ff..ced01cf3c5ab 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,18 +132,24 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
#ifdef CONFIG_PER_VMA_LOCK
-static void unlock_vma(struct proc_maps_private *priv)
+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
{
- if (priv->locked_vma) {
- vma_end_read(priv->locked_vma);
- priv->locked_vma = NULL;
+ lock_ctx->locked_vma = NULL;
+ lock_ctx->mmap_locked = false;
+}
+
+static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->locked_vma) {
+ vma_end_read(lock_ctx->locked_vma);
+ lock_ctx->locked_vma = NULL;
}
}
static const struct seq_operations proc_pid_maps_op;
static inline bool lock_vma_range(struct seq_file *m,
- struct proc_maps_private *priv)
+ struct proc_maps_locking_ctx *lock_ctx)
{
/*
* smaps and numa_maps perform page table walk, therefore require
@@ -151,25 +157,24 @@ static inline bool lock_vma_range(struct seq_file *m,
* walking the vma tree under rcu read protection.
*/
if (m->op != &proc_pid_maps_op) {
- if (mmap_read_lock_killable(priv->mm))
+ if (mmap_read_lock_killable(lock_ctx->mm))
return false;
- priv->mmap_locked = true;
+ lock_ctx->mmap_locked = true;
} else {
rcu_read_lock();
- priv->locked_vma = NULL;
- priv->mmap_locked = false;
+ reset_lock_ctx(lock_ctx);
}
return true;
}
-static inline void unlock_vma_range(struct proc_maps_private *priv)
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
- if (priv->mmap_locked) {
- mmap_read_unlock(priv->mm);
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
} else {
- unlock_vma(priv);
+ unlock_ctx_vma(lock_ctx);
rcu_read_unlock();
}
}
@@ -177,15 +182,16 @@ static inline void unlock_vma_range(struct proc_maps_private *priv)
static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
loff_t last_pos)
{
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
struct vm_area_struct *vma;
- if (priv->mmap_locked)
+ if (lock_ctx->mmap_locked)
return vma_next(&priv->iter);
- unlock_vma(priv);
- vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+ unlock_ctx_vma(lock_ctx);
+ vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
if (!IS_ERR_OR_NULL(vma))
- priv->locked_vma = vma;
+ lock_ctx->locked_vma = vma;
return vma;
}
@@ -193,14 +199,16 @@ static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
loff_t pos)
{
- if (priv->mmap_locked)
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+ if (lock_ctx->mmap_locked)
return false;
rcu_read_unlock();
- mmap_read_lock(priv->mm);
+ mmap_read_lock(lock_ctx->mm);
/* Reinitialize the iterator after taking mmap_lock */
vma_iter_set(&priv->iter, pos);
- priv->mmap_locked = true;
+ lock_ctx->mmap_locked = true;
return true;
}
@@ -208,14 +216,14 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
#else /* CONFIG_PER_VMA_LOCK */
static inline bool lock_vma_range(struct seq_file *m,
- struct proc_maps_private *priv)
+ struct proc_maps_locking_ctx *lock_ctx)
{
- return mmap_read_lock_killable(priv->mm) == 0;
+ return mmap_read_lock_killable(lock_ctx->mm) == 0;
}
-static inline void unlock_vma_range(struct proc_maps_private *priv)
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
- mmap_read_unlock(priv->mm);
+ mmap_read_unlock(lock_ctx->mm);
}
static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
@@ -258,7 +266,7 @@ retry:
*ppos = vma->vm_end;
} else {
*ppos = SENTINEL_VMA_GATE;
- vma = get_gate_vma(priv->mm);
+ vma = get_gate_vma(priv->lock_ctx.mm);
}
return vma;
@@ -267,6 +275,7 @@ retry:
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ struct proc_maps_locking_ctx *lock_ctx;
loff_t last_addr = *ppos;
struct mm_struct *mm;
@@ -278,14 +287,15 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
+ lock_ctx = &priv->lock_ctx;
+ mm = lock_ctx->mm;
if (!mm || !mmget_not_zero(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
return NULL;
}
- if (!lock_vma_range(m, priv)) {
+ if (!lock_vma_range(m, lock_ctx)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -318,13 +328,13 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mm_struct *mm = priv->mm;
+ struct mm_struct *mm = priv->lock_ctx.mm;
if (!priv->task)
return;
release_task_mempolicy(priv);
- unlock_vma_range(priv);
+ unlock_vma_range(&priv->lock_ctx);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -339,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->lock_ctx.mm)) {
+ int err = PTR_ERR(priv->lock_ctx.mm);
seq_release_private(inode, file);
return err;
@@ -355,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
@@ -517,28 +527,90 @@ static int pid_maps_open(struct inode *inode, struct file *file)
PROCMAP_QUERY_VMA_FLAGS \
)
-static int query_vma_setup(struct mm_struct *mm)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
- return mmap_read_lock_killable(mm);
+ reset_lock_ctx(lock_ctx);
+
+ return 0;
}
-static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
- mmap_read_unlock(mm);
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ }
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ struct mm_struct *mm = lock_ctx->mm;
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+
+ if (lock_ctx->mmap_locked)
+ return find_vma(mm, addr);
+
+ /* Unlock previously locked VMA and find the next one under RCU */
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_lock();
+ vma_iter_init(&vmi, mm, addr);
+ vma = lock_next_vma(mm, &vmi, addr);
+ rcu_read_unlock();
+
+ if (!vma)
+ return NULL;
+
+ if (!IS_ERR(vma)) {
+ lock_ctx->locked_vma = vma;
+ return vma;
+ }
+
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* Fallback to mmap_lock on vma->vm_refcnt overflow */
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ lock_ctx->mmap_locked = true;
+ }
+
+ return vma;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
}
-static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
{
- return find_vma(mm, addr);
+ return find_vma(lock_ctx->mm, addr);
}
-static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
unsigned long addr, u32 flags)
{
struct vm_area_struct *vma;
next_vma:
- vma = query_vma_find_by_addr(mm, addr);
+ vma = query_vma_find_by_addr(lock_ctx, addr);
+ if (IS_ERR(vma))
+ return vma;
+
if (!vma)
goto no_vma;
@@ -579,11 +651,11 @@ no_vma:
return ERR_PTR(-ENOENT);
}
-static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
{
+ struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
struct procmap_query karg;
struct vm_area_struct *vma;
- struct mm_struct *mm;
const char *name = NULL;
char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
__u64 usize;
@@ -610,17 +682,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
if (!!karg.build_id_size != !!karg.build_id_addr)
return -EINVAL;
- mm = priv->mm;
if (!mm || !mmget_not_zero(mm))
return -ESRCH;
- err = query_vma_setup(mm);
+ err = query_vma_setup(&lock_ctx);
if (err) {
mmput(mm);
return err;
}
- vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+ vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
vma = NULL;
@@ -705,7 +776,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
}
/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
- query_vma_teardown(mm, vma);
+ query_vma_teardown(&lock_ctx);
mmput(mm);
if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
@@ -725,7 +796,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
return 0;
out:
- query_vma_teardown(mm, vma);
+ query_vma_teardown(&lock_ctx);
mmput(mm);
kfree(name_buf);
return err;
@@ -738,7 +809,8 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l
switch (cmd) {
case PROCMAP_QUERY:
- return do_procmap_query(priv, (void __user *)arg);
+ /* priv->lock_ctx.mm is set during file open operation */
+ return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
default:
return -ENOIOCTLCMD;
}
@@ -1297,8 +1369,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- !!thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
+ THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1311,7 +1383,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct mem_size_stats mss = {};
- struct mm_struct *mm = priv->mm;
+ struct mm_struct *mm = priv->lock_ctx.mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
@@ -1456,9 +1528,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
goto out_free;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR_OR_NULL(priv->mm)) {
- ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
single_release(inode, file);
goto out_free;
@@ -1476,8 +1548,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
kfree(priv);
return single_release(inode, file);
@@ -1520,7 +1592,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
return false;
if (!is_cow_mapping(vma->vm_flags))
return false;
- if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
+ if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)))
return false;
folio = vm_normal_folio(vma, addr, pte);
if (!folio)
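With the per-VMA-lock variant of query_vma_find_by_addr() above, callers must handle three outcomes: a locked VMA, NULL, or an ERR_PTR. A hedged caller-side sketch; example_query() is illustrative only:

static int example_query(struct proc_maps_locking_ctx *lock_ctx,
			 unsigned long addr)
{
	struct vm_area_struct *vma = query_vma_find_by_addr(lock_ctx, addr);

	if (IS_ERR(vma))
		return PTR_ERR(vma);	/* locking failed */
	if (!vma)
		return -ENOENT;		/* no VMA at or above addr */
	/* ... inspect vma; it stays locked until query_vma_teardown() ... */
	return 0;
}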
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 59bfd61d653a..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -204,7 +204,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
+ mm = priv->lock_ctx.mm;
if (!mm || !mmget_not_zero(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
@@ -226,7 +226,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mm_struct *mm = priv->mm;
+ struct mm_struct *mm = priv->lock_ctx.mm;
if (!priv->task)
return;
@@ -259,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR_OR_NULL(priv->mm)) {
- int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
seq_release_private(inode, file);
return err;
@@ -276,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e75a6cec67be..ca41ce8208c4 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -107,7 +107,7 @@ static int do_readpage(struct folio *folio)
size_t offset = 0;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, folio->index, i_size, folio->flags);
+ inode->i_ino, folio->index, i_size, folio->flags.f);
ubifs_assert(c, !folio_test_checked(folio));
ubifs_assert(c, !folio->private);
@@ -600,7 +600,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio,
pgoff_t end_index;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, folio->index, i_size, folio->flags);
+ inode->i_ino, folio->index, i_size, folio->flags.f);
end_index = (i_size - 1) >> PAGE_SHIFT;
if (!i_size || folio->index > end_index) {
@@ -988,7 +988,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc)
int err, len = folio_size(folio);
dbg_gen("ino %lu, pg %lu, pg flags %#lx",
- inode->i_ino, folio->index, folio->flags);
+ inode->i_ino, folio->index, folio->flags.f);
ubifs_assert(c, folio->private != NULL);
/* Is the folio fully outside @i_size? (truncate in progress) */
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 74d0077cc5fa..efa6610acbc7 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn)
*/
#define __page_to_pfn(pg) \
({ const struct page *__pg = (pg); \
- int __sec = page_to_section(__pg); \
+ int __sec = memdesc_section(__pg->flags); \
(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
})
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe1797bbec42..28ceaeffc0c9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -199,7 +199,7 @@ struct gendisk {
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
- struct mempool_s *zone_wplugs_pool;
+ struct mempool *zone_wplugs_pool;
struct hlist_head *zone_wplugs_hash;
struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 1af241525a17..f6e0795db484 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
- void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);
+ void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE);
if (!p)
return ERR_PTR(-ENOMEM);
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9e62b2a85538..aa7381be388c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -110,7 +110,7 @@ struct damon_target {
*
* @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED.
* @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD.
- * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT.
+ * @DAMOS_PAGEOUT: Reclaim the region.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
@@ -121,10 +121,10 @@ struct damon_target {
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions
*
* The support of each action is up to running &struct damon_operations.
- * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except
- * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR
- * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum
- * DAMOS_LRU_DEPRIO, and &DAMOS_STAT.
+ * Refer to the 'Operation Action' section of
+ * Documentation/mm/damon/design.rst for the support status of each action.
+ *
+ * Note that DAMOS_PAGEOUT doesn't trigger demotions.
*/
enum damos_action {
DAMOS_WILLNEED,
@@ -748,7 +748,8 @@ struct damon_attrs {
* Accesses to other fields must be protected by themselves.
*
* @ops: Set of monitoring operations for given use cases.
- *
+ * @addr_unit: Scale factor for core to ops address conversion.
+ * @min_sz_region: Minimum region size.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -790,6 +791,8 @@ struct damon_ctx {
struct mutex kdamond_lock;
struct damon_operations ops;
+ unsigned long addr_unit;
+ unsigned long min_sz_region;
struct list_head adaptive_targets;
struct list_head schemes;
@@ -878,7 +881,7 @@ static inline void damon_insert_region(struct damon_region *r,
void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges);
+ unsigned int nr_ranges, unsigned long min_sz_region);
void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damon_attrs *attrs);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b..29ef70022da1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define THP_ORDERS_ALL \
(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
-#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
-#define TVA_IN_PF (1 << 1) /* Page fault handler */
-#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+enum tva_type {
+ TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */
+ TVA_PAGEFAULT, /* Serving a page fault. */
+ TVA_KHUGEPAGED, /* Khugepaged collapse. */
+ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */
+};
-#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
- (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
+#define thp_vma_allowable_order(vma, vm_flags, type, order) \
+ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order)))
#define split_folio(f) split_folio_to_list(f, NULL)
@@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders);
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
* @vm_flags: use these vm_flags instead of vma->vm_flags
- * @tva_flags: Which TVA flags to honour
+ * @type: TVA type
* @orders: bitfield of all orders to consider
*
* Calculates the intersection of the requested hugepage orders and the allowed
@@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- /* Optimization to check if required orders are enabled early. */
- if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
+ /*
+ * Optimization to check if required orders are enabled early. Only
+ * forced collapse ignores sysfs configs.
+ */
+ if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
if (vm_flags & VM_HUGEPAGE)
@@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
- return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+ return __thp_vma_allowable_orders(vma, vm_flags, type, orders);
}
struct thpsize {
@@ -318,16 +324,32 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+/*
+ * Check whether THPs are explicitly disabled for this VMA, for example,
+ * through madvise or prctl.
+ */
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
-{
+ vm_flags_t vm_flags, bool forced_collapse)
+{
+ /* Are THPs disabled for this VMA? */
+ if (vm_flags & VM_NOHUGEPAGE)
+ return true;
+ /* Are THPs disabled for all VMAs in the whole process? */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm))
+ return true;
+ /*
+	 * Are THPs disabled only for VMAs where we didn't get explicit
+	 * advice to use them?
+ */
+ if (vm_flags & VM_HUGEPAGE)
+ return false;
/*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
+	 * Forcing a collapse (e.g., MADV_COLLAPSE) is clear advice to
+ * use THPs.
*/
- return (vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+ if (forced_collapse)
+ return false;
+ return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm);
}
static inline bool thp_disabled_by_hw(void)
@@ -479,6 +501,8 @@ extern unsigned long huge_zero_pfn;
static inline bool is_huge_zero_folio(const struct folio *folio)
{
+ VM_WARN_ON_ONCE(!folio);
+
return READ_ONCE(huge_zero_folio) == folio;
}
@@ -495,6 +519,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return NULL;
+
+ if (unlikely(!huge_zero_folio))
+ return NULL;
+
+ return huge_zero_folio;
+}
+
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -526,7 +561,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
return 0;
@@ -685,6 +720,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
{
return 0;
}
+
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ return NULL;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
@@ -698,4 +738,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
+/**
+ * largest_zero_folio - Get the largest zero folio available
+ *
+ * Use this function when mm_get_huge_zero_folio() cannot be used because
+ * the caller has no appropriate mm whose lifetime could pin the huge zero
+ * folio.
+ *
+ * Determine the size of the returned folio with folio_size() instead of
+ * assuming a particular size.
+ *
+ * Return: pointer to the PMD-sized zero folio if
+ * CONFIG_PERSISTENT_HUGE_ZERO_FOLIO is enabled, otherwise to the
+ * single-page zero folio.
+ */
+static inline struct folio *largest_zero_folio(void)
+{
+ struct folio *folio = get_persistent_huge_zero_folio();
+
+ if (folio)
+ return folio;
+
+ return page_folio(ZERO_PAGE(0));
+}
#endif /* _LINUX_HUGE_MM_H */
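A usage sketch for largest_zero_folio(): zero-fill a buffer in folio_size() chunks without assuming which folio was returned. It assumes the zero folio is directly addressable (no highmem); example_zero_fill() is hypothetical:

static void example_zero_fill(char *dst, size_t len)
{
	struct folio *zf = largest_zero_folio();
	size_t chunk = folio_size(zf);	/* PMD size or PAGE_SIZE */

	while (len) {
		size_t n = min(len, chunk);

		memcpy(dst, folio_address(zf), n);
		dst += n;
		len -= n;
	}
}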
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index ff6120463745..eb1946a70cff 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
+#include <linux/mm.h>
+
extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
@@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm))
__khugepaged_enter(mm);
}
static inline void khugepaged_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
__khugepaged_exit(mm);
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c17b955e7b0b..22e67ca7cba3 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -56,13 +56,13 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
/* Adding mm to ksm is best effort on fork. */
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, oldmm))
__ksm_enter(mm);
}
static inline int ksm_execve(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return __ksm_enter(mm);
return 0;
@@ -70,7 +70,7 @@ static inline int ksm_execve(struct mm_struct *mm)
static inline void ksm_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, mm))
__ksm_exit(mm);
}
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index bafe143b1f78..41e633264e51 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -57,17 +57,17 @@
* MT_FLAGS_ALLOC_RANGE flag.
*
* Node types:
- * 0x??1 = Root
- * 0x?00 = 16 bit nodes
- * 0x010 = 32 bit nodes
- * 0x110 = 64 bit nodes
+ * 0b??1 = Root
+ * 0b?00 = 16 bit nodes
+ * 0b010 = 32 bit nodes
+ * 0b110 = 64 bit nodes
*
* Slot size and location in the parent pointer:
* type : slot location
- * 0x??1 : Root
- * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
- * 0x010 : 32 bit values, type in 0-2, slot in 3-6
- * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ * 0b??1 : Root
+ * 0b?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0b010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0b110 : 64 bit values, type in 0-2, slot in 3-6
*/
/*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 785173aa0739..e693978b2022 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -900,7 +900,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}
-void mem_cgroup_handle_over_high(gfp_t gfp_mask);
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask);
+
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+{
+ if (unlikely(current->memcg_nr_pages_over_high))
+ __mem_cgroup_handle_over_high(gfp_mask);
+}
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -1053,6 +1059,8 @@ extern int mem_cgroup_init(void);
#define MEM_CGROUP_ID_SHIFT 0
+#define root_mem_cgroup (NULL)
+
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
return NULL;
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 7b151441341b..34941a4b9026 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -15,7 +15,7 @@ struct kmem_cache;
typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);
-typedef struct mempool_s {
+typedef struct mempool {
spinlock_t lock;
int min_nr; /* nr of elements at *elements */
int curr_nr; /* Current nr of elements at *elements */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 4aa151914eab..aa1b6aa877a0 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -157,45 +157,52 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
return 1 << pgmap->vmemmap_shift;
}
-static inline bool is_device_private_page(const struct page *page)
+static inline bool folio_is_device_private(const struct folio *folio)
{
return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
-static inline bool folio_is_device_private(const struct folio *folio)
+static inline bool is_device_private_page(const struct page *page)
{
- return is_device_private_page(&folio->page);
+ return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ folio_is_device_private(page_folio(page));
}
-static inline bool is_pci_p2pdma_page(const struct page *page)
+static inline bool folio_is_pci_p2pdma(const struct folio *folio)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
-static inline bool is_device_coherent_page(const struct page *page)
+static inline bool is_pci_p2pdma_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_COHERENT;
+ return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ folio_is_pci_p2pdma(page_folio(page));
}
static inline bool folio_is_device_coherent(const struct folio *folio)
{
- return is_device_coherent_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_COHERENT;
}
-static inline bool is_fsdax_page(const struct page *page)
+static inline bool is_device_coherent_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX;
+ return folio_is_device_coherent(page_folio(page));
}
static inline bool folio_is_fsdax(const struct folio *folio)
{
- return is_fsdax_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
+static inline bool is_fsdax_page(const struct page *page)
+{
+ return folio_is_fsdax(page_folio(page));
}
#ifdef CONFIG_ZONE_DEVICE
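The memremap.h hunk inverts each page/folio predicate pair so that the folio test is authoritative and the page test derives from it via page_folio(). The pattern, shown generically with a hypothetical example_* pair:

static inline bool example_folio_is_foo(const struct folio *folio)
{
	/* the one authoritative check, done on the folio */
	return folio_is_zone_device(folio) &&
	       folio->pgmap->type == MEMORY_DEVICE_FS_DAX;
}

static inline bool example_page_is_foo(const struct page *page)
{
	/* the page form is derived, never duplicated */
	return example_folio_is_foo(page_folio(page));
}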
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9009e27b5f44..1f0ac122c3bf 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -12,14 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private);
struct migration_target_control;
-/*
- * Return values from addresss_space_operations.migratepage():
- * - negative errno on page migration failure;
- * - zero on page migration success;
- */
-#define MIGRATEPAGE_SUCCESS 0
-#define MIGRATEPAGE_UNMAP 1
-
/**
* struct movable_operations - Driver page migration
* @isolate_page:
@@ -35,8 +27,7 @@ struct migration_target_control;
* @src page. The driver should copy the contents of the
* @src page to the @dst page and set up the fields of @dst page.
* Both pages are locked.
- * If page migration is successful, the driver should
- * return MIGRATEPAGE_SUCCESS.
+ * If page migration is successful, the driver should return 0.
* If the driver cannot migrate the page at the moment, it can return
* -EAGAIN. The VM interprets this as a temporary migration failure and
* will retry it later. Any other error value is a permanent migration
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae97a0b8ec7..00c8a54127d3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -34,6 +34,8 @@
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/rcuwait.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
struct mempolicy;
struct anon_vma;
@@ -69,6 +71,15 @@ static inline void totalram_pages_add(long count)
extern void * high_memory;
+/*
+ * Convert between pages and MB
+ * 20 is the shift for 1MB (2^20 = 1MB)
+ * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages)
+ * So (20 - PAGE_SHIFT) converts between pages and MB
+ */
+#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
+#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
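A worked example of the new conversion macros; the assertions hold only under the stated 4 KiB page assumption:

#if PAGE_SHIFT == 12			/* 4 KiB pages: shift is 20 - 12 = 8 */
static_assert(MB_TO_PAGES(2) == 512);	/* 2 MiB     -> 512 pages */
static_assert(PAGES_TO_MB(512) == 2);	/* 512 pages -> 2 MiB     */
#endif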
@@ -648,13 +659,21 @@ struct vm_operations_struct {
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
- * (using pte_page()) would not find the correct page.
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows for returning a "normal" page from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not exist
+ * or should not be touched: "special".
+ *
+ * Do not add new users: this really only works when a "normal" page
+ * was mapped, but then the PTE got changed to something weird (+
+ * marked special) that would not make pte_pfn() identify the originally
+ * inserted page.
*/
- struct page *(*find_special_page)(struct vm_area_struct *vma,
- unsigned long addr);
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
};
#ifdef CONFIG_NUMA_BALANCING
@@ -703,6 +722,36 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
}
#endif /* CONFIG_PER_VMA_LOCK */
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
+{
+ return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
+{
+ return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_set(int flag, struct mm_struct *mm)
+{
+ set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear(int flag, struct mm_struct *mm)
+{
+ clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear_all(struct mm_struct *mm)
+{
+ bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
+}
+
extern const struct vm_operations_struct vma_dummy_vm_ops;
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
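With mm->flags now behind these accessors, open-coded bit operations are converted as in the fs/proc hunks above. A small sketch; example_mm_has_pinned() is hypothetical:

static inline bool example_mm_has_pinned(const struct mm_struct *mm)
{
	/* was: return test_bit(MMF_HAS_PINNED, &mm->flags); */
	return mm_flags_test(MMF_HAS_PINNED, mm);
}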
@@ -975,7 +1024,7 @@ static inline unsigned int compound_order(struct page *page)
{
struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 0;
return folio_large_order(folio);
}
@@ -1505,21 +1554,26 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
static inline int page_zone_id(struct page *page)
{
- return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
+ return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-int page_to_nid(const struct page *page);
+int memdesc_nid(memdesc_flags_t mdf);
#else
-static inline int page_to_nid(const struct page *page)
+static inline int memdesc_nid(memdesc_flags_t mdf)
{
- return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (mdf.f >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
+static inline int page_to_nid(const struct page *page)
+{
+ return memdesc_nid(PF_POISONED_CHECK(page)->flags);
+}
+
static inline int folio_nid(const struct folio *folio)
{
- return page_to_nid(&folio->page);
+ return memdesc_nid(folio->flags);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -1588,14 +1642,14 @@ static inline void page_cpupid_reset_last(struct page *page)
#else
static inline int folio_last_cpupid(struct folio *folio)
{
- return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
+ page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
@@ -1691,7 +1745,7 @@ static inline u8 page_kasan_tag(const struct page *page)
u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
- tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
@@ -1706,12 +1760,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
return;
tag ^= 0xff;
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
do {
flags = old_flags;
flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
@@ -1742,26 +1796,26 @@ static inline pg_data_t *page_pgdat(const struct page *page)
return NODE_DATA(page_to_nid(page));
}
-static inline struct zone *folio_zone(const struct folio *folio)
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
- return page_zone(&folio->page);
+ return NODE_DATA(folio_nid(folio));
}
-static inline pg_data_t *folio_pgdat(const struct folio *folio)
+static inline struct zone *folio_zone(const struct folio *folio)
{
- return page_pgdat(&folio->page);
+ return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
}
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
- page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
- page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+ page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+ page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
-static inline unsigned long page_to_section(const struct page *page)
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
- return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+ return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif
@@ -1900,7 +1954,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
- if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))
return false;
return folio_maybe_dma_pinned(folio);
@@ -1966,14 +2020,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
- page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
- page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+ page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT);
+ page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
- page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
- page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT);
+ page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_links(struct page *page, enum zone_type zone,
@@ -2015,7 +2069,7 @@ static inline long compound_nr(struct page *page)
{
struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 1;
return folio_large_nr_pages(folio);
}
@@ -2351,6 +2405,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
+struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t pud);
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 89b518ff097e..150302b4a905 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -143,7 +143,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset)
static inline int folio_lru_refs(struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
if (!(flags & BIT(PG_referenced)))
return 0;
@@ -156,7 +156,7 @@ static inline int folio_lru_refs(struct folio *folio)
static inline int folio_lru_gen(struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -268,7 +268,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+ set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
@@ -293,7 +293,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
/* for folio_migrate_flags() */
flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
- flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags);
gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
lru_gen_update_size(lruvec, folio, gen, -1);
@@ -304,9 +304,9 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
static inline void folio_migrate_refs(struct folio *new, struct folio *old)
{
- unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;
+ unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK;
- set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
+ set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs);
}
#else /* !CONFIG_LRU_GEN */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db93..d934a3a5b443 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/bitmap.h>
#include <asm/mmu.h>
@@ -33,6 +34,10 @@ struct address_space;
struct futex_private_hash;
struct mem_cgroup;
+typedef struct {
+ unsigned long f;
+} memdesc_flags_t;
+
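
The wrapper struct makes direct word operations on a flags field a type
error, so every access must name the .f member explicitly. A minimal sketch
(illustrative only, not part of the patch):

	memdesc_flags_t flags = { .f = 0 };

	set_bit(PG_head, &flags.f);	/* explicit access to the word */
	/* set_bit(PG_head, &flags);	   no longer compiles */
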
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -71,7 +76,7 @@ struct mem_cgroup;
#endif
struct page {
- unsigned long flags; /* Atomic flags, some possibly
+ memdesc_flags_t flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
@@ -382,7 +387,7 @@ struct folio {
union {
struct {
/* public: */
- unsigned long flags;
+ memdesc_flags_t flags;
union {
struct list_head lru;
/* private: avoid cluttering the output */
@@ -927,6 +932,15 @@ struct mm_cid {
};
#endif
+/*
+ * Opaque type representing current mm_struct flag state. Must be accessed via
+ * mm_flags_xxx() helper functions.
+ */
+#define NUM_MM_FLAG_BITS (64)
+typedef struct {
+ DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
+} __private mm_flags_t;
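
The mm_flags_*() accessors used at call sites throughout this patch are
expected to be thin wrappers around the atomic bitops on this bitmap; a
plausible sketch, assuming that convention:

	static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
	{
		return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
	}

	static inline void mm_flags_set(int flag, struct mm_struct *mm)
	{
		set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
	}
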
+
struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
@@ -1026,10 +1040,10 @@ struct mm_struct {
* counters
*/
/*
- * With some kernel config, the current mmap_lock's offset
- * inside 'mm_struct' is at 0x120, which is very optimal, as
+ * Typically the current mmap_lock's offset is 56 bytes from
+ * the last cacheline boundary, which is very optimal, as
* its two hot fields 'count' and 'owner' sit in 2 different
- * cachelines, and when mmap_lock is highly contended, both
+ * cachelines, and when mmap_lock is highly contended, both
* of the 2 fields will be accessed frequently, current layout
* will help to reduce cache bouncing.
*
@@ -1109,7 +1123,7 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
- unsigned long flags; /* Must use atomic bitops to access */
+	mm_flags_t flags;	/* Must use mm_flags_* helpers to access */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
@@ -1219,6 +1233,40 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+/* Set the first system word of mm flags, non-atomically. */
+static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ bitmap_copy(bitmap, &value, BITS_PER_LONG);
+}
+
+/* Obtain a read-only view of the bitmap. */
+static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
+{
+ return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
+}
+
+/* Read the first system word of mm flags, non-atomically. */
+static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
+{
+ const unsigned long *bitmap = __mm_flags_get_bitmap(mm);
+
+ return bitmap_read(bitmap, 0, BITS_PER_LONG);
+}
+
+/*
+ * Update the first system word of mm flags ONLY, applying the specified mask to
+ * it, then setting all flags specified by bits.
+ */
+static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
+ unsigned long mask, unsigned long bits)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ set_mask_bits(bitmap, mask, bits);
+}
+
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
@@ -1719,7 +1767,7 @@ enum {
* the modes are SUID_DUMP_* defined in linux/sched/coredump.h
*/
#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -1734,13 +1782,13 @@ enum {
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS 9
#define MMF_DUMP_FILTER_MASK \
- (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+ ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
- ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+ (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \
+ BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF 0
#endif
@@ -1748,19 +1796,16 @@ enum {
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
+#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
+#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */
+#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \
+ BIT(MMF_DISABLE_THP_EXCEPT_ADVISED))
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
@@ -1773,27 +1818,33 @@ enum {
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_HAS_MDWE 28
-#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
-
+#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE)
#define MMF_HAS_MDWE_NO_INHERIT 29
#define MMF_VM_MERGE_ANY 30
-#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
+#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY)
#define MMF_TOPDOWN 31 /* mm searches top down by default */
-#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
+#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
-static inline unsigned long mmf_init_flags(unsigned long flags)
+/* Legacy flags must fit within 32 bits. */
+static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX);
+
+/*
+ * Initialise legacy flags according to masks, propagating selected flags on
+ * fork. Further flag manipulation can be performed by the caller.
+ */
+static inline unsigned long mmf_init_legacy_flags(unsigned long flags)
{
if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
flags &= ~((1UL << MMF_HAS_MDWE) |
(1UL << MMF_HAS_MDWE_NO_INHERIT));
- return flags & MMF_INIT_MASK;
+ return flags & MMF_INIT_LEGACY_MASK;
}
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index de9e8e6229a4..0ba8a7e8b90a 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
- if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
return false;
/* If the new VMA is not executable, we have nothing to deny. */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 11a078de9150..2c9fffa58714 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -148,91 +148,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
}
/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- *
- * WARNING! The vma passed to this function cannot be used if the function
- * fails to lock it because in certain cases RCU lock is dropped and then
- * reacquired. Once RCU lock is dropped the vma can be concurently freed.
- */
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- int oldcnt;
-
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
- return NULL;
-
- /*
- * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
- * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
- * Acquire fence is required here to avoid reordering against later
- * vm_lock_seq check and checks inside lock_vma_under_rcu().
- */
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT))) {
- /* return EAGAIN if vma got detached from under us */
- return oldcnt ? NULL : ERR_PTR(-EAGAIN);
- }
-
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
-
- /*
- * If vma got attached to another mm from under us, that mm is not
- * stable and can be freed in the narrow window after vma->vm_refcnt
- * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
- * releasing vma->vm_refcnt.
- */
- if (unlikely(vma->vm_mm != mm)) {
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
- struct mm_struct *other_mm = vma->vm_mm;
-
- /*
- * __mmdrop() is a heavy operation and we don't need RCU
- * protection here. Release RCU lock during these operations.
- * We reinstate the RCU read lock as the caller expects it to
- * be held when this function returns even on error.
- */
- rcu_read_unlock();
- mmgrab(other_mm);
- vma_refcount_put(vma);
- mmdrop(other_mm);
- rcu_read_lock();
- return NULL;
- }
-
- /*
- * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
- vma_refcount_put(vma);
- return NULL;
- }
-
- return vma;
-}
-
-/*
* Use only while holding mmap read lock which guarantees that locking will not
* fail (nobody can concurrently write-lock the vma). vma_start_read() should
* not be used in such cases because it might fail due to mm_lock_seq overflow.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c5da9141983..f3272ef5131b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -234,7 +234,21 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
- PGPROMOTE_CANDIDATE, /* candidate pages to promote */
+	/*
+	 * Candidate pages for promotion based on hint fault latency. This
+	 * counter is used to control the promotion rate and adjust the hot
+	 * threshold.
+	 */
+	PGPROMOTE_CANDIDATE,
+	/*
+	 * Not rate-limited (NRL) candidate pages, which can be promoted
+	 * without considering the hot threshold because the fast-tier node
+	 * has enough free pages. These promotions bypass the regular
+	 * hotness checks and do NOT influence the promotion rate-limiter or
+	 * threshold-adjustment logic.
+	 * This is for statistics/monitoring purposes.
+	 */
+	PGPROMOTE_CANDIDATE_NRL,
#endif
/* PGDEMOTE_*: pages demoted */
PGDEMOTE_KSWAPD,
@@ -245,6 +259,7 @@ enum node_stat_item {
NR_HUGETLB,
#endif
NR_BALLOON_PAGES,
+ NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
};
@@ -1169,26 +1184,31 @@ static inline bool zone_is_empty(struct zone *zone)
#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
+static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
+{
+ ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
+ return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
static inline enum zone_type page_zonenum(const struct page *page)
{
- ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
- return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+ return memdesc_zonenum(page->flags);
}
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
- return page_zonenum(&folio->page);
+ return memdesc_zonenum(folio->flags);
}
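
Because memdesc_zonenum() takes memdesc_flags_t by value, zone predicates no
longer need a struct page or folio in hand. A hypothetical helper in the
same style:

	static inline bool memdesc_is_movable(memdesc_flags_t mdf)
	{
		return memdesc_zonenum(mdf) == ZONE_MOVABLE;
	}
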
#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
- return page_zonenum(page) == ZONE_DEVICE;
+ return memdesc_zonenum(mdf) == ZONE_DEVICE;
}
static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
- VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page);
+ VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page);
return page_folio(page)->pgmap;
}
@@ -1203,9 +1223,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
const struct page *b)
{
- if (is_zone_device_page(a) != is_zone_device_page(b))
+ if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags))
return false;
- if (!is_zone_device_page(a))
+ if (!memdesc_is_zone_device(a->flags))
return true;
return page_pgmap(a) == page_pgmap(b);
}
@@ -1213,7 +1233,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
return false;
}
@@ -1228,9 +1248,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
}
#endif
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return memdesc_is_zone_device(page->flags);
+}
+
static inline bool folio_is_zone_device(const struct folio *folio)
{
- return is_zone_device_page(&folio->page);
+ return memdesc_is_zone_device(folio->flags);
}
static inline bool is_zone_movable_page(const struct page *page)
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 98c96d649bf9..72ee7d210a74 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -21,7 +21,7 @@
#include <linux/rolling_buffer.h>
enum netfs_sreq_ref_trace;
-typedef struct mempool_s mempool_t;
+typedef struct mempool mempool_t;
struct folio_queue;
/**
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..7b02bc1d0a7e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
*/
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
{
- if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+ if (unlikely(mm_flags_test(MMF_UNSTABLE, mm)))
return VM_FAULT_SIGBUS;
return 0;
}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8d3fa3a91ce4..d53a86e68c89 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page
* cold cacheline in some cases.
*/
if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
- test_bit(PG_head, &page->flags)) {
+ test_bit(PG_head, &page->flags.f)) {
/*
* We can safely access the field of the @page[1] with PG_head
* because the @page is a compound page composed with at least
@@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page)
static __always_inline int PageCompound(const struct page *page)
{
- return test_bit(PG_head, &page->flags) ||
+ return test_bit(PG_head, &page->flags.f) ||
READ_ONCE(page->compound_head) & 1;
}
#define PAGE_POISON_PATTERN -1l
static inline int PagePoisoned(const struct page *page)
{
- return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
+ return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN;
}
#ifdef CONFIG_DEBUG_VM
@@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio,
const struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
static unsigned long *folio_flags(struct folio *folio, unsigned n)
@@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
/*
@@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page)
#define TESTPAGEFLAG(uname, lname, policy) \
FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
static __always_inline int Page##uname(const struct page *page) \
-{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); }
#define SETPAGEFLAG(uname, lname, policy) \
FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void SetPage##uname(struct page *page) \
-{ set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define CLEARPAGEFLAG(uname, lname, policy) \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void ClearPage##uname(struct page *page) \
-{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __SETPAGEFLAG(uname, lname, policy) \
__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void __SetPage##uname(struct page *page) \
-{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
-{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTSETFLAG(uname, lname, policy) \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
-{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTCLEARFLAG(uname, lname, policy) \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
-{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
@@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio)
static __always_inline int PageHead(const struct page *page)
{
PF_POISONED_CHECK(page);
- return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
+ return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page);
}
__SETPAGEFLAG(Head, head, PF_ANY)
@@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page)
*/
if (PageHuge(page))
page = compound_head(page);
- return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void SetPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
#ifdef CONFIG_MMU
@@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
static inline int folio_has_private(const struct folio *folio)
{
- return !!(folio->flags & PAGE_FLAGS_PRIVATE);
+ return !!(folio->flags.f & PAGE_FLAGS_PRIVATE);
}
#undef PF_ANY
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 6a44be0f39f4..e046278a01fa 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -13,12 +13,11 @@
#include <linux/types.h>
-#define PB_migratetype_bits 3
/* Bit indices that affect a whole block of pages */
enum pageblock_bits {
- PB_migrate,
- PB_migrate_end = PB_migrate + PB_migratetype_bits - 1,
- /* 3 bits required for migrate types */
+ PB_migrate_0,
+ PB_migrate_1,
+ PB_migrate_2,
PB_compact_skip,/* If set the block is skipped by compaction */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -37,11 +36,10 @@ enum pageblock_bits {
#define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS))
-#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1)
+#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2))
#ifdef CONFIG_MEMORY_ISOLATION
-#define MIGRATETYPE_AND_ISO_MASK \
- (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate))
+#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate))
#else
#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK
#endif
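
Since PB_migrate_0 is bit 0 of the pageblock bits, the migratetype can still
be extracted with a plain mask and no shift (sketch of the assumed usage;
"word" stands for a pageblock bitmap word):

	int mt = word & MIGRATETYPE_MASK;	/* values 0..7 */
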
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 12a12dae727d..f0dfdfb13cd9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -211,6 +211,8 @@ enum mapping_flags {
folio contents */
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
+ AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
+ account usage to user cgroups */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5d3a0cccc6bf..63be5a451627 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch)
fbatch->i = 0;
}
-static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_count(const struct folio_batch *fbatch)
{
return fbatch->nr;
}
-static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_space(const struct folio_batch *fbatch)
{
return PAGEVEC_SIZE - fbatch->nr;
}
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 8a7f4f802c57..38a82d65e58e 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
if (static_key_enabled(&mem_profiling_compressed)) {
pgalloc_tag_idx idx;
- idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask;
+ idx = (page->flags.f >> alloc_tag_ref_offs) &
+ alloc_tag_ref_mask;
idx_to_ref(idx, ref);
handle->page = page;
} else {
@@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code
idx = (unsigned long)ref_to_idx(ref);
idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs;
do {
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
flags = old_flags;
flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs);
flags |= idx;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
} else {
if (WARN_ON(!handle.ref || !ref))
return;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2b80fd456c8b..94249e671a7e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1975,6 +1975,32 @@ static inline bool arch_has_pfn_modify_check(void)
/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;
+enum pgtable_level {
+ PGTABLE_LEVEL_PTE = 0,
+ PGTABLE_LEVEL_PMD,
+ PGTABLE_LEVEL_PUD,
+ PGTABLE_LEVEL_P4D,
+ PGTABLE_LEVEL_PGD,
+};
+
+static inline const char *pgtable_level_to_str(enum pgtable_level level)
+{
+ switch (level) {
+ case PGTABLE_LEVEL_PTE:
+ return "pte";
+ case PGTABLE_LEVEL_PMD:
+ return "pmd";
+ case PGTABLE_LEVEL_PUD:
+ return "pud";
+ case PGTABLE_LEVEL_P4D:
+ return "p4d";
+ case PGTABLE_LEVEL_PGD:
+ return "pgd";
+ default:
+ return "unknown";
+ }
+}
+
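
A typical consumer of the string helper would be diagnostics keyed by level,
e.g. (hypothetical call site):

	pr_warn("unexpected %s entry\n", pgtable_level_to_str(PGTABLE_LEVEL_PMD));
	/* prints: unexpected pmd entry */
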
#endif /* !__ASSEMBLY__ */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..e8aff6d2deda 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -394,18 +394,8 @@ typedef int __bitwise rmap_t;
/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0))
-/*
- * Internally, we're using an enum to specify the granularity. We make the
- * compiler emit specialized code for each granularity.
- */
-enum rmap_level {
- RMAP_LEVEL_PTE = 0,
- RMAP_LEVEL_PMD,
- RMAP_LEVEL_PUD,
-};
-
-static inline void __folio_rmap_sanity_checks(const struct folio *folio,
- const struct page *page, int nr_pages, enum rmap_level level)
+static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
+ const struct page *page, int nr_pages, enum pgtable_level level)
{
/* hugetlb folios are handled separately. */
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
@@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
/*
* We don't support folios larger than a single PMD yet. So
- * when RMAP_LEVEL_PMD is set, we assume that we are creating
+ * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
* a single "entire" mapping of the folio.
*/
VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Assume that we are creating a single "entire" mapping of the
* folio.
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
break;
default:
- VM_WARN_ON_ONCE(true);
+ BUILD_BUG();
}
/*
@@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
atomic_inc(&folio->_mapcount);
break;
@@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
}
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE);
#else
WARN_ON_ONCE(true);
#endif
@@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, enum rmap_level level)
+ struct vm_area_struct *src_vma, enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
bool maybe_pinned;
@@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
* copying if the folio maybe pinned.
*/
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (unlikely(maybe_pinned)) {
for (i = 0; i < nr_pages; i++)
if (PageAnonExclusive(page + i))
@@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
} while (page++, --nr_pages > 0);
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (PageAnonExclusive(page)) {
if (unlikely(maybe_pinned))
return -EBUSY;
@@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
return 0;
}
@@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
- src_vma, RMAP_LEVEL_PTE);
+ src_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
@@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
- src_vma, RMAP_LEVEL_PMD);
+ src_vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
@@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
}
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level)
+ struct page *page, int nr_pages, enum pgtable_level level)
{
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
@@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
struct page *page)
{
- return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+ return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
/**
@@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 6eb65ceed213..b7fafe999073 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,6 +8,20 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
+static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+{
+	/*
+	 * By convention, the dumpable bits are contained in the first 32 bits
+	 * of the bitmap, so we can simply access the first unsigned long
+	 * directly.
+	 */
+ return __mm_flags_get_word(mm);
+}
+
+static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
+{
+ __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
+}
+
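
set_dumpable() itself is unchanged by this hunk; presumably it sits on top
of the new wrapper, along these lines (a sketch, not the actual
implementation):

	void set_dumpable(struct mm_struct *mm, int value)
	{
		if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
			return;
		__mm_flags_set_mask_dumpable(mm, value);
	}
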
extern void set_dumpable(struct mm_struct *mm, int value);
/*
* This returns the actual value of the suid_dumpable flag. For things
@@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags)
static inline int get_dumpable(struct mm_struct *mm)
{
- return __get_dumpable(mm->flags);
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
+ return __get_dumpable(flags);
}
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d5a8ab98035c..6dc300bac2a1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -465,9 +465,13 @@ int kmem_cache_shrink(struct kmem_cache *s);
/*
* Common kmalloc functions provided by all allocators
*/
-void * __must_check krealloc_noprof(const void *objp, size_t new_size,
- gfp_t flags) __realloc_size(2);
-#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__))
+void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size,
+ unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE)
+#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__))
+#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n)
+#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE)
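
Existing krealloc() callers are unaffected; new callers can additionally pin
a NUMA node or an alignment, e.g. (illustrative):

	buf = krealloc_node(buf, new_size, GFP_KERNEL, nid);
	buf = krealloc(buf, new_size, GFP_KERNEL);	/* expands as before */
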
void kfree(const void *objp);
void kfree_sensitive(const void *objp);
@@ -1041,18 +1045,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1);
-#define kvmalloc_node_noprof(size, flags, node) \
- __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node)
-#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
-
-#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE)
-#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node) __alloc_size(1);
+#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \
+ __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node)
+#define kvmalloc_node_align(...) \
+ alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__))
+#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n)
+#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE)
#define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO)
#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
+
#define kmem_buckets_valloc(_b, _size, _flags) \
- alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
+ alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE))
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
@@ -1062,7 +1068,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- return kvmalloc_node_noprof(bytes, flags, node);
+ return kvmalloc_node_align_noprof(bytes, 1, flags, node);
}
#define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
@@ -1073,9 +1079,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
#define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__))
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define kvrealloc_node_align(...) \
+ alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__))
+#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n)
+#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE)
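
The kv* family follows the same scheme (as does vrealloc() later in this
patch); for example, a page-aligned, node-local buffer could be obtained and
grown with (illustrative):

	p = kvmalloc_node_align(size, PAGE_SIZE, GFP_KERNEL, nid);
	p = kvrealloc_node_align(p, 2 * size, PAGE_SIZE, GFP_KERNEL, nid);
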
extern void kvfree(const void *addr);
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7012a0f758d8..a2bb20841616 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,7 +310,6 @@ struct swap_info_struct {
/* list of cluster that contains at least one free slot */
struct list_head frag_clusters[SWAP_NR_ORDERS];
/* list of cluster that are fragmented or contented */
- atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
@@ -321,11 +320,8 @@ struct swap_info_struct {
struct completion comp; /* seldom referenced */
spinlock_t lock; /*
* protect map scan related fields like
- * swap_map, lowest_bit, highest_bit,
- * inuse_pages, cluster_next,
- * cluster_nr, lowest_alloc,
- * highest_alloc, free/discard cluster
- * list. other fields are only changed
+ * swap_map, inuse_pages and all cluster
+ * lists. other fields are only changed
* at swapon/swapoff, so are protected
* by swap_lock. changing flags need
* hold this lock and swap_lock. If
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 2759dac6be44..eb54b7b3202f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1
extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__))
-void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
+void *__must_check vrealloc_node_align_noprof(const void *p, size_t size,
+ unsigned long align, gfp_t flags, int nid) __realloc_size(2);
+#define vrealloc_node_noprof(_p, _s, _f, _nid) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid)
+#define vrealloc_noprof(_p, _s, _f) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE)
+#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__))
+#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__))
+#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a2848d731a46..2a7e134d03ee 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -360,12 +360,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb);
struct folio *writeback_iter(struct address_space *mapping,
struct writeback_control *wbc, struct folio *folio, int *error);
-typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
- void *data);
-
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 383c09f583ac..37195edf2498 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -38,25 +38,32 @@ TRACE_EVENT(cma_release,
TRACE_EVENT(cma_alloc_start,
- TP_PROTO(const char *name, unsigned long count, unsigned int align),
+ TP_PROTO(const char *name, unsigned long request_count, unsigned long available_count,
+ unsigned long total_count, unsigned int align),
- TP_ARGS(name, count, align),
+ TP_ARGS(name, request_count, available_count, total_count, align),
TP_STRUCT__entry(
__string(name, name)
- __field(unsigned long, count)
+ __field(unsigned long, request_count)
+ __field(unsigned long, available_count)
+ __field(unsigned long, total_count)
__field(unsigned int, align)
),
TP_fast_assign(
__assign_str(name);
- __entry->count = count;
+ __entry->request_count = request_count;
+ __entry->available_count = available_count;
+ __entry->total_count = total_count;
__entry->align = align;
),
- TP_printk("name=%s count=%lu align=%u",
+ TP_printk("name=%s request_count=%lu available_count=%lu total_count=%lu align=%u",
__get_str(name),
- __entry->count,
+ __entry->request_count,
+ __entry->available_count,
+ __entry->total_count,
__entry->align)
);
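
With the extra fields, a resulting trace line would look like (made-up
values):

	cma_alloc_start: name=reserved request_count=64 available_count=512 total_count=8192 align=8
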
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 474358773abe..7f93e754da5c 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -22,6 +22,7 @@ TRACE_EVENT(kmem_cache_alloc,
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
+ __string( name, s->name )
__field( size_t, bytes_req )
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
@@ -32,6 +33,7 @@ TRACE_EVENT(kmem_cache_alloc,
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
+ __assign_str(name);
__entry->bytes_req = s->object_size;
__entry->bytes_alloc = s->size;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
@@ -41,9 +43,10 @@ TRACE_EVENT(kmem_cache_alloc,
(s->flags & SLAB_ACCOUNT)) : false;
),
- TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
+ TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
+ __get_str(name),
__entry->bytes_req,
__entry->bytes_alloc,
show_gfp_flags(__entry->gfp_flags),
diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h
index fe33a255b7d0..ea6b5c4baf3d 100644
--- a/include/trace/events/page_ref.h
+++ b/include/trace/events/page_ref.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->flags = page->flags;
+ __entry->flags = page->flags.f;
__entry->count = page_ref_count(page);
__entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
@@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->flags = page->flags;
+ __entry->flags = page->flags.f;
__entry->count = page_ref_count(page);
__entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 1f9bb10d1a47..8fbbe613611a 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -66,10 +66,16 @@ enum {
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI. New bits are OK, but existing bits can never change.
+ * Enabling zone reclaim means the page allocator will attempt to fulfill
+ * the allocation request on the current node by triggering reclaim and
+ * trying to shrink the current node.
+ * Fallback allocations on the next candidates in the zonelist are considered
+ * when reclaim fails to free up enough memory in the current node/zone.
+ *
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl.
+ * New bits are OK, but existing bits should not be changed.
*/
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index ed3aed264aeb..51c4e8c82b1e 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -177,7 +177,17 @@ struct prctl_mm_map {
#define PR_GET_TID_ADDRESS 40
+/*
+ * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0
+ * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively
+ * return "1" when no flags were specified for PR_SET_THP_DISABLE.
+ */
#define PR_SET_THP_DISABLE 41
+/*
+ * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE /
+ * VM_HUGEPAGE, MADV_COLLAPSE).
+ */
+# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#define PR_GET_THP_DISABLE 42
/*
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7ca1940607bd..31a12b60055f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1153,15 +1153,15 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
* the task can hit this breakpoint right after __replace_page().
*/
- first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm);
if (first_uprobe)
- set_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_set(MMF_HAS_UPROBES, mm);
ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
- clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_RECALC_UPROBES, mm);
else if (first_uprobe)
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
return ret;
}
@@ -1171,7 +1171,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
- set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, mm);
return set_orig_insn(&uprobe->arch, vma, vaddr);
}
@@ -1303,7 +1303,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, vma, info->vaddr);
- } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) {
if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
@@ -1595,7 +1595,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (vma->vm_file &&
(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
- test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+ mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm))
delayed_ref_ctr_inc(vma);
if (!valid_vma(vma, true))
@@ -1655,12 +1655,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
- test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) ||
+ mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm))
return;
if (vma_has_uprobes(vma, start, end))
- set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm);
}
static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
@@ -1823,10 +1823,10 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
- set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) {
+ mm_flags_set(MMF_HAS_UPROBES, newmm);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
- set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, newmm);
}
}
@@ -2370,7 +2370,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
return;
}
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
}
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
@@ -2468,7 +2468,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
*is_swbp = -EFAULT;
}
- if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm))
mmf_recalc_uprobes(mm);
mmap_read_unlock(mm);
@@ -2818,7 +2818,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
if (!current->mm)
return 0;
- if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
(!current->utask || !current->utask->return_instances))
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index af673856499d..5115be549234 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
+ mm_flags_clear_all(mm);
if (current->mm) {
- mm->flags = mmf_init_flags(current->mm->flags);
+ unsigned long flags = __mm_flags_get_word(current->mm);
+
+ __mm_flags_set_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- mm->flags = default_dump_filter;
+ __mm_flags_set_word(mm, default_dump_filter);
mm->def_flags = 0;
}
@@ -1884,7 +1887,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
/* We need to synchronize with __set_oom_adj */
mutex_lock(&oom_adj_mutex);
- set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
/* Update the values in case they were changed after copy_signal */
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index ecd1ac210dbd..8079fc4b9189 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -405,6 +405,7 @@ static int __init kho_parse_scratch_size(char *p)
{
size_t len;
unsigned long sizes[3];
+ size_t total_size = 0;
int i;
if (!p)
@@ -441,11 +442,19 @@ static int __init kho_parse_scratch_size(char *p)
}
sizes[i] = memparse(p, &endp);
- if (!sizes[i] || endp == p)
+ if (endp == p)
return -EINVAL;
p = endp;
+ total_size += sizes[i];
}
+ if (!total_size)
+ return -EINVAL;
+
+ /* The string should be fully consumed by now. */
+ if (*p)
+ return -EINVAL;
+
scratch_size_lowmem = sizes[0];
scratch_size_global = sizes[1];
scratch_size_pernode = sizes[2];
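
The accepted command-line format, as implied by the parser, is three
comma-separated sizes; zero entries are now allowed provided the total is
non-zero and nothing trails the last size (hypothetical example values):

	kho_scratch=256M,512M,128M	(lowmem, global, per-node)
	kho_scratch=0,1G,0		(also valid after this change)
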
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index b521d0455992..7484d8ad5767 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -796,7 +796,7 @@ kfree_scale_thread(void *arg)
pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
(unsigned long long)(end_time - start_time), kfree_loops,
rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
- (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+ PAGES_TO_MB(mem_begin - mem_during));
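
PAGES_TO_MB()/MB_TO_PAGES() are introduced elsewhere in this series and
presumably encapsulate the open-coded shifts being removed here:

	#define PAGES_TO_MB(pages)	((pages) >> (20 - PAGE_SHIFT))
	#define MB_TO_PAGES(mb)		((mb) << (20 - PAGE_SHIFT))
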
if (shutdown) {
smp_mb(); /* Assign before wake. */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..e256793b9a08 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
* by the PTE scanner and NUMA hinting faults should be trapped based
* on resident pages
*/
- nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
rss = get_mm_rss(p->mm);
if (!rss)
rss = nr_scan_pages;
@@ -1923,17 +1923,18 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
struct pglist_data *pgdat;
unsigned long rate_limit;
unsigned int latency, th, def_th;
+ long nr = folio_nr_pages(folio);
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat)) {
/* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0;
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
return true;
}
def_th = sysctl_numa_balancing_hot_threshold;
- rate_limit = sysctl_numa_balancing_promote_rate_limit << \
- (20 - PAGE_SHIFT);
+ rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
@@ -1941,8 +1942,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
if (latency >= th)
return false;
- return !numa_promotion_rate_limit(pgdat, rate_limit,
- folio_nr_pages(folio));
+ return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1e28b40053ce..a46d9b75880b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2392,9 +2392,9 @@ static inline unsigned long get_current_mdwe(void)
{
unsigned long ret = 0;
- if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (mm_flags_test(MMF_HAS_MDWE, current->mm))
ret |= PR_MDWE_REFUSE_EXEC_GAIN;
- if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm))
ret |= PR_MDWE_NO_INHERIT;
return ret;
@@ -2427,9 +2427,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
return -EPERM; /* Cannot unset the flags */
if (bits & PR_MDWE_NO_INHERIT)
- set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
+ mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
- set_bit(MMF_HAS_MDWE, &current->mm->flags);
+ mm_flags_set(MMF_HAS_MDWE, current->mm);
return 0;
}
@@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len)
return sizeof(mm->saved_auxv);
}
+static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ /* If disabled, we return "1 | flags", otherwise 0. */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ return 1;
+ else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm))
+ return 1 | PR_THP_DISABLE_EXCEPT_ADVISED;
+ return 0;
+}
+
+static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 || arg5)
+ return -EINVAL;
+
+ /* Flags are only allowed when disabling. */
+ if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
+ return -EINVAL;
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ if (thp_disable) {
+ if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ } else {
+ mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ } else {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ mmap_write_unlock(current->mm);
+ return 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
+ error = prctl_get_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_SET_THP_DISABLE:
- if (arg3 || arg4 || arg5)
- return -EINVAL;
- if (mmap_write_lock_killable(me->mm))
- return -EINTR;
- if (arg2)
- set_bit(MMF_DISABLE_THP, &me->mm->flags);
- else
- clear_bit(MMF_DISABLE_THP, &me->mm->flags);
- mmap_write_unlock(me->mm);
+ error = prctl_set_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
@@ -2770,7 +2805,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm);
break;
#endif
case PR_RISCV_V_SET_CONTROL:
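
From userspace, the reworked prctl pair behaves as follows; a minimal sketch (the PR_THP_DISABLE_EXCEPT_ADVISED value below is assumed for illustration — take it from uapi/linux/prctl.h):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
	#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)	/* assumed value */
	#endif

	int main(void)
	{
		/* Disable THP except where madvise() explicitly asks for it. */
		if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED,
			  0, 0))
			perror("PR_SET_THP_DISABLE");

		/* 0: enabled, 1: fully disabled, 1 | flags: except-advised. */
		printf("state: %d\n", prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
		return 0;
	}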
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index b4ee2d29d7a9..c57a4615bdff 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -405,11 +405,11 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt)
* a reuse of the last bit in the node type. This is possible by using bit 1 to
* indicate if bit 2 is part of the type or the slot.
*
- * Note types:
- * 0x??1 = Root
- * 0x?00 = 16 bit nodes
- * 0x010 = 32 bit nodes
- * 0x110 = 64 bit nodes
+ * Node types:
+ * 0b??1 = Root
+ * 0b?00 = 16 bit nodes
+ * 0b010 = 32 bit nodes
+ * 0b110 = 64 bit nodes
*
* Slot size and alignment
* 0b??1 : Root
@@ -427,7 +427,7 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt)
#define MAPLE_PARENT_16B_SLOT_MASK 0xFC
#define MAPLE_PARENT_RANGE64 0x06
-#define MAPLE_PARENT_RANGE32 0x04
+#define MAPLE_PARENT_RANGE32 0x02
#define MAPLE_PARENT_NOT_RANGE16 0x02
/*
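
With the fixed value, the parent-pointer type bits match the corrected table: bit 0 marks root, a clear bit 1 marks a 16-bit node (bit 2 then belongs to the slot), and otherwise bit 2 picks between the 32-bit (0b010) and 64-bit (0b110) encodings. A small decoding sketch under those assumptions:

	enum parent_kind { ROOT, RANGE16, RANGE32, RANGE64 };

	static enum parent_kind parent_kind(unsigned long parent)
	{
		if (parent & 0x1)	/* 0b??1: root */
			return ROOT;
		if (!(parent & 0x2))	/* 0b?00: 16 bit nodes */
			return RANGE16;
		return (parent & 0x4) ? RANGE64 : RANGE32; /* 0b110 / 0b010 */
	}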
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 3e555d012ed6..fde0f0e556f8 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -184,8 +184,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
static struct lock_class_key __key;
tbl = alloc_hooks_tag(ht->alloc_tag,
- kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets),
- gfp|__GFP_ZERO, NUMA_NO_NODE));
+ kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets),
+ 1, gfp|__GFP_ZERO, NUMA_NO_NODE));
size = nbuckets;
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 761725bc713c..83e3d8208a54 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -140,7 +140,7 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
return (mdevice->zone_device_type ==
- HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE);
}
static enum migrate_vma_direction
diff --git a/lib/test_kho.c b/lib/test_kho.c
index c2eb899c3b45..fe8504e3407b 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -67,13 +67,20 @@ static struct notifier_block kho_test_nb = {
static int kho_test_save_data(struct kho_test_state *state, void *fdt)
{
- phys_addr_t *folios_info __free(kvfree) = NULL;
+ phys_addr_t *folios_info;
int err = 0;
- folios_info = kvmalloc_array(state->nr_folios, sizeof(*folios_info),
- GFP_KERNEL);
- if (!folios_info)
- return -ENOMEM;
+ err |= fdt_begin_node(fdt, "data");
+ err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
+ sizeof(state->nr_folios));
+ err |= fdt_property_placeholder(fdt, "folios_info",
+ state->nr_folios * sizeof(*folios_info),
+ (void **)&folios_info);
+ err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
+ err |= fdt_end_node(fdt);
+
+ if (err)
+ return err;
for (int i = 0; i < state->nr_folios; i++) {
struct folio *folio = state->folios[i];
@@ -83,17 +90,9 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt)
err = kho_preserve_folio(folio);
if (err)
- return err;
+ break;
}
- err |= fdt_begin_node(fdt, "data");
- err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
- sizeof(state->nr_folios));
- err |= fdt_property(fdt, "folios_info", folios_info,
- state->nr_folios * sizeof(*folios_info));
- err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
- err |= fdt_end_node(fdt);
-
return err;
}
@@ -140,7 +139,10 @@ static int kho_test_generate_data(struct kho_test_state *state)
unsigned int size;
void *addr;
- /* cap allocation so that we won't exceed max_mem */
+ /*
+ * Since get_order() rounds up, make sure the actual
+ * allocation is small enough that we won't exceed max_mem
+ */
if (alloc_size + (PAGE_SIZE << order) > max_mem) {
order = get_order(max_mem - alloc_size);
if (order)
@@ -165,13 +167,14 @@ static int kho_test_generate_data(struct kho_test_state *state)
err_free_folios:
for (int i = 0; i < state->nr_folios; i++)
folio_put(state->folios[i]);
+ state->nr_folios = 0;
return -ENOMEM;
}
static int kho_test_save(void)
{
struct kho_test_state *state = &kho_test_state;
- struct folio **folios __free(kvfree) = NULL;
+ struct folio **folios;
unsigned long max_nr;
int err;
@@ -185,13 +188,23 @@ static int kho_test_save(void)
err = kho_test_generate_data(state);
if (err)
- return err;
+ goto err_free_folios;
err = kho_test_prepare_fdt(state);
if (err)
- return err;
+ goto err_free_folios;
- return register_kho_notifier(&kho_test_nb);
+ err = register_kho_notifier(&kho_test_nb);
+ if (err)
+ goto err_free_fdt;
+
+ return 0;
+
+err_free_fdt:
+ folio_put(state->fdt);
+err_free_folios:
+ kvfree(folios);
+ return err;
}
static int kho_test_restore_data(const void *fdt, int node)
@@ -291,6 +304,7 @@ static void kho_test_cleanup(void)
folio_put(kho_test_state.folios[i]);
kvfree(kho_test_state.folios);
+ folio_put(kho_test_state.fdt);
}
static void __exit kho_test_exit(void)
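
The reordering in kho_test_save_data() leans on libfdt's placeholder API: fdt_property_placeholder() reserves the property's bytes inside the FDT being built and hands back a pointer into it, so the folio loop can fill folios_info in place instead of staging it in a kvmalloc'd buffer. A minimal sketch of the pattern (hypothetical names, kernel u64 type assumed):

	static int emit_records(void *fdt, unsigned int nr, const u64 *src)
	{
		u64 *slot;	/* points into the FDT buffer itself */
		int err;

		err = fdt_begin_node(fdt, "records");
		if (!err)
			err = fdt_property_placeholder(fdt, "data",
						       nr * sizeof(*slot),
						       (void **)&slot);
		if (!err)
			err = fdt_end_node(fdt);
		if (err)
			return err;

		for (unsigned int i = 0; i < nr; i++)
			slot[i] = src[i];	/* write directly into the FDT */
		return 0;
	}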
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index cb3936595b0d..1433ecc854cb 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -3562,7 +3562,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
MT_BUG_ON(mt, mas.last != 0x1500);
MT_BUG_ON(mt, !mas_is_active(&mas));
- /* find: start ->active on value */;
+ /* find: start ->active on value */
mas_set(&mas, 1200);
entry = mas_find(&mas, ULONG_MAX);
MT_BUG_ON(mt, entry != ptr);
diff --git a/lib/xarray.c b/lib/xarray.c
index ae3d80f4b4ee..9a8b4916540c 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -370,7 +370,7 @@ static void *xas_alloc(struct xa_state *xas, unsigned int shift)
if (node) {
xas->xa_alloc = NULL;
} else {
- gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_t gfp = GFP_NOWAIT;
if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
gfp |= __GFP_ACCOUNT;
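
This hunk, like the ones in mm/damon/ops-common.c and mm/filemap.c below, drops a now-redundant __GFP_NOWARN: GFP_NOWAIT is assumed to already carry it, roughly

	#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)

so atomic, failure-tolerant allocations stay quiet without callers spelling the flag out.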
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..4108bcd96784 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config PERSISTENT_HUGE_ZERO_FOLIO
+ bool "Allocate a PMD sized folio for zeroing"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Enable this option to reduce the runtime refcounting overhead
+ of the huge zero folio and expand the places in the kernel
+ that can use huge zero folios. For instance, block I/O benefits
+ from access to large folios for zeroing memory.
+
+ With this option enabled, the huge zero folio is allocated
+ once and never freed. One full huge page's worth of memory shall
+ be used.
+
+ Say Y if your system has lots of memory. Say N if you are
+ memory constrained.
+
config MM_ID
def_bool n
@@ -1381,6 +1397,8 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config FIND_NORMAL_PAGE
+ def_bool n
source "mm/damon/Kconfig"
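
For scale, the permanent cost the help text describes is a single PMD-sized allocation; a quick sketch of the arithmetic, assuming x86-64 defaults:

	/* PMD_SHIFT 21, PAGE_SHIFT 12 => one order-9 folio */
	unsigned long cost = PAGE_SIZE << (PMD_SHIFT - PAGE_SHIFT); /* 2 MiB */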
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..e4d578e6121c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work)
/*
* Initial write bandwidth: 100 MB/s
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+#define INIT_BW MB_TO_PAGES(100)
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
diff --git a/mm/cma.c b/mm/cma.c
index 2ffa4befb99a..e56ec64d0567 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -864,7 +864,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
if (!count)
return page;
- trace_cma_alloc_start(name, count, align);
+ trace_cma_alloc_start(name, count, cma->available_count, cma->count, align);
for (r = 0; r < cma->nranges; r++) {
page = NULL;
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index b3171f9406c1..8c868f7035fc 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -104,7 +104,7 @@ config DAMON_STAT
config DAMON_STAT_ENABLED_DEFAULT
bool "Enable DAMON_STAT by default"
- depends on DAMON_PADDR
+ depends on DAMON_STAT
default DAMON_STAT
help
Whether to enable DAMON_STAT by default. Users can disable it in
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 08065b363972..be5942435d78 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* @t: the given target.
* @ranges: array of new monitoring target ranges.
* @nr_ranges: length of @ranges.
+ * @min_sz_region: minimum region size.
*
* This function adds new regions to, or modifies existing regions of a
* monitoring target to fit in specific ranges.
@@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* Return: 0 on success, or a negative error code otherwise.
*/
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges)
+ unsigned int nr_ranges, unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned int i;
@@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
/* no region intersects with this range */
newr = damon_new_region(
ALIGN_DOWN(range->start,
- DAMON_MIN_REGION),
- ALIGN(range->end, DAMON_MIN_REGION));
+ min_sz_region),
+ ALIGN(range->end, min_sz_region));
if (!newr)
return -ENOMEM;
damon_insert_region(newr, damon_prev_region(r), r, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
- DAMON_MIN_REGION);
- last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+ min_sz_region);
+ last->ar.end = ALIGN(range->end, min_sz_region);
/* fill possible holes in the range */
err = damon_fill_regions_holes(first, last, t);
@@ -544,6 +545,9 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
+ ctx->addr_unit = 1;
+ ctx->min_sz_region = DAMON_MIN_REGION;
+
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -570,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
kfree(ctx);
}
+static bool damon_attrs_equals(const struct damon_attrs *attrs1,
+ const struct damon_attrs *attrs2)
+{
+ const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
+ const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
+
+ return attrs1->sample_interval == attrs2->sample_interval &&
+ attrs1->aggr_interval == attrs2->aggr_interval &&
+ attrs1->ops_update_interval == attrs2->ops_update_interval &&
+ attrs1->min_nr_regions == attrs2->min_nr_regions &&
+ attrs1->max_nr_regions == attrs2->max_nr_regions &&
+ ig1->access_bp == ig2->access_bp &&
+ ig1->aggrs == ig2->aggrs &&
+ ig1->min_sample_us == ig2->min_sample_us &&
+ ig1->max_sample_us == ig2->max_sample_us;
+}
+
static unsigned int damon_age_for_new_attrs(unsigned int age,
struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
{
@@ -1108,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
*
* If @src has no region, @dst keeps current regions.
*/
-static int damon_commit_target_regions(
- struct damon_target *dst, struct damon_target *src)
+static int damon_commit_target_regions(struct damon_target *dst,
+ struct damon_target *src, unsigned long src_min_sz_region)
{
struct damon_region *src_region;
struct damon_addr_range *ranges;
@@ -1126,18 +1147,19 @@ static int damon_commit_target_regions(
i = 0;
damon_for_each_region(src_region, src)
ranges[i++] = src_region->ar;
- err = damon_set_regions(dst, ranges, i);
+ err = damon_set_regions(dst, ranges, i, src_min_sz_region);
kfree(ranges);
return err;
}
static int damon_commit_target(
struct damon_target *dst, bool dst_has_pid,
- struct damon_target *src, bool src_has_pid)
+ struct damon_target *src, bool src_has_pid,
+ unsigned long src_min_sz_region)
{
int err;
- err = damon_commit_target_regions(dst, src);
+ err = damon_commit_target_regions(dst, src, src_min_sz_region);
if (err)
return err;
if (dst_has_pid)
@@ -1159,7 +1181,8 @@ static int damon_commit_targets(
if (src_target) {
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err)
return err;
} else {
@@ -1182,7 +1205,8 @@ static int damon_commit_targets(
if (!new_target)
return -ENOMEM;
err = damon_commit_target(new_target, false,
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err) {
damon_destroy_target(new_target, NULL);
return err;
@@ -1222,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
* 2. ops update should be done after pid handling is done (target
* committing require putting pids).
*/
- err = damon_set_attrs(dst, &src->attrs);
- if (err)
- return err;
+ if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
+ err = damon_set_attrs(dst, &src->attrs);
+ if (err)
+ return err;
+ }
dst->ops = src->ops;
+ dst->addr_unit = src->addr_unit;
+ dst->min_sz_region = src->min_sz_region;
return 0;
}
@@ -1258,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < DAMON_MIN_REGION)
- sz = DAMON_MIN_REGION;
+ if (sz < ctx->min_sz_region)
+ sz = ctx->min_sz_region;
return sz;
}
@@ -1603,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* @t: The target of the region.
* @rp: The pointer to the region.
* @s: The scheme to be applied.
+ * @min_sz_region: The minimum region size.
*
* If a quota of a scheme has exceeded in a quota charge window, the scheme's
* action would be applied to only a part of the target access pattern fulfilling
@@ -1620,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* Return: true if the region should be entirely skipped, false otherwise.
*/
static bool damos_skip_charged_region(struct damon_target *t,
- struct damon_region **rp, struct damos *s)
+ struct damon_region **rp, struct damos *s, unsigned long min_sz_region)
{
struct damon_region *r = *rp;
struct damos_quota *quota = &s->quota;
@@ -1642,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t,
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
+ r->ar.start, min_sz_region);
if (!sz_to_skip) {
- if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ if (damon_sz_region(r) <= min_sz_region)
return true;
- sz_to_skip = DAMON_MIN_REGION;
+ sz_to_skip = min_sz_region;
}
damon_split_region_at(t, r, sz_to_skip);
r = damon_next_region(r);
@@ -1671,7 +1700,8 @@ static void damos_update_stat(struct damos *s,
}
static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos_filter *filter)
+ struct damon_region *r, struct damos_filter *filter,
+ unsigned long min_sz_region)
{
bool matched = false;
struct damon_target *ti;
@@ -1688,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
matched = target_idx == filter->target_idx;
break;
case DAMOS_FILTER_TYPE_ADDR:
- start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
- end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+ start = ALIGN_DOWN(filter->addr_range.start, min_sz_region);
+ end = ALIGN_DOWN(filter->addr_range.end, min_sz_region);
/* inside the range */
if (start <= r->ar.start && r->ar.end <= end) {
@@ -1725,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter)) {
+ if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
if (filter->allow)
s->core_filters_allowed = true;
return !filter->allow;
@@ -1860,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
+ c->min_sz_region);
if (!sz)
goto update_stat;
damon_split_region_at(t, r, sz);
@@ -1908,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- if (damos_skip_charged_region(t, &r, s))
+ if (damos_skip_charged_region(t, &r, s, c->min_sz_region))
continue;
if (!damos_valid_target(c, t, r, s))
@@ -2306,7 +2336,8 @@ static void damon_split_region_at(struct damon_target *t,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs,
+ unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2316,13 +2347,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
- sz_region > 2 * DAMON_MIN_REGION; i++) {
+ sz_region > 2 * min_sz_region; i++) {
/*
* Randomly select size of left sub-region to be at
* least 10% and at most 90% of the original region
*/
sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
- sz_region / 10, DAMON_MIN_REGION);
+ sz_region / 10, min_sz_region);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
@@ -2362,7 +2393,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions, ctx->min_sz_region);
last_nr_regions = nr_regions;
}
@@ -2755,7 +2786,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1);
+ return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
}
/*
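
Threading min_sz_region through these helpers turns the compile-time DAMON_MIN_REGION granularity into a per-context one, so contexts that scale addresses (see the addr_unit changes in mm/damon/paddr.c below) can keep regions aligned in core-address space. A worked sketch of the snapping behaviour, assuming the usual ALIGN()/ALIGN_DOWN() semantics:

	unsigned long min_sz_region = 4096;
	unsigned long start = ALIGN_DOWN(10000, min_sz_region);	/* 8192  */
	unsigned long end   = ALIGN(15000, min_sz_region);	/* 16384 */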
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 99321ff5cb92..998c5180a603 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list(
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
};
@@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
return nr_migrated;
}
+
+bool damos_ops_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 61ad54aaf256..5efa5b5970de 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
+
+bool damos_ops_has_filter(struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 53a55c5114fb..07a8aead439e 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -18,7 +18,26 @@
#include "../internal.h"
#include "ops-common.h"
-static void damon_pa_mkold(unsigned long paddr)
+static phys_addr_t damon_pa_phys_addr(
+ unsigned long addr, unsigned long addr_unit)
+{
+ return (phys_addr_t)addr * addr_unit;
+}
+
+static unsigned long damon_pa_core_addr(
+ phys_addr_t pa, unsigned long addr_unit)
+{
+ /*
+ * Use div_u64() to avoid linking errors related to __udivdi3,
+ * __aeabi_uldivmod, and the like on 32-bit. It should also perform
+ * better there (see the div_u64() comment for details).
+ */
+ if (sizeof(pa) == 8 && sizeof(addr_unit) == 4)
+ return div_u64(pa, addr_unit);
+ return pa / addr_unit;
+}
+
+static void damon_pa_mkold(phys_addr_t paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -29,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr)
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r,
+ unsigned long addr_unit)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
- damon_pa_mkold(r->sampling_addr);
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -43,11 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r);
+ __damon_pa_prepare_access_check(r, ctx->addr_unit);
}
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
bool accessed;
@@ -62,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
}
static void __damon_pa_check_access(struct damon_region *r,
- struct damon_attrs *attrs)
+ struct damon_attrs *attrs, unsigned long addr_unit)
{
- static unsigned long last_addr;
+ static phys_addr_t last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
+ phys_addr_t sampling_addr = damon_pa_phys_addr(
+ r->sampling_addr, addr_unit);
/* If the region is in the last checked page, reuse the result */
if (ALIGN_DOWN(last_addr, last_folio_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
+ ALIGN_DOWN(sampling_addr, last_folio_sz)) {
damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
+ last_accessed = damon_pa_young(sampling_addr, &last_folio_sz);
damon_update_region_access_rate(r, last_accessed, attrs);
- last_addr = r->sampling_addr;
+ last_addr = sampling_addr;
}
static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
@@ -89,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(r, &ctx->attrs);
+ __damon_pa_check_access(
+ r, &ctx->attrs, ctx->addr_unit);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -125,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_pageout(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
@@ -149,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
damos_add_filter(s, filter);
}
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -160,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -179,18 +203,19 @@ put_folio:
applied = reclaim_pages(&folio_list);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed,
+ struct damon_region *r, unsigned long addr_unit,
+ struct damos *s, bool mark_accessed,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied = 0;
+ phys_addr_t addr, applied = 0;
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -200,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (mark_accessed)
folio_mark_accessed(folio);
@@ -212,32 +237,35 @@ put_folio:
folio_put(folio);
}
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true,
sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false,
sz_filter_passed);
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_migrate(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -247,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (!folio_isolate_lru(folio))
goto put_folio;
@@ -259,29 +287,21 @@ put_folio:
applied = damon_migrate_pages(&folio_list, s->target_nid);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static bool damon_pa_scheme_has_filter(struct damos *s)
-{
- struct damos_filter *f;
-
- damos_for_each_ops_filter(f, s)
- return true;
- return false;
-}
-
-static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_stat(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr;
+ phys_addr_t addr;
struct folio *folio;
- if (!damon_pa_scheme_has_filter(s))
+ if (!damos_ops_has_filter(s))
return 0;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -289,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
}
if (!damos_pa_filter_out(s, folio))
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
addr += folio_size(folio);
folio_put(folio);
}
@@ -301,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
{
+ unsigned long aunit = ctx->addr_unit;
+
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme, sz_filter_passed);
+ return damon_pa_pageout(r, aunit, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
+ return damon_pa_mark_accessed(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
+ return damon_pa_deactivate_pages(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme, sz_filter_passed);
+ return damon_pa_migrate(r, aunit, scheme, sz_filter_passed);
case DAMOS_STAT:
- return damon_pa_stat(r, scheme, sz_filter_passed);
+ return damon_pa_stat(r, aunit, scheme, sz_filter_passed);
default:
/* DAMOS actions that are not yet supported by 'paddr'. */
break;
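
addr_unit lets a 32-bit core address space name a larger physical one: core addresses are multiplied by addr_unit on the way to folios, and byte counts are divided by it on the way back. A worked sketch, assuming addr_unit = 16 on a 32-bit kernel:

	unsigned long addr_unit = 16;
	unsigned long core = 0x10000000UL;
	phys_addr_t pa = (phys_addr_t)core * addr_unit; /* 0x100000000, 4 GiB */
	unsigned long back = div_u64(pa, addr_unit);	/* 0x10000000 again */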
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index c96c2154128f..fe4e73d0ebbb 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
struct damon_sysfs_context {
struct kobject kobj;
enum damon_ops_id ops_id;
+ unsigned long addr_unit;
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
@@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
return NULL;
context->kobj = (struct kobject){};
context->ops_id = ops_id;
+ context->addr_unit = 1;
return context;
}
@@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t addr_unit_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%lu\n", context->addr_unit);
+}
+
+static ssize_t addr_unit_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ unsigned long input_addr_unit;
+ int err = kstrtoul(buf, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ context->addr_unit = input_addr_unit;
+ return count;
+}
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
static struct kobj_attribute damon_sysfs_context_operations_attr =
__ATTR_RW_MODE(operations, 0600);
+static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
+ __ATTR_RW_MODE(addr_unit, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
+ &damon_sysfs_context_addr_unit_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1301,7 +1333,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
}
static int damon_sysfs_set_regions(struct damon_target *t,
- struct damon_sysfs_regions *sysfs_regions)
+ struct damon_sysfs_regions *sysfs_regions,
+ unsigned long min_sz_region)
{
struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
@@ -1323,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t,
if (ranges[i - 1].end > ranges[i].start)
goto out;
}
- err = damon_set_regions(t, ranges, sysfs_regions->nr);
+ err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region);
out:
kfree(ranges);
return err;
@@ -1344,7 +1377,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
/* caller will destroy targets */
return -EINVAL;
}
- return damon_sysfs_set_regions(t, sys_target->regions);
+ return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1401,6 +1434,8 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
err = damon_select_ops(ctx, sys_ctx->ops_id);
if (err)
return err;
+ ctx->addr_unit = sys_ctx->addr_unit;
+ ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1);
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index dfedfff19940..51369e35298b 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(t, 2);
+ damon_split_regions_of(t, 2, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(t, 4);
+ damon_split_regions_of(t, 4, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test)
damon_add_region(r1, t);
damon_add_region(r2, t);
- damon_set_regions(t, &range, 1);
+ damon_set_regions(t, &range, 1, DAMON_MIN_REGION);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
damon_for_each_region(r, t) {
@@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test)
damos_destroy_filter(filter);
}
+static void damos_test_commit_filter(struct kunit *test)
+{
+ struct damos_filter *src_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ANON, true, true);
+ struct damos_filter *dst_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ACTIVE, false, false);
+
+ damos_commit_filter(dst_filter, src_filter);
+ KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
+ KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
+ KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+
+ damos_destroy_filter(src_filter);
+ damos_destroy_filter(dst_filter);
+}
+
static void damos_test_filter_out(struct kunit *test)
{
struct damon_target *t;
@@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -594,6 +615,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_attrs),
KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_commit_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
KUNIT_CASE(damon_test_set_filters_default_reject),
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index d2b37ccf2cc0..fce38dd53cf8 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 87e825349bdf..8c048f9b129e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
}
}
@@ -890,6 +890,107 @@ free_lists:
return applied * PAGE_SIZE;
}
+struct damos_va_stat_private {
+ struct damos *scheme;
+ unsigned long *sz_filter_passed;
+};
+
+static inline bool damos_va_invalid_folio(struct folio *folio,
+ struct damos *s)
+{
+ return !folio || folio == s->last_applied;
+}
+
+static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_stat_private *priv = walk->private;
+ struct damos *s = priv->scheme;
+ unsigned long *sz_filter_passed = priv->sz_filter_passed;
+ struct vm_area_struct *vma = walk->vma;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t *start_pte, *pte, ptent;
+ int nr;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t pmde;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+ pmde = pmdp_get(pmd);
+ if (!pmd_present(pmde))
+ goto huge_unlock;
+
+ folio = vm_normal_folio_pmd(vma, addr, pmde);
+
+ if (damos_va_invalid_folio(folio, s))
+ goto huge_unlock;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd))
+ *sz_filter_passed += folio_size(folio);
+ s->last_applied = folio;
+
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+#endif
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
+
+ for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent) || !pte_present(ptent))
+ continue;
+
+ folio = vm_normal_folio(vma, addr, ptent);
+
+ if (damos_va_invalid_folio(folio, s))
+ continue;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL))
+ *sz_filter_passed += folio_size(folio);
+ nr = folio_nr_pages(folio);
+ s->last_applied = folio;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ return 0;
+}
+
+static unsigned long damos_va_stat(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ struct damos_va_stat_private priv;
+ struct mm_struct *mm;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_stat_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ priv.scheme = s;
+ priv.sz_filter_passed = sz_filter_passed;
+
+ if (!damos_ops_has_filter(s))
+ return 0;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ return 0;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
+}
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
@@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
- return 0;
+ return damos_va_stat(t, r, scheme, sz_filter_passed);
default:
/*
* DAMOS actions that are not yet supported by 'vaddr'.
diff --git a/mm/debug.c b/mm/debug.c
index b4388f4dcd4d..64ddb0c4b4be 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx\n"
+ "binfmt %px flags %*pb\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags,
+ mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm),
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 751838ef05e5..cd9387b0a5b5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
+ if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES, -nr);
/*
* At this point folio must be either written or cleaned by
@@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
{
void *shadow = NULL;
int ret;
+ struct mem_cgroup *tmp;
+ bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
+ if (kernel_file)
+ tmp = set_active_memcg(root_mem_cgroup);
ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (kernel_file)
+ set_active_memcg(tmp);
if (ret)
return ret;
@@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
if (!(gfp & __GFP_WRITE) && shadow)
workingset_refault(folio, shadow);
folio_add_lru(folio);
+ if (kernel_file)
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES,
+ folio_nr_pages(folio));
}
return ret;
}
@@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->folio->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags.f))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->folio->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1226,9 +1239,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &folio->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags.f))
return false;
- } else if (test_bit(bit_nr, &folio->flags))
+ } else if (test_bit(bit_nr, &folio->flags.f))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1961,7 +1974,7 @@ no_page:
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
- gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ gfp |= GFP_NOWAIT;
}
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -2447,6 +2460,9 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
+ if (pos == 0 && count >= folio_size(folio))
+ return false;
+
return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
@@ -2619,9 +2635,10 @@ retry:
goto err;
}
if (!folio_test_uptodate(folio)) {
- if ((iocb->ki_flags & IOCB_WAITQ) &&
- folio_batch_count(fbatch) > 1)
- iocb->ki_flags |= IOCB_NOWAIT;
+ if (folio_batch_count(fbatch) > 1) {
+ err = -EAGAIN;
+ goto err;
+ }
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
@@ -3323,9 +3340,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- mmap_miss = READ_ONCE(ra->mmap_miss);
- if (mmap_miss)
- WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ /*
+ * If the folio is locked, we're likely racing against another fault.
+ * Don't touch the mmap_miss counter to avoid decreasing it multiple
+ * times for a single folio and breaking the balance with the mmap_miss
+ * increase in do_sync_mmap_readahead().
+ */
+ if (likely(!folio_test_locked(folio))) {
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ }
if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
diff --git a/mm/gup.c b/mm/gup.c
index 0bc4d140fc07..ed02d65f9c72 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
return -EREMOTEIO;
if (flags & FOLL_GET)
@@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios);
* lifecycle. Avoid setting the bit unless necessary, or it might cause write
* cache bouncing on large SMP machines for concurrent pinned gups.
*/
-static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
{
- if (!test_bit(MMF_HAS_PINNED, mm_flags))
- set_bit(MMF_HAS_PINNED, mm_flags);
+ if (!mm_flags_test(MMF_HAS_PINNED, mm))
+ mm_flags_set(MMF_HAS_PINNED, mm);
}
#ifdef CONFIG_MMU
@@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
mmap_assert_locked(mm);
if (flags & FOLL_PIN)
- mm_set_has_pinned_flag(&mm->flags);
+ mm_set_has_pinned_flag(mm);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -3218,7 +3218,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EINVAL;
if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(current->mm);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..26cedfcd7418 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- bool smaps = tva_flags & TVA_SMAPS;
- bool in_pf = tva_flags & TVA_IN_PF;
- bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ const bool smaps = type == TVA_SMAPS;
+ const bool in_pf = type == TVA_PAGEFAULT;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!in_pf && shmem_file(vma->vm_file))
return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
- !enforce_sysfs);
+ forced_collapse);
if (!vma_is_anonymous(vma)) {
/*
- * Enforce sysfs THP requirements as necessary. Anonymous vmas
+ * Enforce THP collapse requirements as necessary. Anonymous vmas
* were already handled in thp_vma_allowable_orders().
*/
- if (enforce_sysfs &&
+ if (!forced_collapse &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always())))
return 0;
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -237,7 +237,7 @@ retry:
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -248,33 +248,39 @@ static void put_huge_zero_page(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating persistent huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
+
return 0;
}
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
@@ -911,7 +931,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < MB_TO_PAGES(512)) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
+ if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
return ret + size;
ret += off_sub;
@@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
{
pmd_t entry;
entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
+ entry = pmd_mkspecial(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1379,15 +1400,25 @@ struct folio_or_pfn {
bool is_folio;
};
-static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
- bool write, pgtable_t pgtable)
+ bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
pmd_t entry;
- lockdep_assert_held(pmd_lockptr(mm, pmd));
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+ ptl = pmd_lock(mm, pmd);
if (!pmd_none(*pmd)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
@@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- return -EEXIST;
+ goto out_unlock;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-
- return -EEXIST;
+ goto out_unlock;
}
if (fop.is_folio) {
entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
- folio_get(fop.folio);
- folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
- add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ if (is_huge_zero_folio(fop.folio)) {
+ entry = pmd_mkspecial(entry);
+ } else {
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ }
} else {
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
@@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
- return 0;
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+ return VM_FAULT_NOPAGE;
}
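insert_pmd() now owns the bounds check, locking and pgtable deposit itself. It relies on a single-exit idiom: NULL the pointer once ownership is transferred, so the common unlock path frees only an unconsumed table. The idiom in isolation (a sketch, not the kernel function):

	static vm_fault_t demo_deposit(struct mm_struct *mm, pmd_t *pmd,
				       bool entry_already_present)
	{
		pgtable_t pgtable = NULL;
		spinlock_t *ptl;

		if (arch_needs_pgtable_deposit()) {
			pgtable = pte_alloc_one(mm);
			if (!pgtable)
				return VM_FAULT_OOM;
		}
		ptl = pmd_lock(mm, pmd);
		if (!entry_already_present && pgtable) {
			pgtable_trans_huge_deposit(mm, pmd, pgtable);
			pgtable = NULL;			/* ownership transferred */
		}
		spin_unlock(ptl);
		if (pgtable)				/* never consumed: free it */
			pte_free(mm, pgtable);
		return VM_FAULT_NOPAGE;
	}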
/**
@@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- pgtable_t pgtable = NULL;
- spinlock_t *ptl;
- int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
- pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(vma->vm_mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
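A hedged usage sketch: a driver's huge_fault handler resolves a PMD-aligned pfn and hands the rest (bounds check, locking, deposit) to vmf_insert_pfn_pmd(); demo_base_pfn() is a hypothetical helper, not a real API.

	static vm_fault_t demo_huge_fault(struct vm_fault *vmf, unsigned int order)
	{
		unsigned long pfn;

		if (order != PMD_ORDER)
			return VM_FAULT_FALLBACK;

		pfn = demo_base_pfn(vmf->vma) + vmf->pgoff;	/* hypothetical helper */
		return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
	}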
@@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
- pgtable_t pgtable = NULL;
- int error;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
return VM_FAULT_SIGBUS;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
- write, pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
@@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pud_t entry;
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
- return;
+ goto out_unlock;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- return;
+ goto out_unlock;
}
if (fop.is_folio) {
@@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+out_unlock:
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PUD_MASK;
- pud_t *pud = vmf->pud;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
return VM_FAULT_SIGBUS;
- ptl = pud_lock(mm, pud);
- insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
+ !is_huge_zero_pmd(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* unreferenced sub-pages of an anonymous THP: we can simply drop
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
- new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- new_folio->flags |= (folio->flags &
+ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags.f |= (folio->flags.f &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
@@ -4327,8 +4319,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto out;
}
- pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
- pid, vaddr_start, vaddr_end);
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
mmap_read_lock(mm);
/*
@@ -4438,8 +4430,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (IS_ERR(candidate))
goto out;
- pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
- file_path, off_start, off_end);
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ file_path, off_start, off_end, new_order, in_folio_offset);
mapping = candidate->f_mapping;
min_order = mapping_min_folio_order(mapping);
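The ->flags.f spellings in __split_folio_to_order() above (and in memory-failure.c and mmzone.c below) follow from page/folio flags being wrapped in a typed struct; assuming a wrapper of roughly this shape, the generated code is unchanged:

	/* Presumed wrapper; the raw flag bits are now reached via .f */
	typedef struct {
		unsigned long f;
	} memdesc_flags_t;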
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eed59cfb5d21..1e777cc51ad0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3654,6 +3654,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
+ if (!h->max_huge_pages)
+ return;
+
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -6932,6 +6935,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
+ pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE);
+ if (actual_pte) {
+ ret = -EEXIST;
+ goto out;
+ }
ret = -ENOMEM;
goto out;
}
diff --git a/mm/internal.h b/mm/internal.h
index 45b725c3dc03..45da9ff5694f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1333,11 +1333,6 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
-static inline bool is_migrate_highatomic(enum migratetype migratetype)
-{
- return migratetype == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 8fce3370c84e..f084e7a5df1e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -266,11 +266,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- p4d_t *p;
if (slab_is_available()) {
- p = p4d_alloc(&init_mm, pgd, addr);
- if (!p)
+ if (!p4d_alloc(&init_mm, pgd, addr))
return -ENOMEM;
} else {
pgd_populate_kernel(addr, pgd,
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index f4b17984b627..4cf2b5f8d6c1 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1073,6 +1073,45 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
+/*
+ * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when
+ * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address.
+ * Without this, KASAN builds would be unable to trigger bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users handling recycled objects improperly.
+ */
+static void kmem_cache_rcu_reuse(struct kunit *test)
+{
+ char *p, *p2;
+ struct kmem_cache *cache;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG);
+
+ cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ migrate_disable();
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+
+ kmem_cache_free(cache, p);
+ p2 = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p2) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+ KUNIT_EXPECT_PTR_EQ(test, p, p2);
+
+ kmem_cache_free(cache, p2);
+
+out:
+ migrate_enable();
+ kmem_cache_destroy(cache);
+}
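The reuse guarantee being tested matters to lockless readers: with SLAB_TYPESAFE_BY_RCU an object may be recycled immediately after free, so a reader must revalidate the object's identity after taking a reference. A sketch of that pattern (all demo_* names are hypothetical):

	struct demo_obj {
		refcount_t ref;
		unsigned long key;
	};

	static struct kmem_cache *demo_cache;	/* created with SLAB_TYPESAFE_BY_RCU */

	static void demo_put(struct demo_obj *obj)
	{
		if (refcount_dec_and_test(&obj->ref))
			kmem_cache_free(demo_cache, obj);
	}

	static struct demo_obj *demo_lookup(struct demo_obj *candidate,
					    unsigned long key)
	{
		rcu_read_lock();
		if (!refcount_inc_not_zero(&candidate->ref)) {
			rcu_read_unlock();
			return NULL;		/* freed and not yet recycled */
		}
		rcu_read_unlock();
		if (READ_ONCE(candidate->key) != key) {
			demo_put(candidate);	/* recycled for another identity */
			return NULL;
		}
		return candidate;
	}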
+
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
@@ -2106,6 +2145,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
KUNIT_CASE(kmem_cache_rcu_uaf),
+ KUNIT_CASE(kmem_cache_rcu_reuse),
KUNIT_CASE(kmem_cache_double_destroy),
KUNIT_CASE(kmem_cache_accounted),
KUNIT_CASE(kmem_cache_bulk),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b486c1d19b2d..4ec324a4c1fe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
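The mm_flags_*() helpers used throughout this series replace open-coded bitops on mm->flags; the real definitions live elsewhere in the tree, but they presumably reduce to something like:

	/* Assumed shape; the mm now carries a wrapped flags bitmap. */
	static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
	{
		return test_bit(flag, mm->flags.__mm_flags);
	}

	static inline void mm_flags_set(int flag, struct mm_struct *mm)
	{
		set_bit(flag, mm->flags.__mm_flags);
	}

	static inline void mm_flags_clear(int flag, struct mm_struct *mm)
	{
		clear_bit(flag, mm->flags.__mm_flags);
	}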
static bool hugepage_pmd_enabled(void)
@@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm)
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
mm_slot = mm_slot_alloc(mm_slot_cache);
@@ -472,10 +472,9 @@ void __khugepaged_enter(struct mm_struct *mm)
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -497,7 +496,7 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
@@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
@@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2402,7 +2402,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mm_slot = khugepaged_scan.mm_slot;
slot = &mm_slot->slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
@@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2516,9 +2515,8 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ slot = list_next_entry(slot, mm_node);
khugepaged_scan.mm_slot =
mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
@@ -2767,7 +2765,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..2ef29802a49b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1217,8 +1217,8 @@ mm_exiting:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -2620,8 +2620,8 @@ no_vmas:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
__ksm_should_add_vma(file, vm_flags))
vm_flags |= VM_MERGEABLE;
@@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm)
{
int err;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
}
- set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_set(MMF_VM_MERGE_ANY, mm);
ksm_add_vmas(mm);
return 0;
@@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
{
int err;
- if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
err = ksm_del_vmas(mm);
@@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
return err;
}
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
return 0;
}
@@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
return 0;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return ksm_disable_merge_any(mm);
return ksm_del_vmas(mm);
}
@@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (!vma_ksm_compatible(vma))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
@@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm)
list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_set(MMF_VM_MERGEABLE, mm);
mmgrab(mm);
if (needs_wakeup)
@@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm)
if (easy_to_free) {
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..9712a751690f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* try_charge() (context permitting), as well as from the userland
* return path where reclaim is always able to block.
*/
-void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
@@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
struct mem_cgroup *memcg;
bool in_retry = false;
- if (likely(!nr_pages))
- return;
-
memcg = get_mem_cgroup_from_mm(current->mm);
current->memcg_nr_pages_over_high = 0;
@@ -2486,7 +2483,7 @@ done_restock:
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
gfpflags_allow_blocking(gfp_mask))
- mem_cgroup_handle_over_high(gfp_mask);
+ __mem_cgroup_handle_over_high(gfp_mask);
return 0;
}
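With the !nr_pages fast path dropped from the renamed __mem_cgroup_handle_over_high(), outside callers presumably reach it through an inline wrapper that keeps the cheap check; a sketch, assuming roughly:

	static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
	{
		if (likely(!current->memcg_nr_pages_over_high))
			return;
		__mem_cgroup_handle_over_high(gfp_mask);
	}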
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df6ee59527dd..b93ab99ad3ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1708,10 +1708,10 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* carried out only if the first check can't determine the page status.
*/
for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
+ if ((p->flags.f & ps->mask) == ps->res)
break;
- page_flags |= (p->flags & (1UL << PG_dirty));
+ page_flags |= (p->flags.f & (1UL << PG_dirty));
if (!ps->mask)
for (ps = error_states;; ps++)
@@ -2137,7 +2137,7 @@ retry:
return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
folio_unlock(folio);
@@ -2397,7 +2397,7 @@ try_again:
* folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
* status correctly, we save a copy of the page flags at this time.
*/
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
/*
* __munlock_folio() may clear a writeback folio's LRU flag without
@@ -2742,13 +2742,13 @@ static int soft_offline_in_use_page(struct page *page)
putback_movable_pages(&pagelist);
pr_info("%#lx: %s migration failed %ld, type %pGp\n",
- pfn, msg_page[huge], ret, &page->flags);
+ pfn, msg_page[huge], ret, &page->flags.f);
if (ret > 0)
ret = -EBUSY;
}
} else {
pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
- pfn, msg_page[huge], page_count(page), &page->flags);
+ pfn, msg_page[huge], page_count(page), &page->flags.f);
ret = -EBUSY;
}
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..d9de6c056179 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
add_mm_counter(mm, i, rss[i]);
}
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
{
- pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- struct address_space *mapping;
- pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
@@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
- return;
+ return true;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
+ return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long long pgdv, p4dv, pudv, pmdv;
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pgd_t *pgdp;
+
+ /*
+ * Although this looks like a fully lockless pgtable walk, it is not:
+ * see locking requirements for print_bad_page_map().
+ */
+ pgdp = pgd_offset(mm, addr);
+ pgdv = pgd_val(*pgdp);
+
+ if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+ pr_alert("pgd:%08llx\n", pgdv);
+ return;
+ }
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ p4dv = p4d_val(p4d);
+
+ if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+ pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+ return;
+ }
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pudv = pud_val(pud);
+
+ if (!pud_present(pud) || pud_leaf(pud)) {
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+ return;
+ }
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pmdv = pmd_val(pmd);
+
+ /*
+ * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+ * because the table should already be mapped by the caller and
+ * doing another map would be bad. print_bad_page_map() should
+ * already take care of printing the PTE.
+ */
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+ p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long long entry, struct page *page,
+ enum pgtable_level level)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+
+ if (is_bad_page_map_ratelimited())
+ return;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+ pgtable_level_to_str(level), entry);
+ __print_bad_page_map_pgtable(vma->vm_mm, addr);
if (page)
- dump_page(page, "bad pte");
+ dump_page(page, "bad page map");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
@@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+#define print_bad_pte(vma, addr, pte, page) \
+ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
*
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
+ *
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
*
* The disadvantage is that pages are refcounted (which can be slower and
* simply not an option for some PFNMAP users). The advantage is that we
* don't have to follow the strict linearity rule of PFNMAP mappings in
* order to support COWable mappings.
*
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, bool special,
+ unsigned long long entry, enum pgtable_level level)
{
- unsigned long pfn = pte_pfn(pte);
-
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
- if (likely(!pte_special(pte)))
- goto check_pfn;
- if (vma->vm_ops && vma->vm_ops->find_special_page)
- return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
-
- print_bad_pte(vma, addr, pte, NULL);
- return NULL;
- }
-
- /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- if (is_zero_pfn(pfn))
+ if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ if (vma->vm_ops && vma->vm_ops->find_normal_page)
+ return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
return NULL;
+
+ print_bad_page_map(vma, addr, entry, NULL, level);
+ return NULL;
}
- }
+ /*
+ * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+ * mappings (incl. shared zero folios) are marked accordingly.
+ */
+ } else {
+ if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* If it has a "struct page", it's "normal". */
+ if (!pfn_valid(pfn))
+ return NULL;
+ } else {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (is_zero_pfn(pfn))
- return NULL;
+ /* Only CoW'ed anon folios are "normal". */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+ }
-check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
- print_bad_pte(vma, addr, pte, NULL);
+ /* Corrupted page table entry. */
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
}
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
+ * For example, VDSO mappings can cause them to exist.
*/
-out:
- VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+ pte_val(pte), PGTABLE_LEVEL_PTE);
+}
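A usage sketch of the wrapper: a page table walker, holding the PTE lock, uses the NULL return to skip "special" mappings:

	static void demo_handle_pte(struct vm_area_struct *vma, unsigned long addr,
				    pte_t *ptep)
	{
		pte_t pte = ptep_get(ptep);
		struct page *page;

		if (!pte_present(pte))
			return;
		page = vm_normal_page(vma, addr, pte);
		if (!page)		/* special: zero folio, raw PFN mapping, ... */
			return;
		/* page is a normal, refcounted page; safe to inspect here */
	}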
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
@@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
- unsigned long pfn = pmd_pfn(pmd);
-
- /* Currently it's only used for huge pfnmaps */
- if (unlikely(pmd_special(pmd)))
- return NULL;
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
- }
- }
-
- if (is_huge_zero_pfn(pfn))
- return NULL;
- if (unlikely(pfn > highest_memmap_pfn))
- return NULL;
-
- /*
- * NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
- */
-out:
- return pfn_to_page(pfn);
+ return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+ pmd_val(pmd), PGTABLE_LEVEL_PMD);
}
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd)
{
@@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
return page_folio(page);
return NULL;
}
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+ pud_val(pud), PGTABLE_LEVEL_PUD);
+}
#endif
/**
@@ -4387,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
vmf->address, orders);
@@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP,
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse= */ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -6126,8 +6256,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6161,8 +6290,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..8e435a078fc3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page)
* src and dst are also released by migration core. These pages will not be
* folios in the future, so that must be reworked.
*
- * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error
- * code.
+ * Returns 0 on success, otherwise a negative error code.
*/
static int migrate_movable_ops_page(struct page *dst, struct page *src,
enum migrate_mode mode)
{
- int rc = MIGRATEPAGE_SUCCESS;
+ int rc;
VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
rc = page_movable_ops(src)->migrate_page(dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
ClearPageMovableOpsIsolated(src);
return rc;
}
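Under the new convention a movable_ops backend returns 0 rather than MIGRATEPAGE_SUCCESS; a minimal backend sketch (the demo_* helpers are hypothetical):

	static int demo_migrate_page(struct page *dst, struct page *src,
				     enum migrate_mode mode)
	{
		if (mode == MIGRATE_ASYNC && demo_would_block(src))
			return -EAGAIN;		/* hypothetical contention check */
		demo_copy_state(dst, src);	/* hypothetical state transfer */
		return 0;			/* success; core clears the isolated flag */
	}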
@@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
oldzone = folio_zone(folio);
@@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
local_irq_enable();
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
int folio_migrate_mapping(struct address_space *mapping,
@@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_unlock_irq(&xas);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/*
@@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/**
@@ -967,7 +966,7 @@ recheck_buffers:
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
goto unlock_buffers;
bh = head;
@@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
*
* Return value:
* < 0 - error code
- * MIGRATEPAGE_SUCCESS - success
+ * 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
@@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
else
rc = fallback_migrate_folio(mapping, dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
/*
* For pagecache folios, src->mapping must be cleared before src
* is freed. Anonymous folios must stay anonymous until freed.
@@ -1189,7 +1188,7 @@ static void migrate_folio_done(struct folio *src,
static int migrate_folio_unmap(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+ struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
@@ -1198,16 +1197,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
bool locked = false;
bool dst_locked = false;
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- list_del(&src->lru);
- migrate_folio_done(src, reason);
- return MIGRATEPAGE_SUCCESS;
- }
-
dst = get_new_folio(src, private);
if (!dst)
return -ENOMEM;
@@ -1297,7 +1286,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
/*
@@ -1327,7 +1316,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
out:
@@ -1459,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
folio_putback_hugetlb(src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
dst = get_new_folio(src, private);
@@ -1522,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1532,7 +1520,7 @@ put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
@@ -1540,7 +1528,7 @@ put_anon:
out_unlock:
folio_unlock(src);
out:
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
@@ -1650,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
reason, ret_folios);
/*
* The rules are:
- * Success: hugetlb folio will be put back
+ * 0: hugetlb folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1667,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
retry++;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
break;
default:
@@ -1721,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios,
reason, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
+ * 0: folio will be freed
* -EAGAIN: stay on the unmap_folios list
* Other errno: put on ret_folios list
*/
@@ -1731,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios,
*thp_retry += is_thp;
*nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
stats->nr_thp_succeeded += is_thp;
break;
@@ -1870,14 +1858,27 @@ static int migrate_pages_batch(struct list_head *from,
continue;
}
+ /*
+ * If we are holding the last folio reference, the folio
+ * was freed from under us, so just drop our reference.
+ */
+ if (likely(!page_has_movable_ops(&folio->page)) &&
+ folio_ref_count(folio) == 1) {
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ list_del(&folio->lru);
+ migrate_folio_done(folio, reason);
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ continue;
+ }
+
rc = migrate_folio_unmap(get_new_folio, put_new_folio,
- private, folio, &dst, mode, reason,
- ret_folios);
+ private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
- * Unmap: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1927,11 +1928,7 @@ static int migrate_pages_batch(struct list_head *from,
thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- case MIGRATEPAGE_UNMAP:
+ case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..abd9f6850db6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
+ if (r)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
diff --git a/mm/mincore.c b/mm/mincore.c
index 10dabefc3acc..2f3e1816a30d 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -47,6 +47,48 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0;
}
+static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
+{
+ struct swap_info_struct *si;
+ struct folio *folio = NULL;
+ unsigned char present = 0;
+
+ if (!IS_ENABLED(CONFIG_SWAP)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+	 * A shmem mapping may contain swapin error entries, which are
+	 * never present. A page table may contain migration or hwpoison
+	 * entries, which are always uptodate.
+ */
+ if (non_swap_entry(entry))
+ return !shmem;
+
+ /*
+	 * Shmem mapping lookup is lockless, so we need to grab the swap
+	 * device. The mincore page table walk holds the PTL, under which
+	 * the swap device is stable, so avoid touching the si there for
+	 * better performance.
+ */
+ if (shmem) {
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+ }
+ folio = filemap_get_entry(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (shmem)
+ put_swap_device(si);
+ /* The swap cache space contains either folio, shadow or NULL */
+ if (folio && !xa_is_value(folio)) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
+ }
+
+ return present;
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -64,8 +106,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- folio = filemap_get_incore_folio(mapping, index);
- if (!IS_ERR(folio)) {
+ folio = filemap_get_entry(mapping, index);
+ if (folio) {
+ if (xa_is_value(folio)) {
+ if (shmem_mapping(mapping))
+ return mincore_swap(radix_to_swp_entry(folio),
+ true);
+ else
+ return 0;
+ }
present = folio_test_uptodate(folio);
folio_put(folio);
}
@@ -143,23 +192,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (i = 0; i < step; i++)
vec[i] = 1;
} else { /* pte is a swap entry */
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (non_swap_entry(entry)) {
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = 1;
- } else {
-#ifdef CONFIG_SWAP
- *vec = mincore_page(swap_address_space(entry),
- swap_cache_index(entry));
-#else
- WARN_ON(1);
- *vec = 1;
-#endif
- }
+ *vec = mincore_swap(pte_to_swp_entry(pte), false);
}
vec += step;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 7306253cc3b5..7a057e0e8da9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
+ if (mm_flags_test(MMF_TOPDOWN, mm))
return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
flags, vm_flags);
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm)
* Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
* because the memory has been already freed.
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
@@ -1859,14 +1859,14 @@ loop_out:
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
/* Avoid OOM iterating a broken tree */
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
}
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
* not fully initialised.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
}
out:
mmap_write_unlock(mm);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index b006cec8e6fe..0a0db5849b8e 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -128,6 +128,95 @@ void vma_mark_detached(struct vm_area_struct *vma)
}
/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield a
+ * false locked result to avoid performance overhead, in which case we fall
+ * back to using mmap_lock. The function should never yield a false unlocked
+ * result. A false locked result is possible if mm_lock_seq overflows or if
+ * the vma gets reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
+ * if the vma got detached.
+ *
+ * IMPORTANT: RCU lock must be held upon entering the function, but upon error
+ * IT IS RELEASED. The caller must handle this correctly.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ struct mm_struct *other_mm;
+ int oldcnt;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
+ vma = NULL;
+ goto err;
+ }
+
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ goto err;
+ }
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ if (unlikely(vma->vm_mm != mm))
+ goto err_unstable;
+
+ /*
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ vma = NULL;
+ goto err;
+ }
+
+ return vma;
+err:
+ rcu_read_unlock();
+
+ return vma;
+err_unstable:
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
+
+ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+
+ return NULL;
+}
+
+/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
* function returns NULL.
@@ -138,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
MA_STATE(mas, &mm->mm_mt, address, address);
struct vm_area_struct *vma;
- rcu_read_lock();
retry:
+ rcu_read_lock();
vma = mas_walk(&mas);
- if (!vma)
+ if (!vma) {
+ rcu_read_unlock();
goto inval;
+ }
vma = vma_start_read(mm, vma);
if (IS_ERR_OR_NULL(vma)) {
@@ -162,18 +253,17 @@ retry:
* From here on, we can access the VMA without worrying about which
* fields are accessible for RCU readers.
*/
+ rcu_read_unlock();
/* Check if the vma we locked is the right one. */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
- rcu_read_unlock();
return vma;
-inval_end_read:
- vma_end_read(vma);
inval:
- rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
}
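Callers are unaffected by the narrower RCU section; the typical arch fault pattern remains (a sketch):

	static vm_fault_t demo_fault(struct mm_struct *mm, unsigned long address,
				     unsigned int flags, struct pt_regs *regs)
	{
		struct vm_area_struct *vma;
		vm_fault_t fault;

		vma = lock_vma_under_rcu(mm, address);
		if (!vma)
			return VM_FAULT_RETRY;	/* fall back to the mmap_lock path */

		fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma);
		return fault;
	}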
@@ -228,6 +318,7 @@ retry:
*/
if (PTR_ERR(vma) == -EAGAIN) {
/* reset to search from the last address */
+ rcu_read_lock();
vma_iter_set(vmi, from_addr);
goto retry;
}
@@ -257,9 +348,9 @@ retry:
return vma;
fallback_unlock:
+ rcu_read_unlock();
vma_end_read(vma);
fallback:
- rcu_read_unlock();
vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
rcu_read_lock();
/* Reinitialize the iterator after re-entering rcu read section */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index b49cc6385f1f..374aa6f021c6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ batch = (void *)__get_free_page(GFP_NOWAIT);
if (!batch)
return false;
@@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
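Dropping __GFP_NOWARN in both hunks is a no-op if, as in recent kernels, GFP_NOWAIT already includes it; presumed definition:

	/* Presumed modern definition: may not sleep, never warns on failure. */
	#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)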
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f9baa8882fbf..0c8f181d9d50 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
- old_flags = READ_ONCE(folio->flags);
+ old_flags = READ_ONCE(folio->flags.f);
do {
flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags)));
return last_cpupid;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b819fafd57b..c3a23b082adb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
*/
unsigned int kobjsize(const void *objp)
{
- struct page *page;
+ struct folio *folio;
/*
* If the object we have should not have ksize performed on it,
@@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp)
if (!objp || !virt_addr_valid(objp))
return 0;
- page = virt_to_head_page(objp);
+ folio = virt_to_folio(objp);
/*
* If the allocator sets PageSlab, we know the pointer came from
* kmalloc().
*/
- if (PageSlab(page))
+ if (folio_test_slab(folio))
return ksize(objp);
/*
- * If it's not a compound page, see if we have a matching VMA
+ * If it's not a large folio, see if we have a matching VMA
* region. This test is intentionally done in reverse order,
* so if there's no VMA, we still fall through and hand back
- * PAGE_SIZE for 0-order pages.
+ * PAGE_SIZE for 0-order folios.
*/
- if (!PageCompound(page)) {
+ if (!folio_test_large(folio)) {
struct vm_area_struct *vma;
vma = find_vma(current->mm, (unsigned long)objp);
@@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return page_size(page);
+ return folio_size(folio);
}
void vfree(const void *addr)
@@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__vmalloc_noprof);
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..17650f0b516e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
- *
+ *
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
@@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
+ mm_flags_test(MMF_OOM_SKIP, p->mm) ||
in_vfork(p)) {
task_unlock(p);
return LONG_MIN;
@@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
- if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm))
goto next;
goto abort;
}
@@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
* should imply barriers already and the reader would hit a page fault
* if it stumbled over a reaped memory.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
for_each_vma(vmi, vma) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
@@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* under mmap_lock for reading because it serializes against the
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
trace_skip_task_reaping(tsk->pid);
goto out_unlock;
}
@@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
- test_bit(MMF_OOM_SKIP, &mm->flags))
+ mm_flags_test(MMF_OOM_SKIP, mm))
goto done;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
@@ -634,7 +634,7 @@ done:
* Hide this mm from OOM killer because it has been either reaped or
* somebody can't call mmap_write_unlock(mm).
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
@@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer)
unsigned long flags;
/* The victim managed to terminate on its own - see exit_mmap */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
put_task_struct(tsk);
return;
}
@@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm))
return;
get_task_struct(tsk);
@@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task)
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, mm))
return false;
if (atomic_read(&mm->mm_users) <= 1)
@@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
if (is_global_init(p)) {
can_oom_reap = false;
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
@@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
reap = true;
else {
/* Error only if the work has not been done already */
- if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm))
ret = -EINVAL;
}
task_unlock(p);
@@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
- if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
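The mm_flags_*() helpers adopted above replace open-coded bitops on mm->flags. A plausible shape for them, assuming they simply wrap the standard bitops (the __mm_flags field name is an assumption, not taken from this patch):

    static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
    {
            return test_bit(flag, mm->flags.__mm_flags);
    }

    static inline void mm_flags_set(int flag, struct mm_struct *mm)
    {
            set_bit(flag, mm->flags.__mm_flags);
    }

    static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
    {
            return test_and_set_bit(flag, mm->flags.__mm_flags);
    }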
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3e248d1c3969..5f90fd6a7137 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,10 +38,10 @@
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
+#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
-#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -2590,36 +2590,6 @@ done:
}
EXPORT_SYMBOL_GPL(writeback_iter);
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * Return: %0 on success, negative error code otherwise
- *
- * Note: please use writeback_iter() instead.
- */
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
- error = writepage(folio, wbc, data);
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- }
- }
-
- return error;
-}
-EXPORT_SYMBOL(write_cache_pages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2735,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
{
unsigned long flags;
+ /*
+ * Shmem writeback relies on swap, and swap writeback is LRU based,
+ * not using the dirty mark.
+ */
+ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping));
+
xa_lock_irqsave(&mapping->i_pages, flags);
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
- __xa_set_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -3019,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio)
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -3056,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
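With write_cache_pages() removed above, remaining users open-code the loop the wrapper used to provide. The replacement pattern follows directly from the deleted body (my_writepage stands in for the caller's own callback):

    struct folio *folio = NULL;
    int error;

    while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
            error = my_writepage(folio, wbc, data);
            if (error == AOP_WRITEPAGE_ACTIVATE) {
                    folio_unlock(folio);
                    error = 0;
            }
    }
    /* on exit, error holds the final status of the walk */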
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1d037f97c5f..0873d640f26c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
- return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
static __always_inline void
@@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
#else
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
- BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
bitmap = get_pageblock_bitmap(page, pfn);
@@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page,
"Use set_pageblock_isolate() for pageblock isolation");
return;
}
- VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
- PB_migrate_isolate),
+ VM_WARN_ONCE(get_pageblock_isolate(page),
"Use clear_pageblock_isolate() to unisolate pageblock");
/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
@@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
- else if (is_migrate_highatomic(migratetype))
+ else if (migratetype == MIGRATE_HIGHATOMIC)
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
@@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page,
page->memcg_data |
#endif
page_pool_page_is_pp(page) |
- (page->flags & check_flags)))
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page,
int i;
if (compound) {
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
folio->_nr_pages = 0;
#endif
@@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (folio_test_anon(folio)) {
@@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page,
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+ * If start_pfn is not an order-0 PageBuddy, the next PageBuddy containing
+ * start_pfn has a minimal order of __ffs(start_pfn) + 1. Start checking
+ * from order __ffs(start_pfn). If start_pfn is an order-0 PageBuddy,
+ * the starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
@@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
if (isolate)
- set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ set_pageblock_isolate(page);
else
- clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ clear_pageblock_isolate(page);
}
/**
@@ -4182,7 +4187,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4408,7 +4413,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
@@ -5946,7 +5951,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -6236,16 +6240,13 @@ static void calculate_totalreserve_pages(void)
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ for (j = i; j < MAX_NR_ZONES; j++)
+ max = max(max, zone->lowmem_reserve[j]);
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
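The __ffs() starting-order argument in find_large_buddy() above can be checked against a concrete pfn (a worked example, not from the patch):

    /*
     * start_pfn = 0x1840: lowest set bit is 6, so __ffs(start_pfn) == 6.
     * A buddy of order <= 6 that contains 0x1840 must start at 0x1840
     * itself (the low 6 bits are zero), so any buddy containing but not
     * starting at 0x1840 has order >= 7 == __ffs + 1. Starting the scan
     * at order 6 therefore skips orders 0..5 without missing anything.
     */
    unsigned long start_pfn = 0x1840;
    int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;  /* 6 here */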
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 648038247a8d..c6753d370ff4 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
- /*
- * TODO: FW_MIGRATION support for PUD migration entries
- * once there are relevant users.
- */
- if (!pud_present(pud) || pud_special(pud)) {
+ if (pud_none(pud)) {
spin_unlock(ptl);
goto not_found;
- } else if (!pud_leaf(pud)) {
+ } else if (pud_present(pud) && !pud_leaf(pud)) {
spin_unlock(ptl);
goto pmd_table;
+ } else if (pud_present(pud)) {
+ page = vm_normal_page_pud(vma, addr, pud);
+ if (page)
+ goto found;
}
/*
- * TODO: vm_normal_page_pud() will be handy once we want to
- * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+ * TODO: FW_MIGRATION support for PUD migration entries
+ * once there are relevant users.
*/
- page = pud_page(pud);
- goto found;
+ spin_unlock(ptl);
+ goto not_found;
}
pmd_table:
diff --git a/mm/rmap.c b/mm/rmap.c
index 568198e9efc2..34333ae3bd80 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -79,7 +79,6 @@
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/tlb.h>
#include <trace/events/migrate.h>
#include "internal.h"
@@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
- avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ avc = anon_vma_chain_alloc(GFP_NOWAIT);
if (unlikely(!avc)) {
unlock_anon_vma_root(root);
root = NULL;
@@ -1241,18 +1240,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
+static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
+{
+ int idx;
+
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, nr);
+ }
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio)) {
+ idx = NR_ANON_THPS;
+ __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ } else {
+ /* NR_*_PMDMAPPED are not maintained per-memcg */
+ idx = folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
+ __mod_node_page_state(folio_pgdat(folio), idx,
+ nr_pmdmapped);
+ }
+ }
+}
+
+static __always_inline void __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level, int *nr_pmdmapped)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
- int first = 0, nr = 0;
+ int first = 0, nr = 0, nr_pmdmapped = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_inc_and_test(&folio->_mapcount);
break;
@@ -1278,12 +1299,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
- if (level == RMAP_LEVEL_PMD && first)
- *nr_pmdmapped = folio_large_nr_pages(folio);
+ if (level == PGTABLE_LEVEL_PMD && first)
+ nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_inc_return_large_mapcount(folio, vma);
if (nr == 1)
/* Was completely unmapped. */
@@ -1301,8 +1322,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
* We only track PMD mappings of PMD-sized
* folios separately.
*/
- if (level == RMAP_LEVEL_PMD)
- *nr_pmdmapped = nr_pages;
+ if (level == PGTABLE_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
@@ -1314,8 +1335,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
}
folio_inc_large_mapcount(folio, vma);
break;
+ default:
+ BUILD_BUG();
}
- return nr;
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
}
/**
@@ -1403,59 +1426,37 @@ static void __page_check_anon_rmap(const struct folio *folio,
page);
}
-static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
-{
- int idx;
-
- if (nr) {
- idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, nr);
- }
- if (nr_pmdmapped) {
- if (folio_test_anon(folio)) {
- idx = NR_ANON_THPS;
- __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
- } else {
- /* NR_*_PMDMAPPED are not maintained per-memcg */
- idx = folio_test_swapbacked(folio) ?
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
- __mod_node_page_state(folio_pgdat(folio), idx,
- nr_pmdmapped);
- }
- }
-}
-
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- unsigned long address, rmap_t flags, enum rmap_level level)
+ unsigned long address, rmap_t flags, enum pgtable_level level)
{
- int i, nr, nr_pmdmapped = 0;
+ int i;
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
-
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
for (i = 0; i < nr_pages; i++)
SetPageAnonExclusive(page + i);
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Keep the compiler happy, we don't support anonymous
* PUD mappings.
*/
WARN_ON_ONCE(1);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -1509,7 +1510,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
rmap_t flags)
{
__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -1530,7 +1531,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1611,14 +1612,11 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
- int nr, nr_pmdmapped = 0;
-
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
/* See comments in folio_add_anon_rmap_*() */
if (!folio_test_large(folio))
@@ -1639,7 +1637,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1656,7 +1654,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1677,7 +1675,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1685,7 +1683,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last = 0, nr = 0, nr_pmdmapped = 0;
@@ -1694,7 +1692,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_add_negative(-1, &folio->_mapcount);
break;
@@ -1704,7 +1702,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
if (!nr) {
/* Now completely unmapped. */
- nr = folio_nr_pages(folio);
+ nr = folio_large_nr_pages(folio);
} else {
partially_mapped = nr < folio_large_nr_pages(folio) &&
!folio_entire_mapcount(folio);
@@ -1724,11 +1722,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
last = atomic_add_negative(-1, &folio->_entire_mapcount);
- if (level == RMAP_LEVEL_PMD && last)
+ if (level == PGTABLE_LEVEL_PMD && last)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_dec_return_large_mapcount(folio, vma);
if (!nr) {
@@ -1748,9 +1746,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
- if (level == RMAP_LEVEL_PMD)
+ if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
- nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
+ nr = nr_pages - nr;
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1762,6 +1760,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && nr < nr_pmdmapped;
break;
+ default:
+ BUILD_BUG();
}
/*
@@ -1801,7 +1801,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1818,7 +1818,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1839,7 +1839,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
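The RMAP_LEVEL_* -> PGTABLE_LEVEL_* rename throughout mm/rmap.c assumes a shared page-table-level enum, roughly as below; the exact member list is an assumption, and the new BUILD_BUG() default arms above only require the switch to be provably exhaustive at compile time:

    /* Assumed shared enum replacing the rmap-private levels: */
    enum pgtable_level {
            PGTABLE_LEVEL_PTE,
            PGTABLE_LEVEL_PMD,
            PGTABLE_LEVEL_PUD,
    };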
diff --git a/mm/shmem.c b/mm/shmem.c
index e2c76a30802b..640fecc42f60 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1006,15 +1006,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
unsigned long swapped = 0;
unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
@@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
@@ -2430,7 +2430,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -5081,7 +5080,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC;
#if IS_ENABLED(CONFIG_UNICODE)
if (!ctx->encoding && ctx->strict_encoding) {
@@ -5385,6 +5384,9 @@ int shmem_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+#ifdef CONFIG_TMPFS
+ fc->sb_flags |= SB_I_VERSION;
+#endif
return 0;
}
diff --git a/mm/slab.h b/mm/slab.h
index 248b34c839b7..c41a512dd07c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,7 +50,7 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long flags;
+ memdesc_flags_t flags;
struct kmem_cache *slab_cache;
union {
@@ -174,12 +174,12 @@ static inline void *slab_address(const struct slab *slab)
static inline int slab_nid(const struct slab *slab)
{
- return folio_nid(slab_folio(slab));
+ return memdesc_nid(slab->flags);
}
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
- return folio_pgdat(slab_folio(slab));
+ return NODE_DATA(slab_nid(slab));
}
static inline struct slab *virt_to_slab(const void *addr)
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..af343ca570b5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -657,17 +657,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
*/
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
- return test_bit(SL_pfmemalloc, &slab->flags);
+ return test_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void slab_set_pfmemalloc(struct slab *slab)
{
- set_bit(SL_pfmemalloc, &slab->flags);
+ set_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
- __clear_bit(SL_pfmemalloc, &slab->flags);
+ __clear_bit(SL_pfmemalloc, &slab->flags.f);
}
/*
@@ -675,12 +675,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab)
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(SL_locked, &slab->flags);
+ bit_spin_lock(SL_locked, &slab->flags.f);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(SL_locked, &slab->flags);
+ bit_spin_unlock(SL_locked, &slab->flags.f);
}
static inline bool
@@ -1046,7 +1046,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->flags);
+ &slab->flags.f);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -2755,17 +2755,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return test_bit(SL_partial, &slab->flags);
+ return test_bit(SL_partial, &slab->flags.f);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(SL_partial, &slab->flags);
+ set_bit(SL_partial, &slab->flags.f);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(SL_partial, &slab->flags);
+ clear_bit(SL_partial, &slab->flags.f);
}
/*
@@ -4881,7 +4881,7 @@ void kfree(const void *object)
EXPORT_SYMBOL(kfree);
static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
{
void *ret;
size_t ks = 0;
@@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (!kasan_check_byte(p))
return NULL;
+ /*
+ * If reallocation is not necessary (e.g. the new size is less
+ * than the currently allocated size), the current allocation will be
+ * preserved unless __GFP_THISNODE is set. In the latter case a new
+ * allocation on the requested node will be attempted.
+ */
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(virt_to_page(p)))
+ goto alloc_new;
+
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
@@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (new_size > ks)
goto alloc_new;
+ /* If the old object doesn't satisfy the new alignment, allocate a new one */
+ if (!IS_ALIGNED((unsigned long)p, align))
+ goto alloc_new;
+
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
@@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
alloc_new:
- ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -4951,14 +4965,19 @@ alloc_new:
}
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * krealloc_node_align - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
+ * @align: desired alignment.
* @flags: the type of memory to allocate.
+ * @nid: NUMA node or NUMA_NO_NODE
*
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -4983,7 +5002,8 @@ alloc_new:
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
+ gfp_t flags, int nid)
{
void *ret;
@@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __do_krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, align, flags, nid);
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
}
-EXPORT_SYMBOL(krealloc_noprof);
+EXPORT_SYMBOL(krealloc_node_align_noprof);
static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
{
@@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @b: which set of kmalloc buckets to allocate from.
+ * @align: desired alignment.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
@@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
*
* Return: pointer to the allocated memory or %NULL in case of failure
*/
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node)
{
void *ret;
@@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
* about the resulting pointer, and cannot play
* protection games.
*/
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
@@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len)
EXPORT_SYMBOL(kvfree_sensitive);
/**
- * kvrealloc - reallocate memory; contents remain unchanged
+ * kvrealloc_node_align - reallocate memory; contents remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: desired alignment
* @flags: the flags for the page level allocator
+ * @nid: NUMA node id
*
* If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
* and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive);
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
void *n;
if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
+ return vrealloc_node_align_noprof(p, size, align, flags, nid);
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
if (!n) {
/* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
+ n = kvmalloc_node_align_noprof(size, align, flags, nid);
if (!n)
return NULL;
@@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
return n;
}
-EXPORT_SYMBOL(kvrealloc_noprof);
+EXPORT_SYMBOL(kvrealloc_node_align_noprof);
struct detached_freelist {
struct slab *slab;
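Assuming the usual alloc-tagging wrappers (krealloc_node_align(), kvrealloc_node_align(), ...) are generated from the *_noprof variants added above, a caller wanting node-local, aligned memory would look roughly like:

    /* Hypothetical caller of the node/alignment-aware realloc: */
    void *buf = krealloc_node_align(old, new_size, 64,
                                    GFP_KERNEL | __GFP_THISNODE, nid);
    if (!buf)
            return -ENOMEM;
    /*
     * With __GFP_THISNODE and a valid nid, even a shrink request
     * reallocates when 'old' resides on a different node (see the
     * __do_krealloc() hunk above).
     */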
diff --git a/mm/sparse.c b/mm/sparse.c
index e6075b622407..17c50a6415c2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(const struct page *page)
+int memdesc_nid(memdesc_flags_t mdf)
{
- return section_to_node_table[page_to_section(page)];
+ return section_to_node_table[memdesc_section(mdf)];
}
-EXPORT_SYMBOL(page_to_nid);
+EXPORT_SYMBOL(memdesc_nid);
static void set_section_nid(unsigned long section_nr, int nid)
{
diff --git a/mm/swap.c b/mm/swap.c
index b74ebe865dd9..b8cea6a1b86f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -388,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio)
static void lru_gen_inc_refs(struct folio *folio)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
if (folio_test_unevictable(folio))
return;
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
@@ -407,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio)
}
new_flags = old_flags + BIT(LRU_REFS_PGOFF);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
}
static bool lru_gen_clear_refs(struct folio *folio)
@@ -419,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio)
if (gen < 0)
return true;
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
lrugen = &folio_lruvec(folio)->lrugen;
/* whether can do without shuffling under the LRU lock */
@@ -1098,7 +1098,7 @@ static const struct ctl_table swap_sysctl_table[] = {
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
+ unsigned long megs = PAGES_TO_MB(totalram_pages());
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap.h b/mm/swap.h
index 911ad5ff0f89..1ae44d4193b1 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -64,9 +64,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index);
-
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
@@ -178,13 +175,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
return NULL;
}
-static inline
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- return filemap_get_folio(mapping, index);
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c354435a0923..99513b74b5d8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -323,44 +323,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
return folio;
}
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- swp_entry_t swp;
- struct swap_info_struct *si;
- struct folio *folio = filemap_get_entry(mapping, index);
-
- if (!folio)
- return ERR_PTR(-ENOENT);
- if (!xa_is_value(folio))
- return folio;
- if (!shmem_mapping(mapping))
- return ERR_PTR(-ENOENT);
-
- swp = radix_to_swp_entry(folio);
- /* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swp))
- return ERR_PTR(-ENOENT);
- /* Prevent swapoff from happening to us */
- si = get_swap_device(swp);
- if (!si)
- return ERR_PTR(-ENOENT);
- index = swap_cache_index(swp);
- folio = filemap_get_folio(swap_address_space(swp), index);
- put_swap_device(si);
- return folio;
-}
-
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..a7ffabbe65ef 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si,
else
list_move_tail(&ci->list, list);
spin_unlock(&si->lock);
-
- if (ci->flags == CLUSTER_FLAG_FRAG)
- atomic_long_dec(&si->frag_cluster_nr[ci->order]);
- else if (new_flags == CLUSTER_FLAG_FRAG)
- atomic_long_inc(&si->frag_cluster_nr[ci->order]);
ci->flags = new_flags;
}
@@ -825,6 +820,29 @@ out:
return found;
}
+static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
+ struct list_head *list,
+ unsigned int order,
+ unsigned char usage,
+ bool scan_all)
+{
+ unsigned int found = SWAP_ENTRY_INVALID;
+
+ do {
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+ unsigned long offset;
+
+ if (!ci)
+ break;
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (found)
+ break;
+ } while (scan_all);
+
+ return found;
+}
+
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -913,46 +931,46 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
new_cluster:
- ci = isolate_lock_cluster(si, &si->free_clusters);
- if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ /*
+ * If the device needs discard, prefer a new cluster over a nonfull
+ * one to spread out the writes.
+ */
+ if (si->flags & SWP_PAGE_DISCARD) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
if (found)
goto done;
}
- /* Try reclaim from full clusters if free clusters list is drained */
+ if (order < PMD_ORDER) {
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
+ order, usage, true);
+ if (found)
+ goto done;
+ }
+
+ if (!(si->flags & SWP_PAGE_DISCARD)) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
+ if (found)
+ goto done;
+ }
+
+ /* Try reclaiming full clusters if the free and nonfull lists are drained */
if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0, frags_existing;
-
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- /* Clusters failed to allocate are moved to frag_clusters */
- frags++;
- }
-
- frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
- while (frags < frags_existing &&
- (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
- atomic_long_dec(&si->frag_cluster_nr[order]);
- /*
- * Rotate the frag list to iterate, they were all
- * failing high order allocation or moved here due to
- * per-CPU usage, but they could contain newly released
- * reclaimable (eg. lazy-freed swap cache) slots.
- */
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- frags++;
- }
+ /*
+ * Scanning only one fragment cluster is good enough: order-0
+ * allocation will surely succeed, and large allocation
+ * failure is not critical. Scanning one cluster still
+ * keeps the list rotated and reclaimed (for HAS_CACHE).
+ */
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
+ usage, false);
+ if (found)
+ goto done;
}
/*
@@ -971,20 +989,15 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- atomic_long_dec(&si->frag_cluster_nr[o]);
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
}
done:
if (!(si->flags & SWP_SOLIDSTATE))
@@ -3224,7 +3237,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- atomic_long_set(&si->frag_cluster_nr[i], 0);
}
/*
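The scan_all parameter of the new helper decides between draining a list and sampling a single cluster; the two call styles used above reduce to (illustrative recap):

    /* Drain: keep isolating clusters until one yields an entry. */
    found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
                                 order, usage, true);

    /* Sample: try exactly one cluster, as for the fragment list. */
    found = alloc_swap_scan_list(si, &si->frag_clusters[order],
                                 order, usage, false);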
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index aefdf3a812a1..50aaa8dcd24c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for batched
+ * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
+ *
+ * NOTE: folio's reference is not required as the whole operation is within
+ * PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte,
+ struct anon_vma *src_anon_vma)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
+ folio_anon_vma(folio) != src_anon_vma) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they share the same
+ * anon_vma as the first folio, are not large, and can successfully
+ * take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len,
+ struct anon_vma *src_anon_vma)
{
int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ flush_cache_range(src_vma, src_addr, src_end);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm,
err = -EBUSY;
goto out;
}
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
+
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
+ dst_pte, src_anon_vma);
+ if (!src_folio)
+ break;
+ }
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ return src_addr > src_start ? src_addr - src_start : err;
}
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
@@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
@@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return the number of bytes moved.
+ * On failure, an error code is returned.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
@@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
- int err = 0;
+ long ret = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
@@ -1212,7 +1282,7 @@ retry:
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1231,14 +1301,14 @@ retry:
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/* Sanity checks before the operation */
if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1246,7 +1316,7 @@ retry:
orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1255,21 +1325,21 @@ retry:
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
- err = -ENOENT;
+ ret = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ ret = PAGE_SIZE;
goto out;
}
/* If the PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (pte_present(orig_src_pte)) {
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
- err = move_zeropage_pte(mm, dst_vma, src_vma,
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval, dst_ptl, src_ptl);
@@ -1292,14 +1362,14 @@ retry:
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !PageAnonExclusive(&folio->page)) {
spin_unlock(src_ptl);
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
@@ -1313,7 +1383,7 @@ retry:
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1332,7 +1402,7 @@ retry:
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
}
@@ -1343,8 +1413,8 @@ retry:
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
- err = split_folio(src_folio);
- if (err)
+ ret = split_folio(src_folio);
+ if (ret)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
@@ -1362,7 +1432,7 @@ retry:
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
@@ -1375,10 +1445,11 @@ retry:
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len, src_anon_vma);
} else {
struct folio *folio = NULL;
@@ -1389,20 +1460,20 @@ retry:
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
- err = -EAGAIN;
+ ret = -EAGAIN;
} else
- err = -EFAULT;
+ ret = -EFAULT;
goto out;
}
if (!pte_swp_exclusive(orig_src_pte)) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -1422,7 +1493,7 @@ retry:
swap_cache_index(entry));
if (!IS_ERR_OR_NULL(folio)) {
if (folio_test_large(folio)) {
- err = -EBUSY;
+ ret = -EBUSY;
folio_put(folio);
goto out;
}
@@ -1439,7 +1510,7 @@ retry:
goto retry;
}
}
- err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio, si, entry);
}
@@ -1466,7 +1537,7 @@ out:
if (si)
put_swap_device(si);
- return err;
+ return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1737,7 +1808,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
@@ -1780,8 +1851,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
@@ -1849,6 +1920,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
dst_addr, src_addr);
step_size = HPAGE_PMD_SIZE;
} else {
+ long ret;
+
if (pmd_none(*src_pmd)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
err = -ENOENT;
@@ -1865,10 +1938,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
}
cond_resched();
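
For reference, the loop above now relies on this contract: move_pages_ptes()
returns the number of bytes moved (a positive multiple of PAGE_SIZE;
PAGE_SIZE for a skipped hole) or a negative errno. A condensed sketch of the
consuming loop; the address-advance step lies outside the hunks shown and is
an assumption here:

	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
	     src_addr < src_end;) {
		long ret;

		ret = move_pages_ptes(mm, dst_pmd, src_pmd, dst_vma, src_vma,
				      dst_addr, src_addr, src_end - src_addr, mode);
		if (ret < 0) {
			err = ret;	/* -EAGAIN, -EBUSY, -ENOENT, ... */
			break;
		}
		/* assumed: both cursors advance by the batch actually moved */
		src_addr += ret;
		dst_addr += ret;
		moved += ret;
	}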
diff --git a/mm/util.c b/mm/util.c
index f814e6a59ab1..d235b74f7aff 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
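
The mm_flags_set()/mm_flags_clear() calls above replace open-coded bitops on
mm->flags. A sketch of the likely shape of these accessors, assuming the
flags word moved into a wrapper type (their real definitions are outside
this diff):

	static inline void mm_flags_set(int flag, struct mm_struct *mm)
	{
		/* assumption: mm->flags is now a struct wrapping a bitmap */
		set_bit(flag, mm->flags.__mm_flags);
	}

	static inline void mm_flags_clear(int flag, struct mm_struct *mm)
	{
		clear_bit(flag, mm->flags.__mm_flags);
	}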
diff --git a/mm/vma.h b/mm/vma.h
index b123a9cdedb0..bcdc261c5b15 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -145,7 +145,7 @@ struct vma_merge_struct {
*/
bool __remove_middle :1;
/*
- * Internal flag used during the merge operationr to indicate we will
+ * Internal flag used during the merge operation to indicate we will
* remove vmg->next.
*/
bool __remove_next :1;
diff --git a/mm/vma_init.c b/mm/vma_init.c
index 8e53c7943561..d847c6557261 100644
--- a/mm/vma_init.c
+++ b/mm/vma_init.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared
+ * Functions for initializing, allocating, freeing and duplicating VMAs. Shared
* between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
*/
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5edd536ba9d2..4249e1e01947 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
+ * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: requested alignment
* @flags: the flags for the page level allocator
+ * @nid: node number of the target node
+ *
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
- * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
- * @p is not a %NULL pointer, the object pointed to is freed.
+ * If the caller wants the new memory to be allocated on a specific node *only*,
+ * the __GFP_THISNODE flag should be set; otherwise the function will try to avoid
+ * reallocation and may disregard the specified @nid.
*
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
* __GFP_ZERO is not fully honored by this API.
*
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
+ *
* In any case, the contents of the object pointed to are preserved up to the
* lesser of the new and old sizes.
*
@@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
* failure
*/
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
struct vm_struct *vm = NULL;
size_t alloced_size = 0;
@@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (WARN(alloced_size < old_size,
"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
}
/*
@@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
return (void *)p;
}
+need_realloc:
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
- n = __vmalloc_noprof(size, flags);
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
if (!n)
return NULL;
@@ -5177,7 +5196,7 @@ static void vmap_init_nodes(void)
int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
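
A hypothetical call site for the extended reallocator (not part of this
diff). Two points from the kernel-doc above are visible here: the requested
alignment must not exceed that of the original allocation, and
__GFP_THISNODE turns the node hint into a hard requirement, forcing a fresh
allocation when the buffer sits on the wrong node:

	static int grow_on_node(void **bufp, size_t new_size, int nid)
	{
		void *n;

		n = vrealloc_node_align(*bufp, new_size, PAGE_SIZE,
					GFP_KERNEL | __GFP_THISNODE, nid);
		if (!n)
			return -ENOMEM;
		*bufp = n;
		return 0;
	}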
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 674999999cd0..ca9e1cd3cd68 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
- /*
- * If there are no reclaimable file-backed or anonymous pages,
- * ensure zones with sufficient free pages are not skipped.
- * This prevents zones like DMA32 from being ignored in reclaim
- * scenarios where they can still help alleviate memory pressure.
- */
- if (nr == 0)
- nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
return nr;
}
@@ -888,11 +881,11 @@ static bool lru_gen_set_refs(struct folio *folio)
{
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return false;
}
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
return true;
}
#else
@@ -3257,13 +3250,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return -1;
}
@@ -3274,7 +3267,7 @@ static int folio_update_gen(struct folio *folio, int gen)
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -3285,7 +3278,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
@@ -3302,7 +3295,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
/* for folio_end_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -4553,7 +4546,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4766,7 +4759,7 @@ retry:
/* don't add rejected folios to the oldest generation */
if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -5561,6 +5554,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (memcg_id != mem_cgroup_id(memcg))
goto done;
+ sc->target_mem_cgroup = memcg;
lruvec = get_lruvec(memcg, nid);
if (swappiness < MIN_SWAPPINESS)
@@ -5597,6 +5591,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .proactive = true,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6493,7 +6488,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
- if (!zone_reclaimable_pages(zone))
+ if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
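
The two vmscan hunks are a pair: the NR_FREE_PAGES fallback is dropped from
zone_reclaimable_pages(), while allow_direct_reclaim() now performs the
equivalent check itself. Condensed, the throttling loop becomes:

	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
		/* skip zones with nothing to reclaim but free pages left */
		if (!zone_reclaimable_pages(zone) &&
		    zone_page_state_snapshot(zone, NR_FREE_PAGES))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		/* ... */
	}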
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..e522decf6a72 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
[I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
[I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
+ [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
#endif
[I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
[I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
@@ -1289,6 +1290,7 @@ const char * const vmstat_text[] = {
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+ [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
/* system-wide enum vm_stat_item counters */
diff --git a/mm/workingset.c b/mm/workingset.c
index 6e7f4cb1b9a7..68a76a91111f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
} else
- set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
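
The `.f` dereferences throughout these hunks reflect folio->flags changing
from a bare unsigned long to a wrapped type. A sketch of the assumed wrapper
(its actual definition is outside this diff):

	typedef struct {
		unsigned long f;	/* the raw flags word */
	} memdesc_flags_t;

	/* open-coded flag updates now name the inner word explicitly: */
	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));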
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 805a10b41266..153783d49d34 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* instead.
*/
if (!zpdesc->zspage)
- return MIGRATEPAGE_SUCCESS;
+ return 0;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
@@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
reset_zpdesc(zpdesc);
zpdesc_put(zpdesc);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void zs_page_putback(struct page *page)
diff --git a/mm/zswap.c b/mm/zswap.c
index 3c0fd8a13718..e5e1f5687f5e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -42,8 +42,10 @@
/*********************************
* statistics
**********************************/
-/* The number of compressed pages currently stored in zswap */
+/* The number of pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
+/* The number of incompressible pages currently stored in zswap */
+static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -811,6 +813,8 @@ static void zswap_entry_free(struct zswap_entry *entry)
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
}
+ if (entry->length == PAGE_SIZE)
+ atomic_long_dec(&zswap_stored_incompressible_pages);
zswap_entry_cache_free(entry);
atomic_long_dec(&zswap_stored_pages);
}
@@ -827,7 +831,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
u8 *buffer = NULL;
int ret;
- buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
if (!buffer) {
ret = -ENOMEM;
goto fail;
@@ -948,18 +952,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zpool *zpool;
gfp_t gfp;
u8 *dst;
+ bool mapped = false;
acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
+ sg_init_one(&output, dst, PAGE_SIZE);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
/*
@@ -976,8 +976,26 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
*/
comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (comp_ret)
- goto unlock;
+
+ /*
+ * If a page cannot be compressed into a size smaller than PAGE_SIZE,
+ * save the content as is, without compression, to keep the LRU order
+ * of writebacks. If writeback is disabled, reject the page since it
+ * only adds metadata overhead. swap_writeout() will put the page back
+ * on the active LRU list in that case.
+ */
+ if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
+ if (!mem_cgroup_zswap_writeback_enabled(
+ folio_memcg(page_folio(page)))) {
+ comp_ret = comp_ret ? comp_ret : -EINVAL;
+ goto unlock;
+ }
+ comp_ret = 0;
+ dlen = PAGE_SIZE;
+ dst = kmap_local_page(page);
+ mapped = true;
+ }
zpool = pool->zpool;
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
@@ -990,6 +1008,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
entry->length = dlen;
unlock:
+ if (mapped)
+ kunmap_local(dst);
if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
zswap_reject_compress_poor++;
else if (comp_ret)
@@ -1006,12 +1026,18 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct zpool *zpool = entry->pool->zpool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- int decomp_ret, dlen;
+ int decomp_ret = 0, dlen = PAGE_SIZE;
u8 *src, *obj;
acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ /* zswap entries of length PAGE_SIZE are not compressed. */
+ if (entry->length == PAGE_SIZE) {
+ memcpy_to_folio(folio, 0, obj, entry->length);
+ goto read_done;
+ }
+
/*
* zpool_obj_read_begin() might return a kmap address of highmem when
* acomp_ctx->buffer is not used. However, sg_init_one() does not
@@ -1032,6 +1058,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
+read_done:
zpool_obj_read_end(zpool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
@@ -1524,6 +1551,8 @@ static bool zswap_store_page(struct page *page,
obj_cgroup_charge_zswap(objcg, entry->length);
}
atomic_long_inc(&zswap_stored_pages);
+ if (entry->length == PAGE_SIZE)
+ atomic_long_inc(&zswap_stored_incompressible_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1792,6 +1821,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+static int debugfs_get_stored_incompressible_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_incompressible_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops,
+ debugfs_get_stored_incompressible_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1819,6 +1856,9 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_file("stored_pages", 0444,
zswap_debugfs_root, NULL, &stored_pages_fops);
+ debugfs_create_file("stored_incompressible_pages", 0444,
+ zswap_debugfs_root, NULL,
+ &stored_incompressible_pages_fops);
return 0;
}
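
Across the zswap hunks, entry->length == PAGE_SIZE becomes the marker for
"stored as-is": the store path falls back to a plain copy when compression
would not shrink the page, the load path copies such entries back without
running the decompressor, and the new counter keys off the same test.
Distilled (the helper below is hypothetical, for illustration only):

	static inline bool zswap_entry_is_incompressible(struct zswap_entry *entry)
	{
		return entry->length == PAGE_SIZE;
	}

	/* load side, in essence: */
	if (zswap_entry_is_incompressible(entry)) {
		memcpy_to_folio(folio, 0, obj, PAGE_SIZE);	/* no decompression */
		goto read_done;
	}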
diff --git a/rust/helpers/slab.c b/rust/helpers/slab.c
index a842bfbddcba..7fac958907b0 100644
--- a/rust/helpers/slab.c
+++ b/rust/helpers/slab.c
@@ -3,13 +3,15 @@
#include <linux/slab.h>
void * __must_check __realloc_size(2)
-rust_helper_krealloc(const void *objp, size_t new_size, gfp_t flags)
+rust_helper_krealloc_node_align(const void *objp, size_t new_size, unsigned long align,
+ gfp_t flags, int node)
{
- return krealloc(objp, new_size, flags);
+ return krealloc_node_align(objp, new_size, align, flags, node);
}
void * __must_check __realloc_size(2)
-rust_helper_kvrealloc(const void *p, size_t size, gfp_t flags)
+rust_helper_kvrealloc_node_align(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
- return kvrealloc(p, size, flags);
+ return kvrealloc_node_align(p, size, align, flags, node);
}
diff --git a/rust/helpers/vmalloc.c b/rust/helpers/vmalloc.c
index 80d34501bbc0..7d7f7336b3d2 100644
--- a/rust/helpers/vmalloc.c
+++ b/rust/helpers/vmalloc.c
@@ -3,7 +3,8 @@
#include <linux/vmalloc.h>
void * __must_check __realloc_size(2)
-rust_helper_vrealloc(const void *p, size_t size, gfp_t flags)
+rust_helper_vrealloc_node_align(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
- return vrealloc(p, size, flags);
+ return vrealloc_node_align(p, size, align, flags, node);
}
diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs
index a2c49e5494d3..b39c279236f5 100644
--- a/rust/kernel/alloc.rs
+++ b/rust/kernel/alloc.rs
@@ -28,6 +28,8 @@ pub use self::kvec::Vec;
/// Indicates an allocation error.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub struct AllocError;
+
+use crate::error::{code::EINVAL, Result};
use core::{alloc::Layout, ptr::NonNull};
/// Flags to be used when allocating memory.
@@ -115,6 +117,31 @@ pub mod flags {
pub const __GFP_NOWARN: Flags = Flags(bindings::__GFP_NOWARN);
}
+/// Non-Uniform Memory Access (NUMA) node identifier.
+#[derive(Clone, Copy, PartialEq)]
+pub struct NumaNode(i32);
+
+impl NumaNode {
+ /// Create a new NUMA node identifier (non-negative integer).
+ ///
+ /// Returns [`EINVAL`] if a negative id, or an id equal to or above
+ /// [`bindings::MAX_NUMNODES`], is specified.
+ pub fn new(node: i32) -> Result<Self> {
+ // MAX_NUMNODES never exceeds 2**10 because NODES_SHIFT is 0..10.
+ if node < 0 || node >= bindings::MAX_NUMNODES as i32 {
+ return Err(EINVAL);
+ }
+ Ok(Self(node))
+ }
+}
+
+/// Constant used to tell the [`Allocator`] that the caller does not care which NUMA node the
+/// memory is allocated from.
+impl NumaNode {
+ /// No node preference.
+ pub const NO_NODE: NumaNode = NumaNode(bindings::NUMA_NO_NODE);
+}
+
/// The kernel's [`Allocator`] trait.
///
/// An implementation of [`Allocator`] can allocate, re-allocate and free memory buffers described
@@ -137,7 +164,7 @@ pub mod flags {
/// - Implementers must ensure that all trait functions abide by the guarantees documented in the
/// `# Guarantees` sections.
pub unsafe trait Allocator {
- /// Allocate memory based on `layout` and `flags`.
+ /// Allocate memory based on `layout`, `flags` and `nid`.
///
/// On success, returns a buffer represented as `NonNull<[u8]>` that satisfies the layout
/// constraints (i.e. minimum size and alignment as specified by `layout`).
@@ -153,13 +180,21 @@ pub unsafe trait Allocator {
///
/// Additionally, `Flags` are honored as documented in
/// <https://docs.kernel.org/core-api/mm-api.html#mm-api-gfp-flags>.
- fn alloc(layout: Layout, flags: Flags) -> Result<NonNull<[u8]>, AllocError> {
+ fn alloc(layout: Layout, flags: Flags, nid: NumaNode) -> Result<NonNull<[u8]>, AllocError> {
// SAFETY: Passing `None` to `realloc` is valid by its safety requirements and asks for a
// new memory allocation.
- unsafe { Self::realloc(None, layout, Layout::new::<()>(), flags) }
+ unsafe { Self::realloc(None, layout, Layout::new::<()>(), flags, nid) }
}
- /// Re-allocate an existing memory allocation to satisfy the requested `layout`.
+ /// Re-allocate an existing memory allocation to satisfy the requested `layout`, optionally
+ /// allocating the memory on a specific NUMA node.
+ ///
+ /// Systems employing a Non-Uniform Memory Access (NUMA) architecture contain collections of
+ /// hardware resources, including processors, memory, and I/O buses, that together comprise
+ /// what is commonly known as a NUMA node.
+ ///
+ /// `nid` is the NUMA node identifier: a non-negative integer when a specific node is
+ /// requested, or [`NumaNode::NO_NODE`] if the caller doesn't care.
///
/// If the requested size is zero, `realloc` behaves equivalent to `free`.
///
@@ -196,6 +231,7 @@ pub unsafe trait Allocator {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError>;
/// Free an existing memory allocation.
@@ -211,7 +247,15 @@ pub unsafe trait Allocator {
// SAFETY: The caller guarantees that `ptr` points at a valid allocation created by this
// allocator. We are passing a `Layout` with the smallest possible alignment, so it is
// smaller than or equal to the alignment previously used with this allocation.
- let _ = unsafe { Self::realloc(Some(ptr), Layout::new::<()>(), layout, Flags(0)) };
+ let _ = unsafe {
+ Self::realloc(
+ Some(ptr),
+ Layout::new::<()>(),
+ layout,
+ Flags(0),
+ NumaNode::NO_NODE,
+ )
+ };
}
}
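
On the C side, [`NumaNode::NO_NODE`] maps to NUMA_NO_NODE, which existing
kernel allocators already treat as "no preference"; for example:

	/* behaves like plain kmalloc(): any node may satisfy the request */
	void *p = kmalloc_node(size, GFP_KERNEL, NUMA_NO_NODE);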
diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs
index 2692cf90c948..b561e7a57bb8 100644
--- a/rust/kernel/alloc/allocator.rs
+++ b/rust/kernel/alloc/allocator.rs
@@ -13,9 +13,8 @@ use core::alloc::Layout;
use core::ptr;
use core::ptr::NonNull;
-use crate::alloc::{AllocError, Allocator};
+use crate::alloc::{AllocError, Allocator, NumaNode};
use crate::bindings;
-use crate::pr_warn;
/// The contiguous kernel allocator.
///
@@ -45,20 +44,26 @@ pub struct KVmalloc;
/// # Invariants
///
-/// One of the following: `krealloc`, `vrealloc`, `kvrealloc`.
+/// One of the following: `krealloc_node_align`, `vrealloc_node_align`, `kvrealloc_node_align`.
struct ReallocFunc(
- unsafe extern "C" fn(*const crate::ffi::c_void, usize, u32) -> *mut crate::ffi::c_void,
+ unsafe extern "C" fn(
+ *const crate::ffi::c_void,
+ usize,
+ crate::ffi::c_ulong,
+ u32,
+ crate::ffi::c_int,
+ ) -> *mut crate::ffi::c_void,
);
impl ReallocFunc {
- // INVARIANT: `krealloc` satisfies the type invariants.
- const KREALLOC: Self = Self(bindings::krealloc);
+ // INVARIANT: `krealloc_node_align` satisfies the type invariants.
+ const KREALLOC: Self = Self(bindings::krealloc_node_align);
- // INVARIANT: `vrealloc` satisfies the type invariants.
- const VREALLOC: Self = Self(bindings::vrealloc);
+ // INVARIANT: `vrealloc_node_align` satisfies the type invariants.
+ const VREALLOC: Self = Self(bindings::vrealloc_node_align);
- // INVARIANT: `kvrealloc` satisfies the type invariants.
- const KVREALLOC: Self = Self(bindings::kvrealloc);
+ // INVARIANT: `kvrealloc_node_align` satisfies the type invariants.
+ const KVREALLOC: Self = Self(bindings::kvrealloc_node_align);
/// # Safety
///
@@ -76,6 +81,7 @@ impl ReallocFunc {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError> {
let size = layout.size();
let ptr = match ptr {
@@ -99,7 +105,7 @@ impl ReallocFunc {
// - Those functions provide the guarantees of this function.
let raw_ptr = unsafe {
// If `size == 0` and `ptr != NULL` the memory behind the pointer is freed.
- self.0(ptr.cast(), size, flags.0).cast()
+ self.0(ptr.cast(), size, layout.align(), flags.0, nid.0).cast()
};
let ptr = if size == 0 {
@@ -134,11 +140,12 @@ unsafe impl Allocator for Kmalloc {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError> {
let layout = Kmalloc::aligned_layout(layout);
// SAFETY: `ReallocFunc::call` has the same safety requirements as `Allocator::realloc`.
- unsafe { ReallocFunc::KREALLOC.call(ptr, layout, old_layout, flags) }
+ unsafe { ReallocFunc::KREALLOC.call(ptr, layout, old_layout, flags, nid) }
}
}
@@ -153,16 +160,11 @@ unsafe impl Allocator for Vmalloc {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError> {
- // TODO: Support alignments larger than PAGE_SIZE.
- if layout.align() > bindings::PAGE_SIZE {
- pr_warn!("Vmalloc does not support alignments larger than PAGE_SIZE yet.\n");
- return Err(AllocError);
- }
-
// SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously
// allocated with this `Allocator`.
- unsafe { ReallocFunc::VREALLOC.call(ptr, layout, old_layout, flags) }
+ unsafe { ReallocFunc::VREALLOC.call(ptr, layout, old_layout, flags, nid) }
}
}
@@ -177,19 +179,70 @@ unsafe impl Allocator for KVmalloc {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError> {
// `KVmalloc` may use the `Kmalloc` backend, hence we have to enforce a `Kmalloc`
// compatible layout.
let layout = Kmalloc::aligned_layout(layout);
- // TODO: Support alignments larger than PAGE_SIZE.
- if layout.align() > bindings::PAGE_SIZE {
- pr_warn!("KVmalloc does not support alignments larger than PAGE_SIZE yet.\n");
- return Err(AllocError);
- }
-
// SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously
// allocated with this `Allocator`.
- unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags) }
+ unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags, nid) }
+ }
+}
+
+#[macros::kunit_tests(rust_allocator)]
+mod tests {
+ use super::*;
+ use core::mem::MaybeUninit;
+ use kernel::prelude::*;
+
+ #[test]
+ fn test_alignment() -> Result {
+ const TEST_SIZE: usize = 1024;
+ const TEST_LARGE_ALIGN_SIZE: usize = kernel::page::PAGE_SIZE * 4;
+
+ // These two structs are used to test allocating aligned memory.
+ // They don't need to be accessed, so they're marked as dead_code.
+ #[expect(dead_code)]
+ #[repr(align(128))]
+ struct Blob([u8; TEST_SIZE]);
+ #[expect(dead_code)]
+ #[repr(align(8192))]
+ struct LargeAlignBlob([u8; TEST_LARGE_ALIGN_SIZE]);
+
+ struct TestAlign<T, A: Allocator>(Box<MaybeUninit<T>, A>);
+ impl<T, A: Allocator> TestAlign<T, A> {
+ fn new() -> Result<Self> {
+ Ok(Self(Box::<_, A>::new_uninit(GFP_KERNEL)?))
+ }
+
+ fn is_aligned_to(&self, align: usize) -> bool {
+ assert!(align.is_power_of_two());
+
+ let addr = self.0.as_ptr() as usize;
+ addr & (align - 1) == 0
+ }
+ }
+
+ let ta = TestAlign::<Blob, Kmalloc>::new()?;
+ assert!(ta.is_aligned_to(128));
+
+ let ta = TestAlign::<LargeAlignBlob, Kmalloc>::new()?;
+ assert!(ta.is_aligned_to(8192));
+
+ let ta = TestAlign::<Blob, Vmalloc>::new()?;
+ assert!(ta.is_aligned_to(128));
+
+ let ta = TestAlign::<LargeAlignBlob, Vmalloc>::new()?;
+ assert!(ta.is_aligned_to(8192));
+
+ let ta = TestAlign::<Blob, KVmalloc>::new()?;
+ assert!(ta.is_aligned_to(128));
+
+ let ta = TestAlign::<LargeAlignBlob, KVmalloc>::new()?;
+ assert!(ta.is_aligned_to(8192));
+
+ Ok(())
}
}
diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs
index 90dd987d40e4..2e61cdbd2303 100644
--- a/rust/kernel/alloc/allocator_test.rs
+++ b/rust/kernel/alloc/allocator_test.rs
@@ -9,7 +9,7 @@
#![allow(missing_docs)]
-use super::{flags::*, AllocError, Allocator, Flags};
+use super::{flags::*, AllocError, Allocator, Flags, NumaNode};
use core::alloc::Layout;
use core::cmp;
use core::ptr;
@@ -51,6 +51,7 @@ unsafe impl Allocator for Cmalloc {
layout: Layout,
old_layout: Layout,
flags: Flags,
+ _nid: NumaNode,
) -> Result<NonNull<[u8]>, AllocError> {
let src = match ptr {
Some(src) => {
diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs
index 856d05aa60f1..1fef9beb57c8 100644
--- a/rust/kernel/alloc/kbox.rs
+++ b/rust/kernel/alloc/kbox.rs
@@ -4,7 +4,7 @@
#[allow(unused_imports)] // Used in doc comments.
use super::allocator::{KVmalloc, Kmalloc, Vmalloc};
-use super::{AllocError, Allocator, Flags};
+use super::{AllocError, Allocator, Flags, NumaNode};
use core::alloc::Layout;
use core::borrow::{Borrow, BorrowMut};
use core::fmt;
@@ -273,7 +273,7 @@ where
/// ```
pub fn new_uninit(flags: Flags) -> Result<Box<MaybeUninit<T>, A>, AllocError> {
let layout = Layout::new::<MaybeUninit<T>>();
- let ptr = A::alloc(layout, flags)?;
+ let ptr = A::alloc(layout, flags, NumaNode::NO_NODE)?;
// INVARIANT: `ptr` is either a dangling pointer or points to memory allocated with `A`,
// which is sufficient in size and alignment for storing a `T`.
diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs
index 3c72e0bdddb8..92d0ed3f302e 100644
--- a/rust/kernel/alloc/kvec.rs
+++ b/rust/kernel/alloc/kvec.rs
@@ -5,7 +5,7 @@
use super::{
allocator::{KVmalloc, Kmalloc, Vmalloc},
layout::ArrayLayout,
- AllocError, Allocator, Box, Flags,
+ AllocError, Allocator, Box, Flags, NumaNode,
};
use core::{
borrow::{Borrow, BorrowMut},
@@ -634,6 +634,7 @@ where
layout.into(),
self.layout.into(),
flags,
+ NumaNode::NO_NODE,
)?
};
@@ -1111,7 +1112,13 @@ where
// the type invariant to be smaller than `cap`. Depending on `realloc` this operation
// may shrink the buffer or leave it as it is.
ptr = match unsafe {
- A::realloc(Some(buf.cast()), layout.into(), old_layout.into(), flags)
+ A::realloc(
+ Some(buf.cast()),
+ layout.into(),
+ old_layout.into(),
+ flags,
+ NumaNode::NO_NODE,
+ )
} {
// If we fail to shrink, which likely can't even happen, continue with the existing
// buffer.
diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs
index 43f525c0d16c..4764d7b68f2a 100644
--- a/rust/kernel/mm.rs
+++ b/rust/kernel/mm.rs
@@ -13,7 +13,8 @@
use crate::{
bindings,
- types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque},
+ sync::aref::{ARef, AlwaysRefCounted},
+ types::{NotThreadSafe, Opaque},
};
use core::{ops::Deref, ptr::NonNull};
diff --git a/rust/kernel/mm/mmput_async.rs b/rust/kernel/mm/mmput_async.rs
index 9289e05f7a67..b8d2f051225c 100644
--- a/rust/kernel/mm/mmput_async.rs
+++ b/rust/kernel/mm/mmput_async.rs
@@ -10,7 +10,7 @@
use crate::{
bindings,
mm::MmWithUser,
- types::{ARef, AlwaysRefCounted},
+ sync::aref::{ARef, AlwaysRefCounted},
};
use core::{ops::Deref, ptr::NonNull};
diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h
index 01907b33537e..50c66ba9ada5 100644
--- a/tools/include/linux/atomic.h
+++ b/tools/include/linux/atomic.h
@@ -12,4 +12,26 @@ void atomic_long_set(atomic_long_t *v, long i);
#define atomic_cmpxchg_release atomic_cmpxchg
#endif /* atomic_cmpxchg_relaxed */
+static inline bool atomic_try_cmpxchg(atomic_t *ptr, int *oldp, int new)
+{
+ int ret, old = *oldp;
+
+ ret = atomic_cmpxchg(ptr, old, new);
+ if (ret != old)
+ *oldp = ret;
+ return ret == old;
+}
+
+static inline bool atomic_inc_unless_negative(atomic_t *v)
+{
+ int c = atomic_read(v);
+
+ do {
+ if (unlikely(c < 0))
+ return false;
+ } while (!atomic_try_cmpxchg(v, &c, c + 1));
+
+ return true;
+}
+
#endif /* __TOOLS_LINUX_ATOMIC_H */
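
A typical use of the newly imported helper: a count that is poisoned to a
negative value to fence off new users (simplified; real users pair this
with a matching decrement scheme):

	static atomic_t users = ATOMIC_INIT(0);

	static bool users_tryget(void)
	{
		/* fails once the counter has been driven negative */
		return atomic_inc_unless_negative(&users);
	}

	static void users_block_new(void)
	{
		atomic_set(&users, -1);	/* no new users from here on */
	}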
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 172700fb7784..05714c22994e 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -36327,13 +36327,18 @@ extern void test_kmem_cache_bulk(void);
static inline void check_spanning_store_height(struct maple_tree *mt)
{
int index = 0;
+ int last = 140;
MA_STATE(mas, mt, 0, 0);
mas_lock(&mas);
while (mt_height(mt) != 3) {
mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL);
mas_set(&mas, ++index);
}
- mas_set_range(&mas, 90, 140);
+
+ if (MAPLE_32BIT)
+ last = 155; /* 32 bit higher branching factor. */
+
+ mas_set_range(&mas, 90, last);
mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL);
MT_BUG_ON(mt, mas_mt_height(&mas) != 2);
mas_unlock(&mas);
@@ -36428,6 +36433,7 @@ static void check_nomem_writer_race(struct maple_tree *mt)
*/
static inline int check_vma_modification(struct maple_tree *mt)
{
+#if defined(CONFIG_64BIT)
MA_STATE(mas, mt, 0, 0);
mtree_lock(mt);
@@ -36451,6 +36457,8 @@ static inline int check_vma_modification(struct maple_tree *mt)
mas_destroy(&mas);
mtree_unlock(mt);
+#endif
+
return 0;
}
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 9a3499827d4b..2180c328a825 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -5,6 +5,7 @@ TEST_GEN_FILES += access_memory access_memory_even
TEST_FILES = _damon_sysfs.py
TEST_FILES += drgn_dump_damon_status.py
+TEST_FILES += _common.sh
# functionality tests
TEST_PROGS += sysfs.sh
@@ -18,6 +19,7 @@ TEST_PROGS += reclaim.sh lru_sort.sh
TEST_PROGS += sysfs_update_removed_scheme_dir.sh
TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py
TEST_PROGS += sysfs_memcg_path_leak.sh
+TEST_PROGS += sysfs_no_op_commit_break.py
EXTRA_CLEAN = __pycache__
diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c
index a9f4e9aaf3a9..93f3a71bcfd4 100644
--- a/tools/testing/selftests/damon/access_memory_even.c
+++ b/tools/testing/selftests/damon/access_memory_even.c
@@ -9,7 +9,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <time.h>
int main(int argc, char *argv[])
{
diff --git a/tools/testing/selftests/damon/sysfs_no_op_commit_break.py b/tools/testing/selftests/damon/sysfs_no_op_commit_break.py
new file mode 100755
index 000000000000..2c65cffe6b54
--- /dev/null
+++ b/tools/testing/selftests/damon/sysfs_no_op_commit_break.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import json
+import os
+import subprocess
+import sys
+
+import _damon_sysfs
+
+def dump_damon_status_dict(pid):
+ try:
+ subprocess.check_output(['which', 'drgn'], stderr=subprocess.DEVNULL)
+ except:
+ return None, 'drgn not found'
+ file_dir = os.path.dirname(os.path.abspath(__file__))
+ dump_script = os.path.join(file_dir, 'drgn_dump_damon_status.py')
+ rc = subprocess.call(['drgn', dump_script, pid, 'damon_dump_output'],
+ stderr=subprocess.DEVNULL)
+
+ if rc != 0:
+ return None, f'drgn fail: return code({rc})'
+ try:
+ with open('damon_dump_output', 'r') as f:
+ return json.load(f), None
+ except Exception as e:
+ return None, 'json.load fail (%s)' % e
+
+def main():
+ kdamonds = _damon_sysfs.Kdamonds(
+ [_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ schemes=[_damon_sysfs.Damos(
+ ops_filters=[
+ _damon_sysfs.DamosFilter(
+ type_='anon',
+ matching=True,
+ allow=True,
+ )
+ ]
+ )],
+ )])]
+ )
+
+ err = kdamonds.start()
+ if err is not None:
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ before_commit_status, err = \
+ dump_damon_status_dict(kdamonds.kdamonds[0].pid)
+ if err is not None:
+ print('before-commit status dump failed: %s' % err)
+ exit(1)
+
+ kdamonds.kdamonds[0].commit()
+
+ after_commit_status, err = \
+ dump_damon_status_dict(kdamonds.kdamonds[0].pid)
+ if err is not None:
+ print('after-commit status dump failed: %s' % err)
+ exit(1)
+
+ if before_commit_status != after_commit_status:
+ print(f'before: {json.dumps(before_commit_status, indent=2)}')
+ print(f'after: {json.dumps(after_commit_status, indent=2)}')
+ exit(1)
+
+ kdamonds.stop()
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c
index 8034e24c6bf6..6d9e91d55d68 100644
--- a/tools/testing/selftests/kho/init.c
+++ b/tools/testing/selftests/kho/init.c
@@ -1,22 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
-#ifndef NOLIBC
-#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
-#include <syscall.h>
+#include <sys/syscall.h>
#include <sys/mount.h>
#include <sys/reboot.h>
-#endif
+#include <linux/kexec.h>
/* from arch/x86/include/asm/setup.h */
#define COMMAND_LINE_SIZE 2048
-/* from include/linux/kexex.h */
-#define KEXEC_FILE_NO_INITRAMFS 0x00000004
-
-#define KHO_FINILIZE "/debugfs/kho/out/finalize"
+#define KHO_FINALIZE "/debugfs/kho/out/finalize"
#define KERNEL_IMAGE "/kernel"
static int mount_filesystems(void)
@@ -32,7 +27,7 @@ static int kho_enable(void)
const char enable[] = "1";
int fd;
- fd = open(KHO_FINILIZE, O_RDWR);
+ fd = open(KHO_FINALIZE, O_RDWR);
if (fd < 0)
return -1;
diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh
index ec70a17bd476..3f6c17166846 100755
--- a/tools/testing/selftests/kho/vmtest.sh
+++ b/tools/testing/selftests/kho/vmtest.sh
@@ -10,7 +10,6 @@ kernel_dir=$(realpath "$test_dir/../../../..")
tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX)
headers_dir="$tmp_dir/usr"
-initrd_dir="$tmp_dir/initrd"
initrd="$tmp_dir/initrd.cpio"
source "$test_dir/../kselftest/ktap_helpers.sh"
@@ -81,19 +80,22 @@ EOF
function mkinitrd() {
local kernel=$1
- mkdir -p "$initrd_dir"/{dev,debugfs,proc}
- sudo mknod "$initrd_dir/dev/console" c 5 1
-
- "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \
- -fno-asynchronous-unwind-tables -fno-ident -nostdlib \
- -include "$test_dir/../../../include/nolibc/nolibc.h" \
- -o "$initrd_dir/init" "$test_dir/init.c" \
-
- cp "$kernel" "$initrd_dir/kernel"
+ "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -nostdlib \
+ -fno-asynchronous-unwind-tables -fno-ident \
+ -I "$headers_dir/include" \
+ -I "$kernel_dir/tools/include/nolibc" \
+ -o "$tmp_dir/init" "$test_dir/init.c"
+
+ cat > "$tmp_dir/cpio_list" <<EOF
+dir /dev 0755 0 0
+dir /proc 0755 0 0
+dir /debugfs 0755 0 0
+nod /dev/console 0600 0 0 c 5 1
+file /init $tmp_dir/init 0755 0 0
+file /kernel $kernel 0644 0 0
+EOF
- pushd "$initrd_dir" &>/dev/null
- find . | cpio -H newc --create > "$initrd" 2>/dev/null
- popd &>/dev/null
+ "$build_dir/usr/gen_init_cpio" "$tmp_dir/cpio_list" > "$initrd"
}
function run_qemu() {
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index c3b6d2604b1e..661d31c4b558 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -92,6 +92,10 @@
#endif
#define __printf(a, b) __attribute__((format(printf, a, b)))
+#ifndef __maybe_unused
+#define __maybe_unused __attribute__((__unused__))
+#endif
+
/* counters */
struct ksft_count {
unsigned int ksft_pass;
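
Usage note: the attribute silences -Wunused diagnostics for definitions that
are only referenced conditionally, e.g.:

	static __maybe_unused void dump_debug_state(void)
	{
		/* only called from #ifdef DEBUG paths; no warning otherwise */
	}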
diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h
index b16986aa6442..02fd1393947a 100644
--- a/tools/testing/selftests/landlock/audit.h
+++ b/tools/testing/selftests/landlock/audit.h
@@ -20,14 +20,12 @@
#include <sys/time.h>
#include <unistd.h>
+#include "../kselftest.h"
+
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
-#ifndef __maybe_unused
-#define __maybe_unused __attribute__((__unused__))
-#endif
-
#define REGEX_LANDLOCK_PREFIX "^audit([0-9.:]\\+): domain=\\([0-9a-f]\\+\\)"
struct audit_filter {
diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 88a3c78f5d98..9acecae36f51 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -22,10 +22,6 @@
#define TMP_DIR "tmp"
-#ifndef __maybe_unused
-#define __maybe_unused __attribute__((__unused__))
-#endif
-
/* TEST_F_FORK() should not be used for new tests. */
#define TEST_F_FORK(fixture_name, test_name) TEST_F(fixture_name, test_name)
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index e7b23a8a05fe..c2a8586e51a1 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -58,3 +58,5 @@ pkey_sighandler_tests_32
pkey_sighandler_tests_64
guard-regions
merge
+prctl_thp_disable
+rmap
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index d13b3cef2a2b..5a1dee50b898 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -86,6 +86,7 @@ TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += pfnmap
TEST_GEN_FILES += process_madv
+TEST_GEN_FILES += prctl_thp_disable
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress
@@ -101,6 +102,7 @@ TEST_GEN_FILES += hugetlb_dio
TEST_GEN_FILES += droppable
TEST_GEN_FILES += guard-regions
TEST_GEN_FILES += merge
+TEST_GEN_FILES += rmap
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
@@ -228,6 +230,8 @@ $(OUTPUT)/ksm_tests: LDLIBS += -lnuma
$(OUTPUT)/migration: LDLIBS += -lnuma
+$(OUTPUT)/rmap: LDLIBS += -lnuma
+
local_config.mk local_config.h: check_config.sh
/bin/sh ./check_config.sh $(CC)
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index c744c603d688..6560c26f47d1 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -41,11 +41,6 @@ static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;
-static int sz2ord(size_t size)
-{
- return __builtin_ctzll(size / pagesize);
-}
-
static int detect_thp_sizes(size_t sizes[], int max)
{
int count = 0;
@@ -57,7 +52,7 @@ static int detect_thp_sizes(size_t sizes[], int max)
if (!pmdsize)
return 0;
- orders = 1UL << sz2ord(pmdsize);
+ orders = 1UL << sz2ord(pmdsize, pagesize);
orders |= thp_supported_orders();
for (i = 0; orders && count < max; i++) {
@@ -1216,8 +1211,8 @@ static void run_anon_test_case(struct test_case const *test_case)
size_t size = thpsizes[i];
struct thp_settings settings = *thp_current_settings();
- settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
- settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
+ settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_NEVER;
+ settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS;
thp_push_settings(&settings);
if (size == pmdsize) {
@@ -1868,7 +1863,7 @@ int main(int argc, char **argv)
if (pmdsize) {
/* Only if THP is supported. */
thp_read_settings(&default_settings);
- default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
+ default_settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_INHERIT;
thp_save_settings();
thp_push_settings(&default_settings);
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index c463d1c09c9b..2bd1dac75c3f 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -65,10 +65,20 @@ static void register_region_with_uffd(char *addr, size_t len)
struct uffdio_api uffdio_api;
/* Create and enable userfaultfd object. */
-
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
- if (uffd == -1)
- ksft_exit_fail_msg("userfaultfd: %s\n", strerror(errno));
+ if (uffd == -1) {
+ switch (errno) {
+ case EPERM:
+ ksft_exit_skip("Insufficient permissions, try running as root.\n");
+ break;
+ case ENOSYS:
+ ksft_exit_skip("userfaultfd is not supported/not enabled.\n");
+ break;
+ default:
+ ksft_exit_fail_msg("userfaultfd failed with %s\n", strerror(errno));
+ break;
+ }
+ }
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index d8bd1911dfc0..712f43c87736 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -38,14 +38,11 @@ enum ksm_merge_mode {
};
static int mem_fd;
-static int ksm_fd;
-static int ksm_full_scans_fd;
-static int proc_self_ksm_stat_fd;
-static int proc_self_ksm_merging_pages_fd;
-static int ksm_use_zero_pages_fd;
static int pagemap_fd;
static size_t pagesize;
+static void init_global_file_handles(void);
+
static bool range_maps_duplicates(char *addr, unsigned long size)
{
unsigned long offs_a, offs_b, pfn_a, pfn_b;
@@ -73,88 +70,6 @@ static bool range_maps_duplicates(char *addr, unsigned long size)
return false;
}
-static long get_my_ksm_zero_pages(void)
-{
- char buf[200];
- char *substr_ksm_zero;
- size_t value_pos;
- ssize_t read_size;
- unsigned long my_ksm_zero_pages;
-
- if (!proc_self_ksm_stat_fd)
- return 0;
-
- read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
- if (read_size < 0)
- return -errno;
-
- buf[read_size] = 0;
-
- substr_ksm_zero = strstr(buf, "ksm_zero_pages");
- if (!substr_ksm_zero)
- return 0;
-
- value_pos = strcspn(substr_ksm_zero, "0123456789");
- my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10);
-
- return my_ksm_zero_pages;
-}
-
-static long get_my_merging_pages(void)
-{
- char buf[10];
- ssize_t ret;
-
- if (proc_self_ksm_merging_pages_fd < 0)
- return proc_self_ksm_merging_pages_fd;
-
- ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0);
- if (ret <= 0)
- return -errno;
- buf[ret] = 0;
-
- return strtol(buf, NULL, 10);
-}
-
-static long ksm_get_full_scans(void)
-{
- char buf[10];
- ssize_t ret;
-
- ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
- if (ret <= 0)
- return -errno;
- buf[ret] = 0;
-
- return strtol(buf, NULL, 10);
-}
-
-static int ksm_merge(void)
-{
- long start_scans, end_scans;
-
- /* Wait for two full scans such that any possible merging happened. */
- start_scans = ksm_get_full_scans();
- if (start_scans < 0)
- return start_scans;
- if (write(ksm_fd, "1", 1) != 1)
- return -errno;
- do {
- end_scans = ksm_get_full_scans();
- if (end_scans < 0)
- return end_scans;
- } while (end_scans < start_scans + 2);
-
- return 0;
-}
-
-static int ksm_unmerge(void)
-{
- if (write(ksm_fd, "2", 1) != 1)
- return -errno;
- return 0;
-}
-
static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
enum ksm_merge_mode mode)
{
@@ -163,12 +78,12 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
int ret;
/* Stabilize accounting by disabling KSM completely. */
- if (ksm_unmerge()) {
+ if (ksm_stop() < 0) {
ksft_print_msg("Disabling (unmerging) KSM failed\n");
return err_map;
}
- if (get_my_merging_pages() > 0) {
+ if (ksm_get_self_merging_pages() > 0) {
ksft_print_msg("Still pages merged\n");
return err_map;
}
@@ -218,7 +133,7 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
}
/* Run KSM to trigger merging and wait. */
- if (ksm_merge()) {
+ if (ksm_start() < 0) {
ksft_print_msg("Running KSM failed\n");
goto unmap;
}
@@ -227,7 +142,7 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
* Check if anything was merged at all. Ignore the zero page that is
* accounted differently (depending on kernel support).
*/
- if (val && !get_my_merging_pages()) {
+ if (val && !ksm_get_self_merging_pages()) {
ksft_print_msg("No pages got merged\n");
goto unmap;
}
@@ -274,6 +189,7 @@ static void test_unmerge(void)
ksft_test_result(!range_maps_duplicates(map, size),
"Pages were unmerged\n");
unmap:
+ ksm_stop();
munmap(map, size);
}
@@ -286,15 +202,12 @@ static void test_unmerge_zero_pages(void)
ksft_print_msg("[RUN] %s\n", __func__);
- if (proc_self_ksm_stat_fd < 0) {
- ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n");
- return;
- }
- if (ksm_use_zero_pages_fd < 0) {
- ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+ if (ksm_get_self_zero_pages() < 0) {
+ ksft_test_result_skip("accessing \"/proc/self/ksm_stat\" failed\n");
return;
}
- if (write(ksm_use_zero_pages_fd, "1", 1) != 1) {
+
+ if (ksm_use_zero_pages() < 0) {
ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
return;
}
@@ -306,7 +219,7 @@ static void test_unmerge_zero_pages(void)
/* Check if ksm_zero_pages is updated correctly after KSM merging */
pages_expected = size / pagesize;
- if (pages_expected != get_my_ksm_zero_pages()) {
+ if (pages_expected != ksm_get_self_zero_pages()) {
ksft_test_result_fail("'ksm_zero_pages' updated after merging\n");
goto unmap;
}
@@ -319,7 +232,7 @@ static void test_unmerge_zero_pages(void)
/* Check if ksm_zero_pages is updated correctly after unmerging */
pages_expected /= 2;
- if (pages_expected != get_my_ksm_zero_pages()) {
+ if (pages_expected != ksm_get_self_zero_pages()) {
ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n");
goto unmap;
}
@@ -329,7 +242,7 @@ static void test_unmerge_zero_pages(void)
*((unsigned int *)&map[offs]) = offs;
/* Now we should have no zeropages remaining. */
- if (get_my_ksm_zero_pages()) {
+ if (ksm_get_self_zero_pages()) {
ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n");
goto unmap;
}
@@ -338,6 +251,7 @@ static void test_unmerge_zero_pages(void)
ksft_test_result(!range_maps_duplicates(map, size),
"KSM zero pages were unmerged\n");
unmap:
+ ksm_stop();
munmap(map, size);
}
@@ -366,6 +280,7 @@ static void test_unmerge_discarded(void)
ksft_test_result(!range_maps_duplicates(map, size),
"Pages were unmerged\n");
unmap:
+ ksm_stop();
munmap(map, size);
}
@@ -452,6 +367,7 @@ static void test_unmerge_uffd_wp(void)
close_uffd:
close(uffd);
unmap:
+ ksm_stop();
munmap(map, size);
}
#endif
@@ -506,27 +422,30 @@ static int test_child_ksm(void)
/* Test if KSM is enabled for the process. */
if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1)
- return -1;
+ return 1;
/* Test if merge could really happen. */
map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE);
if (map == MAP_MERGE_FAIL)
- return -2;
+ return 2;
else if (map == MAP_MERGE_SKIP)
- return -3;
+ return 3;
+ ksm_stop();
munmap(map, size);
return 0;
}
static void test_child_ksm_err(int status)
{
- if (status == -1)
+ if (status == 1)
ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
- else if (status == -2)
+ else if (status == 2)
ksft_test_result_fail("Merge in child failed\n");
- else if (status == -3)
+ else if (status == 3)
ksft_test_result_skip("Merge in child skipped\n");
+ else if (status == 4)
+ ksft_test_result_fail("Binary not found\n");
}
/* Verify that prctl ksm flag is inherited. */
@@ -548,6 +467,7 @@ static void test_prctl_fork(void)
child_pid = fork();
if (!child_pid) {
+ init_global_file_handles();
exit(test_child_ksm());
} else if (child_pid < 0) {
ksft_test_result_fail("fork() failed\n");
@@ -595,10 +515,10 @@ static void test_prctl_fork_exec(void)
return;
} else if (child_pid == 0) {
char *prg_name = "./ksm_functional_tests";
- char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME };
+ char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME, NULL };
execv(prg_name, argv_for_program);
- return;
+ exit(4);
}
if (waitpid(child_pid, &status, 0) > 0) {
@@ -644,6 +564,7 @@ static void test_prctl_unmerge(void)
ksft_test_result(!range_maps_duplicates(map, size),
"Pages were unmerged\n");
unmap:
+ ksm_stop();
munmap(map, size);
}
@@ -677,6 +598,7 @@ static void test_prot_none(void)
ksft_test_result(!range_maps_duplicates(map, size),
"Pages were unmerged\n");
unmap:
+ ksm_stop();
munmap(map, size);
}
@@ -685,19 +607,15 @@ static void init_global_file_handles(void)
mem_fd = open("/proc/self/mem", O_RDWR);
if (mem_fd < 0)
ksft_exit_fail_msg("opening /proc/self/mem failed\n");
- ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
- if (ksm_fd < 0)
- ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
- ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
- if (ksm_full_scans_fd < 0)
- ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
+ if (ksm_stop() < 0)
+ ksft_exit_skip("accessing \"/sys/kernel/mm/ksm/run\" failed\n");
+ if (ksm_get_full_scans() < 0)
+ ksft_exit_skip("accessing \"/sys/kernel/mm/ksm/full_scans\" failed\n");
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0)
ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
- proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
- proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages",
- O_RDONLY);
- ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);
+ if (ksm_get_self_merging_pages() < 0)
+ ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n");
}
int main(int argc, char **argv)
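
The refactor above swaps the file's private fds and parsers for shared KSM
helpers: ksm_start(), ksm_stop(), ksm_get_full_scans(),
ksm_get_self_merging_pages(), ksm_get_self_zero_pages() and
ksm_use_zero_pages(). Their implementations live in the common mm selftest
code, not in this diff; a sketch of what ksm_stop() plausibly does,
mirroring the removed ksm_unmerge():

	static int ksm_stop(void)
	{
		int fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
		ssize_t ret;

		if (fd < 0)
			return -errno;
		/* "2" unmerges all pages, as the removed ksm_unmerge() did */
		ret = write(fd, "2", 1);
		close(fd);
		return ret == 1 ? 0 : -1;
	}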
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 5bd52a951cbd..bf2863b102e3 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -846,7 +846,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp,
}
if (err != EFAULT) {
errno = err;
- perror("mrmeap() unexpected error");
+ perror("mremap() unexpected error");
success = false;
goto out_unmap;
}
@@ -899,7 +899,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp,
}
if (err != EFAULT) {
errno = err;
- perror("mrmeap() unexpected error");
+ perror("mremap() unexpected error");
success = false;
goto out_unmap;
}
@@ -948,7 +948,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp,
}
if (err != EFAULT) {
errno = err;
- perror("mrmeap() unexpected error");
+ perror("mremap() unexpected error");
success = false;
goto out_unmap;
}
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index e6face7c0166..4fc8e578ec7c 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -209,7 +209,7 @@ int userfaultfd_tests(void)
wp_addr_range(mem, mem_size);
vec_size = mem_size/page_size;
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
@@ -247,11 +247,11 @@ int sanity_tests_sd(void)
vec_size = num_pages/2;
mem_size = num_pages * page_size;
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
if (!vec)
ksft_exit_fail_msg("error nomem\n");
- vec2 = malloc(sizeof(struct page_region) * vec_size);
+ vec2 = calloc(vec_size, sizeof(struct page_region));
if (!vec2)
ksft_exit_fail_msg("error nomem\n");
@@ -436,7 +436,7 @@ int sanity_tests_sd(void)
mem_size = 1050 * page_size;
vec_size = mem_size/(page_size*2);
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
if (!vec)
ksft_exit_fail_msg("error nomem\n");
@@ -491,7 +491,7 @@ int sanity_tests_sd(void)
mem_size = 10000 * page_size;
vec_size = 50;
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
if (!vec)
ksft_exit_fail_msg("error nomem\n");
@@ -541,7 +541,7 @@ int sanity_tests_sd(void)
vec_size = 1000;
mem_size = vec_size * page_size;
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
if (!vec)
ksft_exit_fail_msg("error nomem\n");
@@ -695,8 +695,8 @@ int base_tests(char *prefix, char *mem, unsigned long long mem_size, int skip)
}
vec_size = mem_size/page_size;
- vec = malloc(sizeof(struct page_region) * vec_size);
- vec2 = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
+ vec2 = calloc(vec_size, sizeof(struct page_region));
/* 1. all new pages must be not be written (dirty) */
written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
@@ -807,8 +807,8 @@ int hpage_unit_tests(void)
unsigned long long vec_size = map_size/page_size;
struct page_region *vec, *vec2;
- vec = malloc(sizeof(struct page_region) * vec_size);
- vec2 = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
+ vec2 = calloc(vec_size, sizeof(struct page_region));
if (!vec || !vec2)
ksft_exit_fail_msg("malloc failed\n");
@@ -997,7 +997,7 @@ int unmapped_region_tests(void)
void *start = (void *)0x10000000;
int written, len = 0x00040000;
long vec_size = len / page_size;
- struct page_region *vec = malloc(sizeof(struct page_region) * vec_size);
+ struct page_region *vec = calloc(vec_size, sizeof(struct page_region));
/* 1. Get written pages */
written = pagemap_ioctl(start, len, vec, vec_size, 0, 0,
@@ -1062,7 +1062,7 @@ int sanity_tests(void)
mem_size = 10 * page_size;
vec_size = mem_size / page_size;
- vec = malloc(sizeof(struct page_region) * vec_size);
+ vec = calloc(vec_size, sizeof(struct page_region));
mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (mem == MAP_FAILED || vec == MAP_FAILED)
ksft_exit_fail_msg("error nomem\n");
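Editor's note: the malloc()-to-calloc() conversions above matter because the PAGEMAP_SCAN ioctl may fill fewer page_region entries than vec_size, so later checks can read entries the kernel never wrote. A minimal sketch of the difference; the struct layout here is illustrative only, not the real uapi definition:

#include <stdlib.h>

/* Illustrative layout only; the real struct page_region comes from the
 * kernel uapi headers. */
struct page_region { unsigned long long start, end, categories; };

int main(void)
{
	size_t vec_size = 64;
	/* malloc() leaves the array uninitialized; reading an entry the
	 * ioctl never filled is undefined behaviour. */
	struct page_region *vec = malloc(sizeof(struct page_region) * vec_size);
	/* calloc() zeroes every entry (and checks the size multiplication
	 * for overflow), so unfilled entries read back as zero. */
	struct page_region *vec2 = calloc(vec_size, sizeof(struct page_region));

	if (!vec || !vec2)
		return 1;
	free(vec);
	free(vec2);
	return 0;
}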
diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c
index 866ac023baf5..88659f0a90ea 100644
--- a/tools/testing/selftests/mm/pfnmap.c
+++ b/tools/testing/selftests/mm/pfnmap.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Basic VM_PFNMAP tests relying on mmap() of '/dev/mem'
+ * Basic VM_PFNMAP tests relying on mmap() of the file provided as
+ * input, defaulting to '/dev/mem'.
*
* Copyright 2025, Red Hat, Inc.
*
@@ -25,6 +26,7 @@
#include "vm_util.h"
static sigjmp_buf sigjmp_buf_env;
+static char *file = "/dev/mem";
static void signal_handler(int sig)
{
@@ -51,7 +53,7 @@ static int test_read_access(char *addr, size_t size, size_t pagesize)
return ret;
}
-static int find_ram_target(off_t *phys_addr,
+static int find_ram_target(off_t *offset,
unsigned long long pagesize)
{
unsigned long long start, end;
@@ -91,7 +93,7 @@ static int find_ram_target(off_t *phys_addr,
/* We need two pages. */
if (end > start + 2 * pagesize) {
fclose(file);
- *phys_addr = start;
+ *offset = start;
return 0;
}
}
@@ -100,7 +102,7 @@ static int find_ram_target(off_t *phys_addr,
FIXTURE(pfnmap)
{
- off_t phys_addr;
+ off_t offset;
size_t pagesize;
int dev_mem_fd;
char *addr1;
@@ -113,23 +115,31 @@ FIXTURE_SETUP(pfnmap)
{
self->pagesize = getpagesize();
- /* We'll require two physical pages throughout our tests ... */
- if (find_ram_target(&self->phys_addr, self->pagesize))
- SKIP(return, "Cannot find ram target in '/proc/iomem'\n");
+ if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) {
+ /* We'll require two physical pages throughout our tests ... */
+ if (find_ram_target(&self->offset, self->pagesize))
+ SKIP(return,
+ "Cannot find ram target in '/proc/iomem'\n");
+ } else {
+ self->offset = 0;
+ }
- self->dev_mem_fd = open("/dev/mem", O_RDONLY);
+ self->dev_mem_fd = open(file, O_RDONLY);
if (self->dev_mem_fd < 0)
- SKIP(return, "Cannot open '/dev/mem'\n");
+ SKIP(return, "Cannot open '%s'\n", file);
self->size1 = self->pagesize * 2;
self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED,
- self->dev_mem_fd, self->phys_addr);
+ self->dev_mem_fd, self->offset);
if (self->addr1 == MAP_FAILED)
- SKIP(return, "Cannot mmap '/dev/mem'\n");
+ SKIP(return, "Cannot mmap '%s'\n", file);
+
+ if (!check_vmflag_pfnmap(self->addr1))
+ SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file);
/* ... and want to be able to read from them. */
if (test_read_access(self->addr1, self->size1, self->pagesize))
- SKIP(return, "Cannot read-access mmap'ed '/dev/mem'\n");
+ SKIP(return, "Cannot read-access mmap'ed '%s'\n", file);
self->size2 = 0;
self->addr2 = MAP_FAILED;
@@ -182,7 +192,7 @@ TEST_F(pfnmap, munmap_split)
*/
self->size2 = self->pagesize;
self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED,
- self->dev_mem_fd, self->phys_addr);
+ self->dev_mem_fd, self->offset);
ASSERT_NE(self->addr2, MAP_FAILED);
}
@@ -246,4 +256,14 @@ TEST_F(pfnmap, fork)
ASSERT_EQ(ret, 0);
}
-TEST_HARNESS_MAIN
+int main(int argc, char **argv)
+{
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "--") == 0) {
+ if (i + 1 < argc && strlen(argv[i + 1]) > 0)
+ file = argv[i + 1];
+ return test_harness_run(i, argv);
+ }
+ }
+ return test_harness_run(argc, argv);
+}
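Editor's note: with the fixture now verifying the mapping via check_vmflag_pfnmap(), any file whose mmap() yields a VM_PFNMAP area can be passed after '--'. For reference, a minimal sketch of the default '/dev/mem' case, assuming a phys_addr already located in '/proc/iomem' the way find_ram_target() does (needs root, and a kernel whose /dev/mem restrictions permit the range); map_devmem_page() is a hypothetical helper name:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int map_devmem_page(off_t phys_addr, size_t pagesize)
{
	char *addr;
	int fd = open("/dev/mem", O_RDONLY);

	if (fd < 0)
		return -1;
	/* The offset into /dev/mem is the physical address itself. */
	addr = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, phys_addr);
	close(fd);
	if (addr == MAP_FAILED)
		return -1;
	/* ... read-access the mapping, as test_read_access() does ... */
	munmap(addr, pagesize);
	return 0;
}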
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
index ea404f80e6cb..fa15f006fa68 100644
--- a/tools/testing/selftests/mm/pkey-helpers.h
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -84,9 +84,6 @@ extern void abort_hooks(void);
#ifndef noinline
# define noinline __attribute__((noinline))
#endif
-#ifndef __maybe_unused
-# define __maybe_unused __attribute__((__unused__))
-#endif
int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
int sys_pkey_free(unsigned long pkey);
diff --git a/tools/testing/selftests/mm/prctl_thp_disable.c b/tools/testing/selftests/mm/prctl_thp_disable.c
new file mode 100644
index 000000000000..84b4a4b345af
--- /dev/null
+++ b/tools/testing/selftests/mm/prctl_thp_disable.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic tests for PR_GET/SET_THP_DISABLE prctl calls
+ *
+ * Author(s): Usama Arif <usamaarif642@gmail.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "thp_settings.h"
+#include "vm_util.h"
+
+#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
+#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
+#endif
+
+enum thp_collapse_type {
+ THP_COLLAPSE_NONE,
+ THP_COLLAPSE_MADV_NOHUGEPAGE,
+ THP_COLLAPSE_MADV_HUGEPAGE, /* MADV_HUGEPAGE before access */
+ THP_COLLAPSE_MADV_COLLAPSE, /* MADV_COLLAPSE after access */
+};
+
+/*
+ * Function to mmap a buffer, fault it in, madvise it appropriately (before
+ * page fault for MADV_HUGEPAGE, and after for MADV_COLLAPSE), and check if the
+ * mmap region is huge.
+ * Returns:
+ * 0 if the test does not result in a hugepage
+ * 1 if the test results in a hugepage
+ * -errno if mmap fails
+ */
+static int test_mmap_thp(enum thp_collapse_type madvise_buf, size_t pmdsize)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+ int ret;
+
+ /* For alignment purposes, we need twice the THP size. */
+ mmap_size = 2 * pmdsize;
+ mmap_mem = (char *)mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_mem == MAP_FAILED)
+ return -errno;
+
+ /* We need a THP-aligned memory area. */
+ mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
+
+ if (madvise_buf == THP_COLLAPSE_MADV_HUGEPAGE)
+ madvise(mem, pmdsize, MADV_HUGEPAGE);
+ else if (madvise_buf == THP_COLLAPSE_MADV_NOHUGEPAGE)
+ madvise(mem, pmdsize, MADV_NOHUGEPAGE);
+
+ /* Ensure memory is allocated */
+ memset(mem, 1, pmdsize);
+
+ if (madvise_buf == THP_COLLAPSE_MADV_COLLAPSE)
+ madvise(mem, pmdsize, MADV_COLLAPSE);
+
+ /* HACK: make sure we have a separate VMA that we can check reliably. */
+ mprotect(mem, pmdsize, PROT_READ);
+
+ ret = check_huge_anon(mem, 1, pmdsize);
+ munmap(mmap_mem, mmap_size);
+ return ret;
+}
+
+static void prctl_thp_disable_completely_test(struct __test_metadata *const _metadata,
+ size_t pmdsize,
+ enum thp_enabled thp_policy)
+{
+ ASSERT_EQ(prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL), 1);
+
+ /* tests after prctl overrides global policy */
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 0);
+
+ /* Reset to global policy */
+ ASSERT_EQ(prctl(PR_SET_THP_DISABLE, 0, NULL, NULL, NULL), 0);
+
+ /* tests after prctl is cleared, and only global policy is effective */
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize),
+ thp_policy == THP_ALWAYS ? 1 : 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize),
+ thp_policy == THP_NEVER ? 0 : 1);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1);
+}
+
+FIXTURE(prctl_thp_disable_completely)
+{
+ struct thp_settings settings;
+ size_t pmdsize;
+};
+
+FIXTURE_VARIANT(prctl_thp_disable_completely)
+{
+ enum thp_enabled thp_policy;
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, never)
+{
+ .thp_policy = THP_NEVER,
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, madvise)
+{
+ .thp_policy = THP_MADVISE,
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, always)
+{
+ .thp_policy = THP_ALWAYS,
+};
+
+FIXTURE_SETUP(prctl_thp_disable_completely)
+{
+ if (!thp_available())
+ SKIP(return, "Transparent Hugepages not available\n");
+
+ self->pmdsize = read_pmd_pagesize();
+ if (!self->pmdsize)
+ SKIP(return, "Unable to read PMD size\n");
+
+ if (prctl(PR_SET_THP_DISABLE, 1, NULL, NULL, NULL))
+ SKIP(return, "Unable to disable THPs completely for the process\n");
+
+ thp_save_settings();
+ thp_read_settings(&self->settings);
+ self->settings.thp_enabled = variant->thp_policy;
+ self->settings.hugepages[sz2ord(self->pmdsize, getpagesize())].enabled = THP_INHERIT;
+ thp_write_settings(&self->settings);
+}
+
+FIXTURE_TEARDOWN(prctl_thp_disable_completely)
+{
+ thp_restore_settings();
+}
+
+TEST_F(prctl_thp_disable_completely, nofork)
+{
+ prctl_thp_disable_completely_test(_metadata, self->pmdsize, variant->thp_policy);
+}
+
+TEST_F(prctl_thp_disable_completely, fork)
+{
+ int ret = 0;
+ pid_t pid;
+
+ /* Make sure prctl changes are carried across fork */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (!pid) {
+ prctl_thp_disable_completely_test(_metadata, self->pmdsize, variant->thp_policy);
+ return;
+ }
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+ ASSERT_EQ(ret, 0);
+}
+
+static void prctl_thp_disable_except_madvise_test(struct __test_metadata *const _metadata,
+ size_t pmdsize,
+ enum thp_enabled thp_policy)
+{
+ ASSERT_EQ(prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL), 3);
+
+ /* tests after prctl overrides global policy */
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize),
+ thp_policy == THP_NEVER ? 0 : 1);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1);
+
+ /* Reset to global policy */
+ ASSERT_EQ(prctl(PR_SET_THP_DISABLE, 0, NULL, NULL, NULL), 0);
+
+ /* tests after prctl is cleared, and only global policy is effective */
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize),
+ thp_policy == THP_ALWAYS ? 1 : 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize),
+ thp_policy == THP_NEVER ? 0 : 1);
+
+ ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1);
+}
+
+FIXTURE(prctl_thp_disable_except_madvise)
+{
+ struct thp_settings settings;
+ size_t pmdsize;
+};
+
+FIXTURE_VARIANT(prctl_thp_disable_except_madvise)
+{
+ enum thp_enabled thp_policy;
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, never)
+{
+ .thp_policy = THP_NEVER,
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, madvise)
+{
+ .thp_policy = THP_MADVISE,
+};
+
+FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, always)
+{
+ .thp_policy = THP_ALWAYS,
+};
+
+FIXTURE_SETUP(prctl_thp_disable_except_madvise)
+{
+ if (!thp_available())
+ SKIP(return, "Transparent Hugepages not available\n");
+
+ self->pmdsize = read_pmd_pagesize();
+ if (!self->pmdsize)
+ SKIP(return, "Unable to read PMD size\n");
+
+ if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, NULL, NULL))
+ SKIP(return, "Unable to set PR_THP_DISABLE_EXCEPT_ADVISED\n");
+
+ thp_save_settings();
+ thp_read_settings(&self->settings);
+ self->settings.thp_enabled = variant->thp_policy;
+ self->settings.hugepages[sz2ord(self->pmdsize, getpagesize())].enabled = THP_INHERIT;
+ thp_write_settings(&self->settings);
+}
+
+FIXTURE_TEARDOWN(prctl_thp_disable_except_madvise)
+{
+ thp_restore_settings();
+}
+
+TEST_F(prctl_thp_disable_except_madvise, nofork)
+{
+ prctl_thp_disable_except_madvise_test(_metadata, self->pmdsize, variant->thp_policy);
+}
+
+TEST_F(prctl_thp_disable_except_madvise, fork)
+{
+ int ret = 0;
+ pid_t pid;
+
+ /* Make sure prctl changes are carried across fork */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (!pid) {
+ prctl_thp_disable_except_madvise_test(_metadata, self->pmdsize,
+ variant->thp_policy);
+ return;
+ }
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
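Editor's note: the fixtures above exercise the three process-wide states PR_SET_THP_DISABLE can express. A standalone sketch, assuming a kernel that understands PR_THP_DISABLE_EXCEPT_ADVISED (on older kernels the second prctl() fails with EINVAL):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#endif

int main(void)
{
	/* Disable THPs completely; PR_GET_THP_DISABLE reports 1. */
	prctl(PR_SET_THP_DISABLE, 1, NULL, NULL, NULL);
	printf("completely: %d\n", prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL));

	/* Disable THPs except where madvise() asked for them;
	 * PR_GET_THP_DISABLE reports 1 | PR_THP_DISABLE_EXCEPT_ADVISED,
	 * i.e. 3, matching the ASSERT_EQ(..., 3) in the test above. */
	prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, NULL, NULL);
	printf("except-advised: %d\n", prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL));

	/* Fall back to the global sysfs policy. */
	prctl(PR_SET_THP_DISABLE, 0, NULL, NULL, NULL);
	return 0;
}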
diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm/rmap.c
new file mode 100644
index 000000000000..13f7bccfd0a9
--- /dev/null
+++ b/tools/testing/selftests/mm/rmap.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RMAP functional tests
+ *
+ * Author(s): Wei Yang <richard.weiyang@gmail.com>
+ */
+
+#include "../kselftest_harness.h"
+#include <strings.h>
+#include <pthread.h>
+#include <numa.h>
+#include <numaif.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/sem.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "vm_util.h"
+
+#define TOTAL_LEVEL 5
+#define MAX_CHILDREN 3
+
+#define FAIL_ON_CHECK (1 << 0)
+#define FAIL_ON_WORK (1 << 1)
+
+struct sembuf sem_wait = {0, -1, 0};
+struct sembuf sem_signal = {0, 1, 0};
+
+enum backend_type {
+ ANON,
+ SHM,
+ NORM_FILE,
+};
+
+#define PREFIX "kst_rmap"
+#define MAX_FILENAME_LEN 256
+const char *suffixes[] = {
+ "",
+ "_shm",
+ "_file",
+};
+
+struct global_data;
+typedef int (*work_fn)(struct global_data *data);
+typedef int (*check_fn)(struct global_data *data);
+typedef void (*prepare_fn)(struct global_data *data);
+
+struct global_data {
+ int worker_level;
+
+ int semid;
+ int pipefd[2];
+
+ unsigned int mapsize;
+ unsigned int rand_seed;
+ char *region;
+
+ prepare_fn do_prepare;
+ work_fn do_work;
+ check_fn do_check;
+
+ enum backend_type backend;
+ char filename[MAX_FILENAME_LEN];
+
+ unsigned long *expected_pfn;
+};
+
+/*
+ * Create a process tree with TOTAL_LEVEL height and at most MAX_CHILDREN
+ * children for each.
+ *
+ * It randomly selects one process as the 'worker' process, which runs
+ * 'do_work' once all processes have been created. All other processes
+ * wait until the 'worker' finishes its work.
+ */
+void propagate_children(struct __test_metadata *_metadata, struct global_data *data)
+{
+ pid_t root_pid, pid;
+ unsigned int num_child;
+ int status;
+ int ret = 0;
+ int curr_child, worker_child;
+ int curr_level = 1;
+ bool is_worker = true;
+
+ root_pid = getpid();
+repeat:
+ num_child = rand_r(&data->rand_seed) % MAX_CHILDREN + 1;
+ worker_child = is_worker ? rand_r(&data->rand_seed) % num_child : -1;
+
+ for (curr_child = 0; curr_child < num_child; curr_child++) {
+ pid = fork();
+
+ if (pid < 0) {
+ perror("Error: fork\n");
+ } else if (pid == 0) {
+ curr_level++;
+
+ if (curr_child != worker_child)
+ is_worker = false;
+
+ if (curr_level == TOTAL_LEVEL)
+ break;
+
+ data->rand_seed += curr_child;
+ goto repeat;
+ }
+ }
+
+ if (data->do_prepare)
+ data->do_prepare(data);
+
+ close(data->pipefd[1]);
+
+ if (is_worker && curr_level == data->worker_level) {
+ /* This is the worker process; first wait for the last process to be created */
+ char buf;
+
+ while (read(data->pipefd[0], &buf, 1) > 0)
+ ;
+
+ if (data->do_work)
+ ret = data->do_work(data);
+
+ /* Kick others */
+ semctl(data->semid, 0, IPC_RMID);
+ } else {
+ /* Wait for the worker to finish */
+ semop(data->semid, &sem_wait, 1);
+ if (data->do_check)
+ ret = data->do_check(data);
+ }
+
+ /* Wait for all children to exit */
+ while (wait(&status) > 0) {
+ if (WIFEXITED(status))
+ ret |= WEXITSTATUS(status);
+ }
+
+ if (getpid() == root_pid) {
+ if (ret & FAIL_ON_WORK)
+ SKIP(return, "Failed in worker");
+
+ ASSERT_EQ(ret, 0);
+ } else {
+ exit(ret);
+ }
+}
+
+FIXTURE(migrate)
+{
+ struct global_data data;
+};
+
+FIXTURE_SETUP(migrate)
+{
+ struct global_data *data = &self->data;
+
+ if (numa_available() < 0)
+ SKIP(return, "NUMA not available");
+ if (numa_bitmask_weight(numa_all_nodes_ptr) <= 1)
+ SKIP(return, "Not enough NUMA nodes available");
+
+ data->mapsize = getpagesize();
+
+ data->expected_pfn = mmap(0, sizeof(unsigned long),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(data->expected_pfn, MAP_FAILED);
+
+ /* Prepare semaphore */
+ data->semid = semget(IPC_PRIVATE, 1, 0666 | IPC_CREAT);
+ ASSERT_NE(data->semid, -1);
+ ASSERT_NE(semctl(data->semid, 0, SETVAL, 0), -1);
+
+ /* Prepare pipe */
+ ASSERT_NE(pipe(data->pipefd), -1);
+
+ data->rand_seed = time(NULL);
+ srand(data->rand_seed);
+
+ data->worker_level = rand() % TOTAL_LEVEL + 1;
+
+ data->do_prepare = NULL;
+ data->do_work = NULL;
+ data->do_check = NULL;
+
+ data->backend = ANON;
+};
+
+FIXTURE_TEARDOWN(migrate)
+{
+ struct global_data *data = &self->data;
+
+ if (data->region != MAP_FAILED)
+ munmap(data->region, data->mapsize);
+ data->region = MAP_FAILED;
+ if (data->expected_pfn != MAP_FAILED)
+ munmap(data->expected_pfn, sizeof(unsigned long));
+ data->expected_pfn = MAP_FAILED;
+ semctl(data->semid, 0, IPC_RMID);
+ data->semid = -1;
+
+ close(data->pipefd[0]);
+
+ switch (data->backend) {
+ case ANON:
+ break;
+ case SHM:
+ shm_unlink(data->filename);
+ break;
+ case NORM_FILE:
+ unlink(data->filename);
+ break;
+ }
+}
+
+void access_region(struct global_data *data)
+{
+ /*
+ * Force read "region" to make sure page fault in.
+ */
+ FORCE_READ(*data->region);
+}
+
+int try_to_move_page(char *region)
+{
+ int ret;
+ int node;
+ int status = 0;
+ int failures = 0;
+
+ ret = move_pages(0, 1, (void **)&region, NULL, &status, MPOL_MF_MOVE_ALL);
+ if (ret != 0) {
+ perror("Failed to get original numa");
+ return FAIL_ON_WORK;
+ }
+
+ /* Pick up a different target node */
+ for (node = 0; node <= numa_max_node(); node++) {
+ if (numa_bitmask_isbitset(numa_all_nodes_ptr, node) && node != status)
+ break;
+ }
+
+ if (node > numa_max_node()) {
+ ksft_print_msg("Couldn't find available numa node for testing\n");
+ return FAIL_ON_WORK;
+ }
+
+ while (1) {
+ ret = move_pages(0, 1, (void **)&region, &node, &status, MPOL_MF_MOVE_ALL);
+
+ /* migrated successfully */
+ if (!ret)
+ break;
+
+ /* error happened */
+ if (ret < 0) {
+ ksft_perror("Failed to move pages");
+ return FAIL_ON_WORK;
+ }
+
+ /* migration is best effort; try again */
+ if (++failures >= 100)
+ return FAIL_ON_WORK;
+ }
+
+ return 0;
+}
+
+int move_region(struct global_data *data)
+{
+ int ret;
+ int pagemap_fd;
+
+ ret = try_to_move_page(data->region);
+ if (ret != 0)
+ return ret;
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd == -1)
+ return FAIL_ON_WORK;
+ *data->expected_pfn = pagemap_get_pfn(pagemap_fd, data->region);
+
+ return 0;
+}
+
+int has_same_pfn(struct global_data *data)
+{
+ unsigned long pfn;
+ int pagemap_fd;
+
+ if (data->region == MAP_FAILED)
+ return 0;
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd == -1)
+ return FAIL_ON_CHECK;
+
+ pfn = pagemap_get_pfn(pagemap_fd, data->region);
+ if (pfn != *data->expected_pfn)
+ return FAIL_ON_CHECK;
+
+ return 0;
+}
+
+TEST_F(migrate, anon)
+{
+ struct global_data *data = &self->data;
+
+ /* Map an area and fault in */
+ data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(data->region, MAP_FAILED);
+ memset(data->region, 0xcf, data->mapsize);
+
+ data->do_prepare = access_region;
+ data->do_work = move_region;
+ data->do_check = has_same_pfn;
+
+ propagate_children(_metadata, data);
+}
+
+TEST_F(migrate, shm)
+{
+ int shm_fd;
+ struct global_data *data = &self->data;
+
+ snprintf(data->filename, MAX_FILENAME_LEN, "%s%s", PREFIX, suffixes[SHM]);
+ shm_fd = shm_open(data->filename, O_CREAT | O_RDWR, 0666);
+ ASSERT_NE(shm_fd, -1);
+ ftruncate(shm_fd, data->mapsize);
+ data->backend = SHM;
+
+ /* Map a shared area and fault in */
+ data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, shm_fd, 0);
+ ASSERT_NE(data->region, MAP_FAILED);
+ memset(data->region, 0xcf, data->mapsize);
+ close(shm_fd);
+
+ data->do_prepare = access_region;
+ data->do_work = move_region;
+ data->do_check = has_same_pfn;
+
+ propagate_children(_metadata, data);
+}
+
+TEST_F(migrate, file)
+{
+ int fd;
+ struct global_data *data = &self->data;
+
+ snprintf(data->filename, MAX_FILENAME_LEN, "%s%s", PREFIX, suffixes[NORM_FILE]);
+ fd = open(data->filename, O_CREAT | O_RDWR | O_EXCL, 0666);
+ ASSERT_NE(fd, -1);
+ ftruncate(fd, data->mapsize);
+ data->backend = NORM_FILE;
+
+ /* Map a shared area and fault in */
+ data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ ASSERT_NE(data->region, MAP_FAILED);
+ memset(data->region, 0xcf, data->mapsize);
+ close(fd);
+
+ data->do_prepare = access_region;
+ data->do_work = move_region;
+ data->do_check = has_same_pfn;
+
+ propagate_children(_metadata, data);
+}
+
+void prepare_local_region(struct global_data *data)
+{
+ /* Allocate range and set the same data */
+ data->region = mmap(NULL, data->mapsize, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (data->region == MAP_FAILED)
+ return;
+
+ memset(data->region, 0xcf, data->mapsize);
+}
+
+int merge_and_migrate(struct global_data *data)
+{
+ int pagemap_fd;
+ int ret = 0;
+
+ if (data->region == MAP_FAILED)
+ return FAIL_ON_WORK;
+
+ if (ksm_start() < 0)
+ return FAIL_ON_WORK;
+
+ ret = try_to_move_page(data->region);
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd == -1)
+ return FAIL_ON_WORK;
+ *data->expected_pfn = pagemap_get_pfn(pagemap_fd, data->region);
+
+ return ret;
+}
+
+TEST_F(migrate, ksm)
+{
+ int ret;
+ struct global_data *data = &self->data;
+
+ if (ksm_stop() < 0)
+ SKIP(return, "accessing \"/sys/kernel/mm/ksm/run\") failed");
+ if (ksm_get_full_scans() < 0)
+ SKIP(return, "accessing \"/sys/kernel/mm/ksm/full_scan\") failed");
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL)
+ SKIP(return, "PR_SET_MEMORY_MERGE not supported");
+ else if (ret)
+ ksft_exit_fail_perror("PR_SET_MEMORY_MERGE=1 failed");
+
+ data->do_prepare = prepare_local_region;
+ data->do_work = merge_and_migrate;
+ data->do_check = has_same_pfn;
+
+ propagate_children(_metadata, data);
+}
+
+TEST_HARNESS_MAIN
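Editor's note: move_region() and has_same_pfn() both rely on pagemap_get_pfn() from vm_util.c. A sketch of what that lookup boils down to, assuming CAP_SYS_ADMIN (without it the kernel hides real PFNs); pfn_of() is a hypothetical name:

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

#define PM_PFRAME_MASK	((1ULL << 55) - 1)	/* bits 0-54 hold the PFN */
#define PM_PRESENT	(1ULL << 63)

static unsigned long pfn_of(int pagemap_fd, void *vaddr, size_t pagesize)
{
	uint64_t ent;
	/* One 64-bit entry per virtual page, indexed by vaddr / pagesize. */
	off_t off = ((uintptr_t)vaddr / pagesize) * sizeof(ent);

	if (pread(pagemap_fd, &ent, sizeof(ent), off) != sizeof(ent))
		return -1UL;
	if (!(ent & PM_PRESENT))
		return -1UL;	/* not present, no PFN to compare */
	return ent & PM_PFRAME_MASK;
}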
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 471e539d82b8..9e88cc25b9df 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -85,6 +85,8 @@ separated by spaces:
test handling of page fragment allocation and freeing
- vma_merge
test VMA merge cases behave as expected
+- rmap
+ test rmap behaves as expected
example: ./run_vmtests.sh -t "hmm mmap ksm"
EOF
@@ -322,11 +324,15 @@ CATEGORY="gup_test" run_test ./gup_longterm
CATEGORY="userfaultfd" run_test ./uffd-unit-tests
uffd_stress_bin=./uffd-stress
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16
-# Hugetlb tests require source and destination huge pages. Pass in half
-# the size of the free pages we have, which is used for *each*.
+# Hugetlb tests require source and destination huge pages. Pass in almost half
+# the size of the free pages we have, which is used for *each*. An adjustment
+# of (nr_parallel - 1) is made (see nr_parallel in uffd-stress.c) to leave some
+# extra hugepages, preventing the test from failing when it racily reserves
+# more hugepages than strictly required.
# uffd-stress expects a region expressed in MiB, so we adjust
# half_ufd_size_MB accordingly.
-half_ufd_size_MB=$(((freepgs * hpgsize_KB) / 1024 / 2))
+adjustment=$(( (31 < (nr_cpus - 1)) ? 31 : (nr_cpus - 1) ))
+half_ufd_size_MB=$((((freepgs - adjustment) * hpgsize_KB) / 1024 / 2))
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16
@@ -532,6 +538,8 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh aligned
CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
+CATEGORY="rmap" run_test ./rmap
+
echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix
echo "1..${count_total}" | tap_output
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 44a3f8a58806..10ae65ea032f 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -25,6 +25,8 @@
uint64_t pagesize;
unsigned int pageshift;
uint64_t pmd_pagesize;
+unsigned int pmd_order;
+int *expected_orders;
#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
#define SMAP_PATH "/proc/self/smaps"
@@ -34,28 +36,226 @@ uint64_t pmd_pagesize;
#define PID_FMT_OFFSET "%d,0x%lx,0x%lx,%d,%d"
#define PATH_FMT "%s,0x%lx,0x%lx,%d"
-#define PFN_MASK ((1UL<<55)-1)
-#define KPF_THP (1UL<<22)
+const char *pagemap_proc = "/proc/self/pagemap";
+const char *kpageflags_proc = "/proc/kpageflags";
+int pagemap_fd;
+int kpageflags_fd;
-int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd,
+ int kpageflags_fd)
{
- uint64_t paddr;
- uint64_t page_flags;
+ const unsigned long nr_pages = 1UL << order;
+ unsigned long pfn_head;
+ uint64_t pfn_flags;
+ unsigned long pfn;
+ unsigned long i;
- if (pagemap_file) {
- pread(pagemap_file, &paddr, sizeof(paddr),
- ((long)vaddr >> pageshift) * sizeof(paddr));
+ pfn = pagemap_get_pfn(pagemap_fd, vaddr);
- if (kpageflags_file) {
- pread(kpageflags_file, &page_flags, sizeof(page_flags),
- (paddr & PFN_MASK) * sizeof(page_flags));
+ /* non-present page */
+ if (pfn == -1UL)
+ return false;
+
+ if (pageflags_get(pfn, kpageflags_fd, &pfn_flags))
+ goto fail;
+
+ /* check for order-0 pages */
+ if (!order) {
+ if (pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL))
+ return false;
+ return true;
+ }
+
+ /* non-THP folio */
+ if (!(pfn_flags & KPF_THP))
+ return false;
+
+ pfn_head = pfn & ~(nr_pages - 1);
+
+ if (pageflags_get(pfn_head, kpageflags_fd, &pfn_flags))
+ goto fail;
+
+ /* head PFN has no compound_head flag set */
+ if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD)))
+ return false;
+
+ /* check all tail PFN flags */
+ for (i = 1; i < nr_pages; i++) {
+ if (pageflags_get(pfn_head + i, kpageflags_fd, &pfn_flags))
+ goto fail;
+ if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL)))
+ return false;
+ }
+
+ /*
+ * check the PFN after this folio, but if its flags cannot be obtained,
+ * assume this folio has the expected order
+ */
+ if (pageflags_get(pfn_head + nr_pages, kpageflags_fd, &pfn_flags))
+ return true;
+
+ /* this folio is bigger than the given order */
+ if (pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL))
+ return false;
+
+ return true;
+fail:
+ ksft_exit_fail_msg("Failed to get folio info\n");
+ return false;
+}
+
+static int vaddr_pageflags_get(char *vaddr, int pagemap_fd, int kpageflags_fd,
+ uint64_t *flags)
+{
+ unsigned long pfn;
+
+ pfn = pagemap_get_pfn(pagemap_fd, vaddr);
+
+ /* non-present PFN */
+ if (pfn == -1UL)
+ return 1;
+
+ if (pageflags_get(pfn, kpageflags_fd, flags))
+ return -1;
- return !!(page_flags & KPF_THP);
+ return 0;
+}
+
+/*
+ * gather_after_split_folio_orders - scan through [vaddr_start, vaddr_start + len) and record
+ * folio orders
+ *
+ * @vaddr_start: start vaddr
+ * @len: range length
+ * @pagemap_fd: file descriptor to /proc/<pid>/pagemap
+ * @kpageflags_fd: file descriptor to /proc/kpageflags
+ * @orders: output folio order array
+ * @nr_orders: folio order array size
+ *
+ * gather_after_split_folio_orders() scans through [vaddr_start, vaddr_start + len),
+ * checks all folios within the range, and records their orders. All order-0
+ * pages are recorded. Non-present vaddrs are skipped.
+ *
+ * NOTE: the function is used to check folio orders after a split is performed,
+ * so it assumes [vaddr_start, vaddr_start + len) fully maps to after-split folios within that
+ * range.
+ *
+ * Return: 0 - no error, -1 - unhandled cases
+ */
+static int gather_after_split_folio_orders(char *vaddr_start, size_t len,
+ int pagemap_fd, int kpageflags_fd, int orders[], int nr_orders)
+{
+ uint64_t page_flags = 0;
+ int cur_order = -1;
+ char *vaddr;
+
+ if (pagemap_fd == -1 || kpageflags_fd == -1)
+ return -1;
+ if (!orders)
+ return -1;
+ if (nr_orders <= 0)
+ return -1;
+
+ for (vaddr = vaddr_start; vaddr < vaddr_start + len;) {
+ char *next_folio_vaddr;
+ int status;
+
+ status = vaddr_pageflags_get(vaddr, pagemap_fd, kpageflags_fd,
+ &page_flags);
+ if (status < 0)
+ return -1;
+
+ /* skip non-present vaddr */
+ if (status == 1) {
+ vaddr += psize();
+ continue;
+ }
+
+ /* all order-0 pages, with possible false positives (non-folio pages) */
+ if (!(page_flags & (KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL))) {
+ orders[0]++;
+ vaddr += psize();
+ continue;
+ }
+
+ /* skip non-THP compound pages */
+ if (!(page_flags & KPF_THP)) {
+ vaddr += psize();
+ continue;
+ }
+
+ /* vaddr points to part of a THP at this point */
+ if (page_flags & KPF_COMPOUND_HEAD)
+ cur_order = 1;
+ else {
+ vaddr += psize();
+ continue;
+ }
+
+ next_folio_vaddr = vaddr + (1UL << (cur_order + pshift()));
+
+ if (next_folio_vaddr >= vaddr_start + len)
+ break;
+
+ while ((status = vaddr_pageflags_get(next_folio_vaddr,
+ pagemap_fd, kpageflags_fd,
+ &page_flags)) >= 0) {
+ /*
+ * non-present vaddr, next compound head page, or
+ * order-0 page
+ */
+ if (status == 1 ||
+ (page_flags & KPF_COMPOUND_HEAD) ||
+ !(page_flags & (KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL))) {
+ if (cur_order < nr_orders) {
+ orders[cur_order]++;
+ cur_order = -1;
+ vaddr = next_folio_vaddr;
+ }
+ break;
+ }
+
+ cur_order++;
+ next_folio_vaddr = vaddr + (1UL << (cur_order + pshift()));
}
+
+ if (status < 0)
+ return status;
}
+ if (cur_order > 0 && cur_order < nr_orders)
+ orders[cur_order]++;
return 0;
}
+static int check_after_split_folio_orders(char *vaddr_start, size_t len,
+ int pagemap_fd, int kpageflags_fd, int orders[], int nr_orders)
+{
+ int *vaddr_orders;
+ int status;
+ int i;
+
+ vaddr_orders = (int *)malloc(sizeof(int) * nr_orders);
+
+ if (!vaddr_orders)
+ ksft_exit_fail_msg("Cannot allocate memory for vaddr_orders");
+
+ memset(vaddr_orders, 0, sizeof(int) * nr_orders);
+ status = gather_after_split_folio_orders(vaddr_start, len, pagemap_fd,
+ kpageflags_fd, vaddr_orders, nr_orders);
+ if (status)
+ ksft_exit_fail_msg("gather folio info failed\n");
+
+ for (i = 0; i < nr_orders; i++)
+ if (vaddr_orders[i] != orders[i]) {
+ ksft_print_msg("order %d: expected: %d got %d\n", i,
+ orders[i], vaddr_orders[i]);
+ status = -1;
+ }
+
+ free(vaddr_orders);
+ return status;
+}
+
static void write_file(const char *path, const char *buf, size_t buflen)
{
int fd;
@@ -111,7 +311,7 @@ static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hp
unsigned long rss_anon_before, rss_anon_after;
size_t i;
- if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ if (!check_huge_anon(one_page, nr_hpages, pmd_pagesize))
ksft_exit_fail_msg("No THP is allocated\n");
rss_anon_before = rss_anon();
@@ -135,7 +335,7 @@ static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hp
rss_anon_before, rss_anon_after);
}
-void split_pmd_zero_pages(void)
+static void split_pmd_zero_pages(void)
{
char *one_page;
int nr_hpages = 4;
@@ -147,7 +347,7 @@ void split_pmd_zero_pages(void)
free(one_page);
}
-void split_pmd_thp_to_order(int order)
+static void split_pmd_thp_to_order(int order)
{
char *one_page;
size_t len = 4 * pmd_pagesize;
@@ -173,6 +373,13 @@ void split_pmd_thp_to_order(int order)
if (one_page[i] != (char)i)
ksft_exit_fail_msg("%ld byte corrupted\n", i);
+ memset(expected_orders, 0, sizeof(int) * (pmd_order + 1));
+ expected_orders[order] = 4 << (pmd_order - order);
+
+ if (check_after_split_folio_orders(one_page, len, pagemap_fd,
+ kpageflags_fd, expected_orders,
+ (pmd_order + 1)))
+ ksft_exit_fail_msg("Unexpected THP split\n");
if (!check_huge_anon(one_page, 0, pmd_pagesize))
ksft_exit_fail_msg("Still AnonHugePages not split\n");
@@ -181,28 +388,12 @@ void split_pmd_thp_to_order(int order)
free(one_page);
}
-void split_pte_mapped_thp(void)
+static void split_pte_mapped_thp(void)
{
char *one_page, *pte_mapped, *pte_mapped2;
size_t len = 4 * pmd_pagesize;
uint64_t thp_size;
size_t i;
- const char *pagemap_template = "/proc/%d/pagemap";
- const char *kpageflags_proc = "/proc/kpageflags";
- char pagemap_proc[255];
- int pagemap_fd;
- int kpageflags_fd;
-
- if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0)
- ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno));
-
- pagemap_fd = open(pagemap_proc, O_RDONLY);
- if (pagemap_fd == -1)
- ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno));
-
- kpageflags_fd = open(kpageflags_proc, O_RDONLY);
- if (kpageflags_fd == -1)
- ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno));
one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
@@ -234,7 +425,7 @@ void split_pte_mapped_thp(void)
thp_size = 0;
for (i = 0; i < pagesize * 4; i++)
if (i % pagesize == 0 &&
- is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+ is_backed_by_folio(&pte_mapped[i], pmd_order, pagemap_fd, kpageflags_fd))
thp_size++;
if (thp_size != 4)
@@ -251,7 +442,7 @@ void split_pte_mapped_thp(void)
ksft_exit_fail_msg("%ld byte corrupted\n", i);
if (i % pagesize == 0 &&
- is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+ !is_backed_by_folio(&pte_mapped[i], 0, pagemap_fd, kpageflags_fd))
thp_size++;
}
@@ -260,11 +451,9 @@ void split_pte_mapped_thp(void)
ksft_test_result_pass("Split PTE-mapped huge pages successful\n");
munmap(one_page, len);
- close(pagemap_fd);
- close(kpageflags_fd);
}
-void split_file_backed_thp(int order)
+static void split_file_backed_thp(int order)
{
int status;
int fd;
@@ -366,7 +555,7 @@ out:
ksft_exit_fail_msg("Error occurred\n");
}
-bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
+static bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
const char **thp_fs_loc)
{
if (xfs_path) {
@@ -382,7 +571,7 @@ bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
return true;
}
-void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
+static void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
{
int status;
@@ -395,8 +584,8 @@ void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
strerror(errno));
}
-int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
- char **addr)
+static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size,
+ int *fd, char **addr)
{
size_t i;
unsigned char buf[1024];
@@ -462,10 +651,11 @@ err_out_unlink:
return -1;
}
-void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc,
- int order, int offset)
+static void split_thp_in_pagecache_to_order_at(size_t fd_size,
+ const char *fs_loc, int order, int offset)
{
int fd;
+ char *split_addr;
char *addr;
size_t i;
char testfile[INPUT_MAX];
@@ -479,14 +669,33 @@ void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc,
err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr);
if (err)
return;
+
err = 0;
- if (offset == -1)
- write_debugfs(PID_FMT, getpid(), (uint64_t)addr,
- (uint64_t)addr + fd_size, order);
- else
- write_debugfs(PID_FMT_OFFSET, getpid(), (uint64_t)addr,
- (uint64_t)addr + fd_size, order, offset);
+ memset(expected_orders, 0, sizeof(int) * (pmd_order + 1));
+ /*
+ * use [split_addr, split_addr + pagesize) range to split THPs, since
+ * the debugfs function always split a range with pagesize step and
+ * providing a full [addr, addr + fd_size) range can trigger multiple
+ * splits, complicating after-split result checking.
+ */
+ if (offset == -1) {
+ for (split_addr = addr; split_addr < addr + fd_size; split_addr += pmd_pagesize)
+ write_debugfs(PID_FMT, getpid(), (uint64_t)split_addr,
+ (uint64_t)split_addr + pagesize, order);
+
+ expected_orders[order] = fd_size / (pagesize << order);
+ } else {
+ int times = fd_size / pmd_pagesize;
+
+ for (split_addr = addr; split_addr < addr + fd_size; split_addr += pmd_pagesize)
+ write_debugfs(PID_FMT_OFFSET, getpid(), (uint64_t)split_addr,
+ (uint64_t)split_addr + pagesize, order, offset);
+
+ for (i = order + 1; i < pmd_order; i++)
+ expected_orders[i] = times;
+ expected_orders[order] = 2 * times;
+ }
for (i = 0; i < fd_size; i++)
if (*(addr + i) != (char)i) {
@@ -495,6 +704,14 @@ void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc,
goto out;
}
+ if (check_after_split_folio_orders(addr, fd_size, pagemap_fd,
+ kpageflags_fd, expected_orders,
+ (pmd_order + 1))) {
+ ksft_print_msg("Unexpected THP split\n");
+ err = 1;
+ goto out;
+ }
+
if (!check_huge_file(addr, 0, pmd_pagesize)) {
ksft_print_msg("Still FilePmdMapped not split\n");
err = EXIT_FAILURE;
@@ -525,6 +742,8 @@ int main(int argc, char **argv)
const char *fs_loc;
bool created_tmp;
int offset;
+ unsigned int nr_pages;
+ unsigned int tests;
ksft_print_header();
@@ -536,38 +755,58 @@ int main(int argc, char **argv)
if (argc > 1)
optional_xfs_path = argv[1];
- ksft_set_plan(1+8+1+9+9+8*4+2);
-
pagesize = getpagesize();
pageshift = ffs(pagesize) - 1;
pmd_pagesize = read_pmd_pagesize();
if (!pmd_pagesize)
ksft_exit_fail_msg("Reading PMD pagesize failed\n");
+ nr_pages = pmd_pagesize / pagesize;
+ pmd_order = sz2ord(pmd_pagesize, pagesize);
+
+ expected_orders = (int *)malloc(sizeof(int) * (pmd_order + 1));
+ if (!expected_orders)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
+
+ tests = 2 + (pmd_order - 1) + (2 * pmd_order) + (pmd_order - 1) * 4 + 2;
+ ksft_set_plan(tests);
+
+ pagemap_fd = open(pagemap_proc, O_RDONLY);
+ if (pagemap_fd == -1)
+ ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno));
+
+ kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+ if (kpageflags_fd == -1)
+ ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno));
+
fd_size = 2 * pmd_pagesize;
split_pmd_zero_pages();
- for (i = 0; i < 9; i++)
+ for (i = 0; i < pmd_order; i++)
if (i != 1)
split_pmd_thp_to_order(i);
split_pte_mapped_thp();
- for (i = 0; i < 9; i++)
+ for (i = 0; i < pmd_order; i++)
split_file_backed_thp(i);
created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template,
&fs_loc);
- for (i = 8; i >= 0; i--)
+ for (i = pmd_order - 1; i >= 0; i--)
split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, -1);
- for (i = 0; i < 9; i++)
+ for (i = 0; i < pmd_order; i++)
for (offset = 0;
- offset < pmd_pagesize / pagesize;
- offset += MAX(pmd_pagesize / pagesize / 4, 1 << i))
+ offset < nr_pages;
+ offset += MAX(nr_pages / 4, 1 << i))
split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, offset);
cleanup_thp_fs(fs_loc, created_tmp);
+ close(pagemap_fd);
+ close(kpageflags_fd);
+ free(expected_orders);
+
ksft_finished();
return 0;
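Editor's note: a worked example of the expected_orders accounting introduced above, assuming 4 KiB base pages and a 2 MiB PMD, i.e. pmd_order = 9:

#include <stdio.h>

int main(void)
{
	int pmd_order = 9;

	/* split_pmd_thp_to_order() maps len = 4 * pmd_pagesize, so a
	 * split to `order` must leave 4 << (pmd_order - order) folios. */
	for (int order = 0; order < pmd_order; order++)
		printf("order %d -> %d folios\n", order,
		       4 << (pmd_order - order));
	/* e.g. order 0 -> 2048 folios, order 8 -> 8 folios */
	return 0;
}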
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
index d73b846736f1..d39096723fca 100755
--- a/tools/testing/selftests/mm/test_vmalloc.sh
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -47,14 +47,14 @@ check_test_requirements()
fi
}
-run_perfformance_check()
+run_performance_check()
{
echo "Run performance tests to evaluate how fast vmalloc allocation is."
echo "It runs all test cases on one single CPU with sequential order."
modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
echo "Done."
- echo "Ccheck the kernel message buffer to see the summary."
+ echo "Check the kernel message buffer to see the summary."
}
run_stability_check()
@@ -160,7 +160,7 @@ function run_test()
usage
else
if [[ "$1" = "performance" ]]; then
- run_perfformance_check
+ run_performance_check
elif [[ "$1" = "stress" ]]; then
run_stability_check
elif [[ "$1" = "smoke" ]]; then
diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c
index bad60ac52874..574bd0f8ae48 100644
--- a/tools/testing/selftests/mm/thp_settings.c
+++ b/tools/testing/selftests/mm/thp_settings.c
@@ -382,10 +382,17 @@ unsigned long thp_shmem_supported_orders(void)
return __thp_supported_orders(true);
}
-bool thp_is_enabled(void)
+bool thp_available(void)
{
if (access(THP_SYSFS, F_OK) != 0)
return false;
+ return true;
+}
+
+bool thp_is_enabled(void)
+{
+ if (!thp_available())
+ return false;
int mode = thp_read_string("enabled", thp_enabled_strings);
diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h
index 6c07f70beee9..76eeb712e5f1 100644
--- a/tools/testing/selftests/mm/thp_settings.h
+++ b/tools/testing/selftests/mm/thp_settings.h
@@ -84,6 +84,7 @@ void thp_set_read_ahead_path(char *path);
unsigned long thp_supported_orders(void);
unsigned long thp_shmem_supported_orders(void);
+bool thp_available(void);
bool thp_is_enabled(void);
#endif /* __THP_SETTINGS_H__ */
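Editor's note: splitting thp_available() out of thp_is_enabled() lets a test that manages the THP policy itself skip only when the sysfs interface is absent, rather than whenever the current policy happens to be 'never'. A hedged usage sketch, assuming the selftest build environment and its kselftest.h helpers:

#include "thp_settings.h"
#include "../kselftest.h"

void require_thp_sysfs(void)
{
	/* Only checks that the THP sysfs directory exists; the caller
	 * is free to rewrite the policy afterwards. */
	if (!thp_available())
		ksft_exit_skip("Transparent Hugepages not available\n");
}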
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 8e2b08dc5762..4f5e290ff1a6 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -177,13 +177,16 @@ void find_pagesizes(void)
globfree(&g);
read_sysfs("/proc/sys/kernel/shmmax", &shmmax_val);
- if (shmmax_val < NUM_PAGES * largest)
- ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
- largest * NUM_PAGES);
+ if (shmmax_val < NUM_PAGES * largest) {
+ ksft_print_msg("WARNING: shmmax is too small to run this test.\n");
+ ksft_print_msg("Please run the following command to increase shmmax:\n");
+ ksft_print_msg("echo %lu > /proc/sys/kernel/shmmax\n", largest * NUM_PAGES);
+ ksft_exit_skip("Test skipped due to insufficient shmmax value.\n");
+ }
#if defined(__x86_64__)
if (largest != 1U<<30) {
- ksft_exit_fail_msg("No GB pages available on x86-64\n"
+ ksft_exit_skip("No GB pages available on x86-64\n"
"Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
}
#endif
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index 40af7f67c407..ecd016329935 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -51,7 +51,7 @@ static char *zeropage;
pthread_attr_t attr;
#define swap(a, b) \
- do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+ do { __auto_type __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
const char *examples =
"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
@@ -448,12 +448,6 @@ int main(int argc, char **argv)
parse_test_type_arg(argv[1]);
bytes = atol(argv[2]) * 1024 * 1024;
- if (test_type == TEST_HUGETLB &&
- get_free_hugepages() < bytes / page_size) {
- printf("skip: Skipping userfaultfd... not enough hugepages\n");
- return KSFT_SKIP;
- }
-
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (nr_cpus > 32) {
/* Don't let calculation below go to zero. */
@@ -464,6 +458,17 @@ int main(int argc, char **argv)
nr_parallel = nr_cpus;
}
+ /*
+ * src and dst each require bytes / page_size hugepages. Ensure
+ * nr_parallel - 1 extra hugepages on top of that to account
+ * for racy over-reservation of hugepages.
+ */
+ if (test_type == TEST_HUGETLB &&
+ get_free_hugepages() < 2 * (bytes / page_size) + nr_parallel - 1) {
+ printf("skip: Skipping userfaultfd... not enough hugepages\n");
+ return KSFT_SKIP;
+ }
+
nr_pages_per_cpu = bytes / page_size / nr_parallel;
if (!nr_pages_per_cpu) {
_err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)",
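Editor's note: a worked example of the relocated hugepage check, under assumed values (for the hugetlb tests page_size is the huge page size):

#include <stdio.h>

int main(void)
{
	unsigned long bytes = 200UL << 20;	/* 200 MiB test region */
	unsigned long page_size = 2UL << 20;	/* 2 MiB hugepages */
	unsigned long nr_parallel = 8;
	/* src and dst need 100 hugepages each, plus 7 spare ones to
	 * absorb racy extra reservations. */
	unsigned long needed = 2 * (bytes / page_size) + nr_parallel - 1;

	printf("need %lu free hugepages\n", needed);	/* 207 */
	return 0;
}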
diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c
index c2ba7d46c7b4..78038c40aaaf 100644
--- a/tools/testing/selftests/mm/uffd-wp-mremap.c
+++ b/tools/testing/selftests/mm/uffd-wp-mremap.c
@@ -19,11 +19,6 @@ static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
-static int sz2ord(size_t size)
-{
- return __builtin_ctzll(size / pagesize);
-}
-
static int detect_thp_sizes(size_t sizes[], int max)
{
int count = 0;
@@ -87,9 +82,9 @@ static void *alloc_one_folio(size_t size, bool private, bool hugetlb)
struct thp_settings settings = *thp_current_settings();
if (private)
- settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
+ settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS;
else
- settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS;
+ settings.shmem_hugepages[sz2ord(size, pagesize)].enabled = SHMEM_ALWAYS;
thp_push_settings(&settings);
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index 169dbd692bf5..81b33d8f78f4 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -44,12 +44,18 @@
* On Arm64 the address space is 256TB and support for
* high mappings up to 4PB virtual address space has
* been added.
+ *
+ * On PowerPC64, the address space up to 128TB can be
+ * mapped without a hint. Addresses beyond 128TB, up to
+ * 4PB, can be mapped with a hint.
+ *
*/
#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */
#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL)
+#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL)
#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
@@ -59,6 +65,11 @@
#define HIGH_ADDR_SHIFT 49
#define NR_CHUNKS_LOW NR_CHUNKS_256TB
#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB
+#elif defined(__PPC64__)
+#define HIGH_ADDR_MARK ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB
#else
#define HIGH_ADDR_MARK ADDR_MARK_128TB
#define HIGH_ADDR_SHIFT 48
@@ -227,7 +238,7 @@ int main(int argc, char *argv[])
if (hptr[i] == MAP_FAILED)
break;
- mark_range(ptr[i], MAP_CHUNK_SIZE);
+ mark_range(hptr[i], MAP_CHUNK_SIZE);
validate_addr(hptr[i], 1);
}
hchunks = i;
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 9dafa7669ef9..56e9bd541edd 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -338,6 +338,19 @@ int detect_hugetlb_page_sizes(size_t sizes[], int max)
return count;
}
+int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags)
+{
+ size_t count;
+
+ count = pread(kpageflags_fd, flags, sizeof(*flags),
+ pfn * sizeof(*flags));
+
+ if (count != sizeof(*flags))
+ return -1;
+
+ return 0;
+}
+
/* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */
int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
bool miss, bool wp, bool minor, uint64_t *ioctls)
@@ -402,7 +415,7 @@ unsigned long get_free_hugepages(void)
return fhp;
}
-bool check_vmflag_io(void *addr)
+static bool check_vmflag(void *addr, const char *flag)
{
char buffer[MAX_LINE_LENGTH];
const char *flags;
@@ -419,13 +432,23 @@ bool check_vmflag_io(void *addr)
if (!flaglen)
return false;
- if (flaglen == strlen("io") && !memcmp(flags, "io", flaglen))
+ if (flaglen == strlen(flag) && !memcmp(flags, flag, flaglen))
return true;
flags += flaglen;
}
}
+bool check_vmflag_io(void *addr)
+{
+ return check_vmflag(addr, "io");
+}
+
+bool check_vmflag_pfnmap(void *addr)
+{
+ return check_vmflag(addr, "pf");
+}
+
/*
* Open an fd at /proc/$pid/maps and configure procmap_out ready for
* PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise.
@@ -555,3 +578,126 @@ bool detect_huge_zeropage(void)
close(fd);
return enabled;
}
+
+long ksm_get_self_zero_pages(void)
+{
+ int proc_self_ksm_stat_fd;
+ char buf[200];
+ char *substr_ksm_zero;
+ size_t value_pos;
+ ssize_t read_size;
+
+ proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
+ if (proc_self_ksm_stat_fd < 0)
+ return -errno;
+
+ read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
+ close(proc_self_ksm_stat_fd);
+ if (read_size < 0)
+ return -errno;
+
+ buf[read_size] = 0;
+
+ substr_ksm_zero = strstr(buf, "ksm_zero_pages");
+ if (!substr_ksm_zero)
+ return 0;
+
+ value_pos = strcspn(substr_ksm_zero, "0123456789");
+ return strtol(substr_ksm_zero + value_pos, NULL, 10);
+}
+
+long ksm_get_self_merging_pages(void)
+{
+ int proc_self_ksm_merging_pages_fd;
+ char buf[10];
+ ssize_t ret;
+
+ proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages",
+ O_RDONLY);
+ if (proc_self_ksm_merging_pages_fd < 0)
+ return -errno;
+
+ ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0);
+ close(proc_self_ksm_merging_pages_fd);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+long ksm_get_full_scans(void)
+{
+ int ksm_full_scans_fd;
+ char buf[10];
+ ssize_t ret;
+
+ ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
+ if (ksm_full_scans_fd < 0)
+ return -errno;
+
+ ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+ close(ksm_full_scans_fd);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+int ksm_use_zero_pages(void)
+{
+ int ksm_use_zero_pages_fd;
+ ssize_t ret;
+
+ ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);
+ if (ksm_use_zero_pages_fd < 0)
+ return -errno;
+
+ ret = write(ksm_use_zero_pages_fd, "1", 1);
+ close(ksm_use_zero_pages_fd);
+ return ret == 1 ? 0 : -errno;
+}
+
+int ksm_start(void)
+{
+ int ksm_fd;
+ ssize_t ret;
+ long start_scans, end_scans;
+
+ ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+ if (ksm_fd < 0)
+ return -errno;
+
+ /* Wait for two full scans so that any possible merging has happened. */
+ start_scans = ksm_get_full_scans();
+ if (start_scans < 0) {
+ close(ksm_fd);
+ return start_scans;
+ }
+ ret = write(ksm_fd, "1", 1);
+ close(ksm_fd);
+ if (ret != 1)
+ return -errno;
+ do {
+ end_scans = ksm_get_full_scans();
+ if (end_scans < 0)
+ return end_scans;
+ } while (end_scans < start_scans + 2);
+
+ return 0;
+}
+
+int ksm_stop(void)
+{
+ int ksm_fd;
+ ssize_t ret;
+
+ ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+ if (ksm_fd < 0)
+ return -errno;
+
+ ret = write(ksm_fd, "2", 1);
+ close(ksm_fd);
+ return ret == 1 ? 0 : -errno;
+}
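Editor's note: a sketch of the flow the new KSM helpers support, assuming a kernel with PR_SET_MEMORY_MERGE; merge_two_identical_pages() is a hypothetical caller:

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include "vm_util.h"

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67
#endif

int merge_two_identical_pages(void)
{
	size_t pagesize = getpagesize();
	char *mem = mmap(NULL, 2 * pagesize, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (mem == MAP_FAILED)
		return -1;
	memset(mem, 0x5a, 2 * pagesize);	/* two identical pages */

	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		return -1;			/* needs a KSM-enabled kernel */
	if (ksm_start() < 0)	/* run = 1, then wait two full scans */
		return -1;

	/* ksm_stop() (run = 2) would unmerge everything again. */
	return ksm_get_self_merging_pages() > 0 ? 0 : -1;
}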
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index b55d1809debc..07c4acfd84b6 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -18,6 +18,9 @@
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
+#define KPF_COMPOUND_HEAD BIT_ULL(15)
+#define KPF_COMPOUND_TAIL BIT_ULL(16)
+#define KPF_THP BIT_ULL(22)
/*
* Ignore the checkpatch warning, we must read from x but don't want to do
* anything with it in order to trigger a read page fault. We therefore must use
@@ -85,6 +88,7 @@ bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
int64_t allocate_transhuge(void *ptr, int pagemap_fd);
unsigned long default_huge_page_size(void);
int detect_hugetlb_page_sizes(size_t sizes[], int max);
+int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags);
int uffd_register(int uffd, void *addr, uint64_t len,
bool miss, bool wp, bool minor);
@@ -93,6 +97,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
bool miss, bool wp, bool minor, uint64_t *ioctls);
unsigned long get_free_hugepages(void);
bool check_vmflag_io(void *addr);
+bool check_vmflag_pfnmap(void *addr);
int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
int query_procmap(struct procmap_fd *procmap);
bool find_vma_procmap(struct procmap_fd *procmap, void *address);
@@ -126,9 +131,21 @@ static inline void log_test_result(int result)
ksft_test_result_report(result, "%s\n", test_name);
}
+static inline int sz2ord(size_t size, size_t pagesize)
+{
+ return __builtin_ctzll(size / pagesize);
+}
+
void *sys_mremap(void *old_address, unsigned long old_size,
unsigned long new_size, int flags, void *new_address);
+long ksm_get_self_zero_pages(void);
+long ksm_get_self_merging_pages(void);
+long ksm_get_full_scans(void);
+int ksm_use_zero_pages(void);
+int ksm_start(void);
+int ksm_stop(void);
+
/*
* On ppc64 this will only work with radix 2M hugepage size
*/
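Editor's note: the sz2ord() helper now takes the base page size explicitly, so callers on 16K/64K-page systems compute correct orders. A few assumed examples, with a local copy of the helper for illustration:

#include <assert.h>
#include <stddef.h>

static int sz2ord(size_t size, size_t pagesize)
{
	/* order = log2(size / pagesize) for power-of-two sizes */
	return __builtin_ctzll(size / pagesize);
}

int main(void)
{
	assert(sz2ord(2UL << 20, 4096) == 9);	    /* 2 MiB PMD, 4 KiB pages */
	assert(sz2ord(64UL << 10, 4096) == 4);	    /* 64 KiB mTHP, 4 KiB pages */
	assert(sz2ord(2UL << 20, 64UL << 10) == 5); /* 2 MiB PMD, 64 KiB pages */
	return 0;
}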
diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h
index 6e4fef560873..067265b0a554 100644
--- a/tools/testing/selftests/net/psock_lib.h
+++ b/tools/testing/selftests/net/psock_lib.h
@@ -22,10 +22,6 @@
#define PORT_BASE 8000
-#ifndef __maybe_unused
-# define __maybe_unused __attribute__ ((__unused__))
-#endif
-
static __maybe_unused void pair_udp_setfilter(int fd)
{
/* the filter below checks for all of the following conditions that
diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c
index e03fe1b9bba2..b3a72f0ac522 100644
--- a/tools/testing/selftests/perf_events/watermark_signal.c
+++ b/tools/testing/selftests/perf_events/watermark_signal.c
@@ -17,8 +17,6 @@
#include "../kselftest_harness.h"
-#define __maybe_unused __attribute__((__unused__))
-
static int sigio_count;
static void handle_sigio(int signum __maybe_unused,
diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c
index 94bba4553130..a546475db550 100644
--- a/tools/testing/selftests/proc/proc-maps-race.c
+++ b/tools/testing/selftests/proc/proc-maps-race.c
@@ -32,6 +32,8 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
@@ -317,6 +319,25 @@ static bool capture_mod_pattern(FIXTURE_DATA(proc_maps_race) *self,
strcmp(restored_first_line->text, self->first_line.text) == 0;
}
+static bool query_addr_at(int maps_fd, void *addr,
+ unsigned long *vma_start, unsigned long *vma_end)
+{
+ struct procmap_query q;
+
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ /* Find the VMA covering the given address */
+ q.query_addr = (unsigned long long)addr;
+ q.query_flags = 0;
+ if (ioctl(maps_fd, PROCMAP_QUERY, &q))
+ return false;
+
+ *vma_start = q.vma_start;
+ *vma_end = q.vma_end;
+
+ return true;
+}
+
static inline bool split_vma(FIXTURE_DATA(proc_maps_race) *self)
{
return mmap(self->mod_info->addr, self->page_size, self->mod_info->prot | PROT_EXEC,
@@ -559,6 +580,8 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
do {
bool last_line_changed;
bool first_line_changed;
+ unsigned long vma_start;
+ unsigned long vma_end;
ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));
@@ -595,6 +618,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0;
ASSERT_EQ(last_line_changed, first_line_changed);
+ /* Check if the PROCMAP_QUERY ioctl() finds the right VMA */
+ ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+ &vma_start, &vma_end));
+ /*
+ * The vma at the split address can be either the same as the
+ * original one (if read before the split) or the same as the
+ * first line in the second page (if read after the split).
+ */
+ ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+ vma_end == self->last_line.end_addr) ||
+ (vma_start == split_first_line.start_addr &&
+ vma_end == split_first_line.end_addr));
+
clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
end_test_iteration(&end_ts, self->verbose);
} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -636,6 +672,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
start_test_loop(&start_ts, self->verbose);
do {
+ unsigned long vma_start;
+ unsigned long vma_end;
+
ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));
/* Check if we read vmas after shrinking it */
@@ -662,6 +701,16 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
"Expand result invalid", self));
}
+ /* Check if the PROCMAP_QUERY ioctl() finds the right VMA */
+ ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end));
+ /*
+ * The vma should stay at the same address and have either the
+ * original size of 3 pages or 1 page if read after shrinking.
+ */
+ ASSERT_TRUE(vma_start == self->last_line.start_addr &&
+ (vma_end - vma_start == self->page_size * 3 ||
+ vma_end - vma_start == self->page_size));
+
clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
end_test_iteration(&end_ts, self->verbose);
} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -703,6 +752,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
start_test_loop(&start_ts, self->verbose);
do {
+ unsigned long vma_start;
+ unsigned long vma_end;
+
ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));
/* Check if we read vmas after remapping it */
@@ -729,6 +781,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
"Remap restore result invalid", self));
}
+ /* Check if the PROCMAP_QUERY ioctl() finds the right VMA */
+ ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+ &vma_start, &vma_end));
+ /*
+ * The VMA should either stay at the same address with the
+ * original size of 3 pages, or the query should find the
+ * remapped 1-page VMA at the remap destination address.
+ */
+ ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+ vma_end - vma_start == self->page_size * 3) ||
+ (vma_start == self->last_line.start_addr + self->page_size &&
+ vma_end - vma_start == self->page_size));
+
clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
end_test_iteration(&end_ts, self->verbose);
} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h
index 36545d1567f1..a852e0b7153e 100644
--- a/tools/testing/selftests/ublk/utils.h
+++ b/tools/testing/selftests/ublk/utils.h
@@ -2,8 +2,6 @@
#ifndef KUBLK_UTILS_H
#define KUBLK_UTILS_H
-#define __maybe_unused __attribute__((unused))
-
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
diff --git a/tools/testing/shared/linux/maple_tree.h b/tools/testing/shared/linux/maple_tree.h
index f67d47d32857..7d0fadef0f11 100644
--- a/tools/testing/shared/linux/maple_tree.h
+++ b/tools/testing/shared/linux/maple_tree.h
@@ -1,7 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0+ */
-#define atomic_t int32_t
-#define atomic_inc(x) uatomic_inc(x)
-#define atomic_read(x) uatomic_read(x)
-#define atomic_set(x, y) uatomic_set(x, y)
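+/* The atomic_t wrappers now come from the shared tools/include shim. */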
+#include <linux/atomic.h>
+
#define U8_MAX UCHAR_MAX
#include "../../../../include/linux/maple_tree.h"
diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk
index 923ee2492256..5bcdf26c8a9d 100644
--- a/tools/testing/shared/shared.mk
+++ b/tools/testing/shared/shared.mk
@@ -1,7 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+include ../../scripts/Makefile.arch
-CFLAGS += -I../shared -I. -I../../include -I../../../lib -g -Og -Wall \
+CFLAGS += -I../shared -I. -I../../include -I../../arch/$(SRCARCH)/include \
+ -I../../../lib -g -Og -Wall \
-D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined
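+# EXTRA_CFLAGS lets callers append flags without editing this file.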
+CFLAGS += $(EXTRA_CFLAGS)
LDFLAGS += -fsanitize=address -fsanitize=undefined
LDLIBS += -lpthread -lurcu
LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o
@@ -11,6 +14,7 @@ SHARED_DEPS = Makefile ../shared/shared.mk ../shared/*.h generated/map-shift.h \
generated/bit-length.h generated/autoconf.h \
../../include/linux/*.h \
../../include/asm/*.h \
+ ../../arch/$(SRCARCH)/include/asm/*.h \
../../../include/linux/xarray.h \
../../../include/linux/maple_tree.h \
../../../include/linux/radix-tree.h \
diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h
deleted file mode 100644
index 788c597c4fde..000000000000
--- a/tools/testing/vma/linux/atomic.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef _LINUX_ATOMIC_H
-#define _LINUX_ATOMIC_H
-
-#define atomic_t int32_t
-#define atomic_inc(x) uatomic_inc(x)
-#define atomic_read(x) uatomic_read(x)
-#define atomic_set(x, y) uatomic_set(x, y)
-#define U8_MAX UCHAR_MAX
-
-#ifndef atomic_cmpxchg_relaxed
-#define atomic_cmpxchg_relaxed uatomic_cmpxchg
-#define atomic_cmpxchg_release uatomic_cmpxchg
-#endif /* atomic_cmpxchg_relaxed */
-
-#endif /* _LINUX_ATOMIC_H */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 3639aa8dd2b0..437d2a1013be 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -21,6 +21,7 @@
#include <stdlib.h>
+#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/maple_tree.h>
#include <linux/mm.h>
@@ -249,6 +250,14 @@ struct mutex {};
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = {}
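+/* Minimal userspace stand-in for the kernel's opaque mm flags bitmap. */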
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#define NUM_MM_FLAG_BITS (64)
+typedef struct {
+ __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
+} mm_flags_t;
+
struct mm_struct {
struct maple_tree mm_mt;
int map_count; /* number of VMAs */
@@ -260,7 +269,7 @@ struct mm_struct {
unsigned long def_flags;
- unsigned long flags; /* Must use atomic bitops to access */
+ mm_flags_t flags; /* Must use mm_flags_* helpers to access */
};
struct vm_area_struct;
@@ -467,13 +476,21 @@ struct vm_operations_struct {
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
- * (using pte_page()) would not find the correct page.
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows a "normal" page to be returned from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not
+ * exist or must not be touched, i.e. is "special".
+ *
+ * Do not add new users: this only works when a "normal" page was
+ * mapped, but the PTE was then changed to something unusual (and
+ * marked special) from which pte_pfn() can no longer identify the
+ * originally inserted page.
*/
- struct page *(*find_special_page)(struct vm_area_struct *vma,
- unsigned long addr);
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
};
struct vm_unmapped_area_info {
@@ -1325,6 +1342,13 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
{
}
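+/*
+ * The kernel's ACCESS_PRIVATE() strips the __private sparse annotation;
+ * a plain accessor suffices in this userspace build.
+ */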
+# define ACCESS_PRIVATE(p, member) ((p)->member)
+
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
/*
* Denies creating a writable executable mapping or gaining executable permissions.
*
@@ -1355,7 +1379,7 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
- if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (mm_flags_test(MMF_HAS_MDWE, current->mm))
return false;
/* If the new VMA is not executable, we have nothing to deny. */
@@ -1375,15 +1399,8 @@ static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
static inline int mapping_map_writable(struct address_space *mapping)
{
- int c = atomic_read(&mapping->i_mmap_writable);
-
- /* Derived from the raw_atomic_inc_unless_negative() implementation. */
- do {
- if (c < 0)
- return -EPERM;
- } while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1));
-
- return 0;
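+ /* A negative count means new writable mappings are currently denied. */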
+ return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
+ 0 : -EPERM;
}
static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)