From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:20 -0800 Subject: zswap: make shrinking memcg-aware Currently, we only have a single global LRU for zswap. This makes it impossible to perform workload-specific shrinking - a memcg cannot determine which pages in the pool it owns, and often ends up writing back pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modifying the reclaim logic: a) When a store attempt hits a memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which a memcg is selected for reclaim in a round-robin-like fashion. [nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()] Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com [nphamcs@gmail.com: drop the pool's reference at the end of the writeback step] Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swap_state.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm/swap_state.c') diff --git a/mm/swap_state.c b/mm/swap_state.c index 85d9e5806a6a..6c84236382f3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; @@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* + * Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not in the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path.
+ */ + if (skip_if_exists) + goto fail_put_swap; + /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE @@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol = get_vma_policy(vma, addr, 0, &ilx); page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) @@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated); + gfp_mask, mpol, ilx, &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; @@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_unmap(pte); pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; -- cgit v1.2.3 From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:23 -0800 Subject: zswap: shrink zswap pool based on memory pressure Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e. evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. (A stand-alone sketch of the first two heuristics follows this list.)
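To make the first two heuristics concrete, here is a minimal stand-alone C model of them. This is an editor's illustrative sketch, not kernel code: the helper names and the simulated numbers are invented, and only the decay rule and the scaling rule mirror zswap_lru_add() and zswap_shrinker_count() in the diff below.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long nr_protected;

    /* Heuristic 1, modelled on zswap_lru_add(): bump the protected count
     * on each LRU insertion, then halve it once it exceeds a quarter of
     * the LRU size, so stale protection decays under changing workloads. */
    static void protect_on_lru_add(long lru_size)
    {
            long old = atomic_fetch_add(&nr_protected, 1) + 1;
            long new;

            do {
                    new = old > lru_size / 4 ? old / 2 : old;
            } while (!atomic_compare_exchange_weak(&nr_protected, &old, new));
    }

    /* Heuristic 2, modelled on zswap_shrinker_count(): report only the
     * unprotected tail of the LRU, scaled by the compression ratio
     * (backing pages over stored pages), so a well-compressed pool
     * advertises fewer freeable objects. */
    static long freeable_estimate(long lru_size, long nr_backing, long nr_stored)
    {
            long protected = atomic_load(&nr_protected);
            long freeable = lru_size > protected ? lru_size - protected : 0;

            return nr_stored ? freeable * nr_backing / nr_stored : 0;
    }

    int main(void)
    {
            /* 1000 insertions into an LRU holding ~100 entries: the
             * protected count stays bounded near lru_size / 4 instead of
             * growing without bound. */
            for (int i = 0; i < 1000; i++)
                    protect_on_lru_add(100);

            /* 4:1 compression (25 backing pages for 100 stored pages):
             * only a quarter of the unprotected entries are reported
             * freeable. */
            printf("protected: %ld, freeable: %ld\n",
                   atomic_load(&nr_protected),
                   freeable_estimate(100, 25, 100));
            return 0;
    }

Run under a C11 compiler, this shows the two estimates interacting: the protection bound keeps the reported freeable count away from the warm end of the LRU, while better compression shrinks it further even when the LRU is long.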
As a proof of concept, we ran the following synthetic benchmark: build the Linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. [nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing] Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 10 ++ include/linux/mmzone.h | 2 + include/linux/zswap.h | 25 ++++- mm/Kconfig | 14 +++ mm/mmzone.c | 1 + mm/swap_state.c | 2 + mm/zswap.c | 192 ++++++++++++++++++++++++++++++++- 7 files changed, 240 insertions(+), 6 deletions(-) (limited to 'mm/swap_state.c') diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..62fc244ec702 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +Users can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + +This can be enabled at boot time if ``CONFIG_ZSWAP_SHRINKER_DEFAULT_ON`` is +selected. + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 14faffa4354f..9ef9d010bff0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> +#include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..08c240e16a01 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include <linux/types.h> #include <linux/mm_types.h> +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if they are written + * to swap.
+ */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_page_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} - +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} +static inline void zswap_page_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 57cd378c73d6..ca87cdb72f11 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -61,6 +61,20 @@ config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON The cost is that if the page was never dirtied and needs to be swapped out again, it will be re-compressed. +config ZSWAP_SHRINKER_DEFAULT_ON + bool "Shrink the zswap pool on memory pressure" + depends on ZSWAP + default n + help + If selected, the zswap shrinker will be enabled, and the pages + stored in the zswap pool will become available for reclaim (i.e + written back to the backing swap device) on memory pressure. + + This means that zswap writeback could happen even if the pool is + not yet full, or the cgroup zswap limit has not been reached, + reducing the chance that cold pages will reside in the zswap pool + and consume memory indefinitely. + choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/mmzone.c b/mm/mmzone.c index b594d3f268fe..c01896eca736 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec) memset(lruvec, 0, sizeof(struct lruvec)); spin_lock_init(&lruvec->lru_lock); + zswap_lruvec_state_init(lruvec); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/swap_state.c b/mm/swap_state.c index 6c84236382f3..c597cec606e4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -687,6 +687,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } @@ -862,6 +863,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } diff --git a/mm/zswap.c b/mm/zswap.c index c329fca810c8..015425ed9003 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -148,6 +148,11 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 +/* Enable/disable memory pressure-based shrinker. 
*/ +static bool zswap_shrinker_enabled = IS_ENABLED( + CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); +module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -177,6 +182,8 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; + struct shrinker *shrinker; + atomic_t nr_stored; }; /* @@ -275,17 +282,26 @@ static bool zswap_can_accept(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static u64 get_zswap_pool_size(struct zswap_pool *pool) +{ + u64 pool_size = 0; + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + pool_size += zpool_get_total_size(pool->zpools[i]); + + return pool_size; +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; - int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += get_zswap_pool_size(pool); rcu_read_unlock(); @@ -344,13 +360,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* zswap lruvec functions +**********************************/ +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_page_swapin(struct page *page) +{ + struct lruvec *lruvec; + + if (page) { + lruvec = folio_lruvec(page_folio(page)); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; + struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -368,6 +405,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -389,6 +439,7 @@ static void zswap_lru_putback(struct list_lru *list_lru, int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); @@ -396,6 +447,10 @@ static void zswap_lru_putback(struct list_lru *list_lru, /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); + /* increment the protection area to account for the LRU rotation. 
*/ + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } @@ -485,6 +540,7 @@ static void zswap_free_entry(struct zswap_entry *entry) else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -526,6 +582,109 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, return entry; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg); + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + struct zswap_pool *pool = shrinker->private_data; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&pool->list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e. the reclaimers will reclaim into the protected area of the + * zswap LRU). + */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct zswap_pool *pool = shrinker->private_data; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled) + return 0; + +#ifdef CONFIG_MEMCG_KMEM + mem_cgroup_flush_stats(); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +#else + /* use pool stats instead of memcg stats */ + nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; + nr_stored = atomic_read(&pool->nr_stored); +#endif + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + /* + * Subtract from the LRU size an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving).
+ */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static void zswap_alloc_shrinker(struct zswap_pool *pool) +{ + pool->shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!pool->shrinker) + return; + + pool->shrinker->private_data = pool; + pool->shrinker->scan_objects = zswap_shrinker_scan; + pool->shrinker->count_objects = zswap_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->seeks = DEFAULT_SEEKS; +} + /********************************* * per-cpu code **********************************/ @@ -721,6 +880,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; @@ -756,6 +916,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_SKIP; + *encountered_page_in_swapcache = true; + } + goto put_unlock; } zswap_written_back_pages++; @@ -914,6 +1085,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) &pool->node); if (ret) goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the @@ -921,13 +1097,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - list_lru_init_memcg(&pool->list_lru, NULL); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -985,6 +1167,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); + shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); @@ -1536,6 +1719,7 @@ insert_entry: if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); + atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); -- cgit v1.2.3 From 96c7b0b42239e7b8987b2664b458dc74e825f760 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:30 +0000 Subject: mm: return the folio from __read_swap_cache_async() Patch series "More swap folio conversions". These all seem like fairly straightforward conversions to me. A lot of compound_head() calls get removed. And page_swap_info(), which is nice. This patch (of 13): Move the folio->page conversion into the callers that actually want that. Most of the callers are happier with the folio anyway. If the page_allocated boolean is set, the folio allocated is of order-0, so it is safe to pass the page directly to swap_readpage(). 
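For illustration, a converted caller then follows this pattern (a condensed sketch assembled from the read_swap_cache_async() hunk below; it relies on kernel APIs and is not compilable on its own):

    folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                    &page_allocated, false);
    if (page_allocated)
            swap_readpage(&folio->page, false, plug);
    /* A newly allocated swap cache folio is order-0, so &folio->page
     * above covers the whole folio; callers that need the precise page
     * extract it only at the boundary. */
    return folio_file_page(folio, swp_offset(entry));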
Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 +-- mm/swap.h | 7 +++-- mm/swap_state.c | 75 ++++++++++++++++++++++++--------------------------- mm/zswap.c | 58 +++++++++++++++++++-------------------- 4 files changed, 69 insertions(+), 75 deletions(-) (limited to 'mm/swap_state.c') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 08c240e16a01..e88572d4c720 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -34,7 +34,7 @@ void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); -void zswap_page_swapin(struct page *page); +void zswap_folio_swapin(struct folio *folio); #else struct zswap_lruvec_state {}; @@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} -static inline void zswap_page_swapin(struct page *page) {} +static inline void zswap_folio_swapin(struct folio *folio) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/swap.h b/mm/swap.h index c0dc73e10e91..a60ab1cfcaf2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -49,10 +49,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists); +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index c597cec606e4..874b40a1f502 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -410,14 +410,12 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, return folio; } -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists) +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; - struct page *page; void *shadow = NULL; *new_page_allocated = false; @@ -434,10 +432,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - if (!IS_ERR(folio)) { - page = folio_file_page(folio, swp_offset(entry)); - goto got_page; - } + if (!IS_ERR(folio)) + goto got_folio; /* * Just skip read ahead for unused swap slot. @@ -451,7 +447,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_put_swap; /* - * Get a new page to read into from swap. Allocate it now, + * Get a new folio to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. 
*/ @@ -487,13 +483,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its page to swap cache. + * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* - * The swap entry is ours to swap in. Prepare the new page. + * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(folio); @@ -514,10 +510,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* Caller will initiate read into locked folio */ folio_add_lru(folio); *new_page_allocated = true; - page = &folio->page; -got_page: +got_folio: put_swap_device(si); - return page; + return folio; fail_unlock: put_swap_folio(folio, entry); @@ -545,16 +540,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = get_vma_policy(vma, addr, 0, &ilx); - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) - swap_readpage(page, false, plug); - return page; + swap_readpage(&folio->page, false, plug); + return folio_file_page(folio, swp_offset(entry)); } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -639,7 +634,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { - struct page *page; + struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; @@ -664,31 +659,31 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = __read_swap_cache_async( + folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (offset != entry_offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -806,7 +801,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct page *page; + struct folio *folio; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; @@ -839,18 +834,18 @@ static struct page *swap_vma_readahead(swp_entry_t 
targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (i != ra_info.offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } if (pte) pte_unmap(pte); @@ -858,13 +853,13 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, swap_read_unplug(splug); lru_add_drain(); skip: - /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + /* The folio was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } /** diff --git a/mm/zswap.c b/mm/zswap.c index 12ce5a68da91..6d9ee2a5334a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -368,12 +368,12 @@ void zswap_lruvec_state_init(struct lruvec *lruvec) atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } -void zswap_page_swapin(struct page *page) +void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (page) { - lruvec = folio_lruvec(page_folio(page)); + if (folio) { + lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } } @@ -1383,14 +1383,14 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * writeback code **********************************/ /* - * Attempts to free an entry by adding a page to the swap cache, - * decompressing the entry data into the page, and issuing a - * bio write to write the page back to the swap device. + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. * - * This can be thought of as a "resumed writeback" of the page + * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() - * in the first place. After the page has been decompressed into + * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ @@ -1398,56 +1398,56 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { swp_entry_t swpentry = entry->swpentry; - struct page *page; + struct folio *folio; struct mempolicy *mpol; - bool page_was_allocated; + bool folio_was_allocated; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; - /* try to allocate swap cache page */ + /* try to allocate swap cache folio */ mpol = get_task_policy(current); - page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated, true); - if (!page) + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) return -ENOMEM; /* - * Found an existing page, we raced with load/swapin. 
We generally - * writeback cold pages from zswap, and swapin means the page just - * became hot. Skip this page and let the caller find another one. + * Found an existing folio, we raced with load/swapin. We generally + * writeback cold folios from zswap, and swapin means the folio just + * became hot. Skip this folio and let the caller find another one. */ - if (!page_was_allocated) { - put_page(page); + if (!folio_was_allocated) { + folio_put(folio); return -EEXIST; } /* - * Page is locked, and the swapcache is now secured against + * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap page with old compressed data. + * avoid overwriting a new swap folio with old compressed data. */ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); - delete_from_swap_cache(page_folio(page)); + delete_from_swap_cache(folio); return -ENOMEM; } spin_unlock(&tree->lock); - __zswap_load(entry, page); + __zswap_load(entry, &folio->page); - /* page is up to date */ - SetPageUptodate(page); + /* folio is up to date */ + folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ - SetPageReclaim(page); + folio_set_reclaim(folio); /* start writeback */ - __swap_writepage(page, &wbc); - put_page(page); + __swap_writepage(&folio->page, &wbc); + folio_put(folio); return 0; } @@ -1593,7 +1593,7 @@ bool zswap_store(struct folio *folio) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_page(&input, page, PAGE_SIZE, 0); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, -- cgit v1.2.3 From c9bdf768dd9319d2d80a334646e2c8116af9e430 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:39 +0000 Subject: mm: convert swap_readpage() to swap_read_folio() All callers have a folio, so pass it in, saving two calls to compound_head(). 
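To see where the two compound_head() calls go, compare the entry points before and after (condensed from the mm/page_io.c hunk below; elided bodies are marked):

    /* Before: both helpers resolve the head page internally. */
    void swap_readpage(struct page *page, bool synchronous,
                       struct swap_iocb **plug)
    {
            struct folio *folio = page_folio(page);              /* compound_head() */
            struct swap_info_struct *sis = page_swap_info(page); /* compound_head() */
            /* ... */
    }

    /* After: callers hand over the folio, so both lookups are direct. */
    void swap_read_folio(struct folio *folio, bool synchronous,
                         struct swap_iocb **plug)
    {
            struct swap_info_struct *sis = swp_swap_info(folio->swap);
            /* ... */
    }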
Link: https://lkml.kernel.org/r/20231213215842.671461-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- mm/page_io.c | 18 +++++++++--------- mm/swap.h | 5 +++-- mm/swap_state.c | 12 ++++++------ mm/swapfile.c | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) (limited to 'mm/swap_state.c') diff --git a/mm/memory.c b/mm/memory.c index 5c023bef2adb..cfcaf4c0198c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3882,9 +3882,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru(folio); - /* To provide entry to swap_readpage() */ + /* To provide entry to swap_read_folio() */ folio->swap = entry; - swap_readpage(page, true, NULL); + swap_read_folio(folio, true, NULL); folio->private = NULL; } } else { diff --git a/mm/page_io.c b/mm/page_io.c index 6736c56526bf..09c6a4f316f3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -420,7 +420,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) +static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_iocb *sio = NULL; @@ -454,7 +454,7 @@ static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) *plug = sio; } -static void swap_readpage_bdev_sync(struct folio *folio, +static void swap_read_folio_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; @@ -474,7 +474,7 @@ static void swap_readpage_bdev_sync(struct folio *folio, put_task_struct(current); } -static void swap_readpage_bdev_async(struct folio *folio, +static void swap_read_folio_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; @@ -487,10 +487,10 @@ static void swap_readpage_bdev_async(struct folio *folio, submit_bio(bio); } -void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +void swap_read_folio(struct folio *folio, bool synchronous, + struct swap_iocb **plug) { - struct folio *folio = page_folio(page); - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; @@ -514,11 +514,11 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { - swap_readpage_fs(folio, plug); + swap_read_folio_fs(folio, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { - swap_readpage_bdev_sync(folio, sis); + swap_read_folio_bdev_sync(folio, sis); } else { - swap_readpage_bdev_async(folio, sis); + swap_read_folio_bdev_async(folio, sis); } if (workingset) { diff --git a/mm/swap.h b/mm/swap.h index b81587740cf1..859ae8f0fd2d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -10,7 +10,8 @@ struct mempolicy; /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); +void swap_read_folio(struct folio *folio, bool do_poll, + struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -63,7 +64,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline void swap_readpage(struct page *page, bool do_poll, +static inline 
void swap_read_folio(struct folio *folio, bool do_poll, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 874b40a1f502..d2fe70e307d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -530,7 +530,7 @@ fail_put_swap: * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_readpage() holds the + * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -548,7 +548,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol_cond_put(mpol); if (page_allocated) - swap_readpage(&folio->page, false, plug); + swap_read_folio(folio, false, plug); return folio_file_page(folio, swp_offset(entry)); } @@ -665,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -681,7 +681,7 @@ skip: folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } @@ -839,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (i != ra_info.offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -857,7 +857,7 @@ skip: folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index b22c47b11d65..f3e23a3d26ae 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2225,7 +2225,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is - * built at swapon time and is then used at swap_writepage/swap_readpage + * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. -- cgit v1.2.3 From 6e03492e9d288d9ce886064289e2768da5d7d967 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:41 +0000 Subject: mm: return a folio from read_swap_cache_async() The only two callers simply call put_page() on the page returned, so they're happier calling folio_put(). Saves two calls to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 22 +++++++++++----------- mm/swap.h | 7 +++---- mm/swap_state.c | 8 ++++---- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'mm/swap_state.c') diff --git a/mm/madvise.c b/mm/madvise.c index 6214a1ab5654..912155a94ed5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -180,7 +180,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -198,10 +198,10 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte_unmap_unlock(ptep, ptl); ptep = NULL; - page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); - if (page) - put_page(page); + if (folio) + folio_put(folio); } if (ptep) @@ -223,17 +223,17 @@ static void shmem_swapin_range(struct vm_area_struct *vma, { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; - struct page *page; + struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); - xas_for_each(&xas, page, end_index) { + xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - entry = radix_to_swp_entry(page); + entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; @@ -243,10 +243,10 @@ static void shmem_swapin_range(struct vm_area_struct *vma, xas_pause(&xas); rcu_read_unlock(); - page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), + folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); - if (page) - put_page(page); + if (folio) + folio_put(folio); rcu_read_lock(); } diff --git a/mm/swap.h b/mm/swap.h index 6bf25342589f..82c68ccb5ab1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -46,10 +46,9 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, struct folio *filemap_get_incore_folio(struct address_space *mapping, pgoff_t index); -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, - struct swap_iocb **plug); +struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + struct swap_iocb **plug); struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists); diff --git a/mm/swap_state.c b/mm/swap_state.c index d2fe70e307d9..97c8a950dd18 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -533,9 +533,9 @@ fail_put_swap: * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. 
*/ -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, struct swap_iocb **plug) +struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + struct swap_iocb **plug) { bool page_allocated; struct mempolicy *mpol; @@ -549,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (page_allocated) swap_read_folio(folio, false, plug); - return folio_file_page(folio, swp_offset(entry)); + return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, -- cgit v1.2.3 From a4575c4138db887bd27dc7f87cf7cfb0224c6f5e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:42 +0000 Subject: mm: convert swap_cluster_readahead and swap_vma_readahead to return a folio shmem_swapin_cluster() immediately converts the page back to a folio, and swapin_readahead() may as well call folio_file_page() once instead of having each function call it. [willy@infradead.org: avoid NULL pointer deref] Link: https://lkml.kernel.org/r/ZYI7OcVlM1voKfBl@casper.infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++----- mm/swap.h | 6 +++--- mm/swap_state.c | 24 +++++++++++++----------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm/swap_state.c') diff --git a/mm/shmem.c b/mm/shmem.c index c62f904ba1ca..a4d388973021 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1570,15 +1570,13 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); - page = swap_cluster_readahead(swap, gfp, mpol, ilx); + folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); - if (!page) - return NULL; - return page_folio(page); + return folio; } /* diff --git a/mm/swap.h b/mm/swap.h index 82c68ccb5ab1..758c46ca671e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -52,8 +52,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists); -struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct mempolicy *mpol, pgoff_t ilx); +struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +80,7 @@ static inline void show_swap_cache_info(void) { } -static inline struct page *swap_cluster_readahead(swp_entry_t entry, +static inline struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index 97c8a950dd18..e671266ad772 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -620,7 +620,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. 
We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen @@ -631,7 +631,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ -struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; @@ -683,7 +683,7 @@ skip: if (unlikely(page_allocated)) swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); - return folio_file_page(folio, swp_offset(entry)); + return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -787,7 +787,7 @@ static void swap_ra_info(struct vm_fault *vmf, * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. @@ -795,9 +795,8 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t targ_ilx, - struct vm_fault *vmf) +static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; @@ -859,7 +858,7 @@ skip: if (unlikely(page_allocated)) swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); - return folio_file_page(folio, swp_offset(entry)); + return folio; } /** @@ -879,14 +878,17 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); - page = swap_use_vma_readahead() ? + folio = swap_use_vma_readahead() ? swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - return page; + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } #ifdef CONFIG_SYSFS -- cgit v1.2.3
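Taken together, the series leaves swapin_readahead() as the single place where a page is materialized from the readahead folio. Condensed from the final hunk above, including the NULL-pointer guard noted in the changelog:

    struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                  struct vm_fault *vmf)
    {
            struct mempolicy *mpol;
            pgoff_t ilx;
            struct folio *folio;

            mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
            folio = swap_use_vma_readahead() ?
                    swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
                    swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
            mpol_cond_put(mpol);

            /* Readahead can fail outright; guard before dereferencing. */
            if (!folio)
                    return NULL;
            return folio_file_page(folio, swp_offset(entry));
    }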