zswap: implement a second chance algorithm for dynamic zswap shrinker

author Nhat Pham <nphamcs@gmail.com>

Mon, 5 Aug 2024 23:22:42 +0000 (16:22 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 2 Sep 2024 03:26:02 +0000 (20:26 -0700)
author Nhat Pham <nphamcs@gmail.com>
Mon, 5 Aug 2024 23:22:42 +0000 (16:22 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 2 Sep 2024 03:26:02 +0000 (20:26 -0700)
diff --git a/include/linux/zswap.h b/include/linux/zswap.h

index 6cecb4a4f68b4baffcf41244cd8f277ad2189a7c..9cd1beef0654873e47cf84114505630ac9c3720d 100644 (file)
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages;
  
  struct zswap_lruvec_state {
         /*
-        * Number of pages in zswap that should be protected from the shrinker.
-        * This number is an estimate of the following counts:
+        * Number of pages swapped in from disk, i.e. not found in the zswap pool.
          *
-        * a) Recent page faults.
-        * b) Recent insertion to the zswap LRU. This includes new zswap stores,
-        *    as well as recent zswap LRU rotations.
-        *
-        * These pages are likely to be warm, and might incur IO if the are written
-        * to swap.
+        * This is consumed and subtracted from the lru size in
+        * zswap_shrinker_count() to penalize past overshrinking that led to disk
+        * swapins. The idea is that had we considered this many more pages in the
+        * LRU active/protected and not written them back, we would not have had to
+        * swap them in.
          */
-       atomic_long_t nr_zswap_protected;
+       atomic_long_t nr_disk_swapins;
  };
  
  unsigned long zswap_total_pages(void);
diff --git a/mm/zswap.c b/mm/zswap.c

index 71b75ff1f3fb7c770573a76831c9c7e9b21e1ffb..df66ab102d2765da8d93ba75006a3d580fefc621 100644 (file)
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -187,6 +187,10 @@ static struct shrinker *zswap_shrinker;
   * length - the length in bytes of the compressed page data.  Needed during
   *          decompression. For a same value filled page length is 0, and both
   *          pool and lru are invalid and must be ignored.
+ * referenced - true if the entry recently entered the zswap pool. Unset by the
+ *              writeback logic. The entry is only reclaimed by the writeback
+ *              logic if referenced is unset. See comments in the shrinker
+ *              section for context.
   * pool - the zswap_pool the entry's data is in
   * handle - zpool allocation handle that stores the compressed page data
   * value - value of the same-value filled pages which have same content
@@ -196,6 +200,7 @@ static struct shrinker *zswap_shrinker;
  struct zswap_entry {
         swp_entry_t swpentry;
         unsigned int length;
+       bool referenced;
         struct zswap_pool *pool;
         union {
                 unsigned long handle;
@@ -700,11 +705,8 @@ static inline int entry_to_nid(struct zswap_entry *entry)
  
  static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
  {
-       atomic_long_t *nr_zswap_protected;
-       unsigned long lru_size, old, new;
         int nid = entry_to_nid(entry);
         struct mem_cgroup *memcg;
-       struct lruvec *lruvec;
  
         /*
          * Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -722,19 +724,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
         memcg = mem_cgroup_from_entry(entry);
         /* will always succeed */
         list_lru_add(list_lru, &entry->lru, nid, memcg);
-
-       /* Update the protection area */
-       lru_size = list_lru_count_one(list_lru, nid, memcg);
-       lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
-       nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
-       old = atomic_long_inc_return(nr_zswap_protected);
-       /*
-        * Decay to avoid overflow and adapt to changing workloads.
-        * This is based on LRU reclaim cost decaying heuristics.
-        */
-       do {
-               new = old > lru_size / 4 ? old / 2 : old;
-       } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
         rcu_read_unlock();
  }
  
@@ -752,7 +741,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
  
  void zswap_lruvec_state_init(struct lruvec *lruvec)
  {
-       atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+       atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0);
  }
  
  void zswap_folio_swapin(struct folio *folio)
@@ -761,7 +750,7 @@ void zswap_folio_swapin(struct folio *folio)
  
         if (folio) {
                 lruvec = folio_lruvec(folio);
-               atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+               atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
         }
  }
  
@@ -1095,6 +1084,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
  /*********************************
  * shrinker functions
  **********************************/
+/*
+ * The dynamic shrinker is modulated by the following factors:
+ *
+ * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving
+ *    the entry a second chance) before rotating it in the LRU list. If the
+ *    entry is considered again by the shrinker, with its referenced bit unset,
+ *    it is written back. The writeback rate as a result is dynamically
+ *    adjusted by the pool activities - if the pool is dominated by new entries
+ *    (i.e. lots of recent zswapouts), these entries will be protected and
+ *    the writeback rate will slow down. On the other hand, if the pool has a
+ *    lot of stagnant entries, these entries will be reclaimed immediately,
+ *    effectively increasing the writeback rate.
+ *
+ * 2. Swapins counter: If we observe swapins, it is a sign that we are
+ *    overshrinking and should slow down. We maintain a swapins counter, which
+ *    is consumed and subtracted from the number of eligible objects on the LRU
+ *    in zswap_shrinker_count().
+ *
+ * 3. Compression ratio. The better the workload compresses, the less gains we
+ *    can expect from writeback. We scale down the number of objects available
+ *    for reclaim by this ratio.
+ */
  static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
                                        spinlock_t *lock, void *arg)
  {
@@ -1104,6 +1115,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
         enum lru_status ret = LRU_REMOVED_RETRY;
         int writeback_result;
  
+       /*
+        * Second chance algorithm: if the entry has its referenced bit set, give it
+        * a second chance. Only clear the referenced bit and rotate it in the
+        * zswap's LRU list.
+        */
+       if (entry->referenced) {
+               entry->referenced = false;
+               return LRU_ROTATE;
+       }
+
         /*
          * As soon as we drop the LRU lock, the entry can be freed by
          * a concurrent invalidation. This means the following:
@@ -1170,8 +1191,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
                 struct shrink_control *sc)
  {
-       struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
-       unsigned long shrink_ret, nr_protected, lru_size;
+       unsigned long shrink_ret;
         bool encountered_page_in_swapcache = false;
  
         if (!zswap_shrinker_enabled ||
@@ -1180,25 +1200,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
                 return SHRINK_STOP;
         }
  
-       nr_protected =
-               atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-       lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
-
-       /*
-        * Abort if we are shrinking into the protected region.
-        *
-        * This short-circuiting is necessary because if we have too many multiple
-        * concurrent reclaimers getting the freeable zswap object counts at the
-        * same time (before any of them made reasonable progress), the total
-        * number of reclaimed objects might be more than the number of unprotected
-        * objects (i.e the reclaimers will reclaim into the protected area of the
-        * zswap LRU).
-        */
-       if (nr_protected >= lru_size - sc->nr_to_scan) {
-               sc->nr_scanned = 0;
-               return SHRINK_STOP;
-       }
-
         shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
                 &encountered_page_in_swapcache);
  
@@ -1213,7 +1214,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
  {
         struct mem_cgroup *memcg = sc->memcg;
         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
-       unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+       atomic_long_t *nr_disk_swapins =
+               &lruvec->zswap_lruvec_state.nr_disk_swapins;
+       unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur,
+               nr_remain;
  
         if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
                 return 0;
@@ -1246,14 +1250,27 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
         if (!nr_stored)
                 return 0;
  
-       nr_protected =
-               atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
         nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
+       if (!nr_freeable)
+               return 0;
+
         /*
-        * Subtract the lru size by an estimate of the number of pages
-        * that should be protected.
+        * Subtract from the lru size the number of pages that are recently swapped
+        * in from disk. The idea is that had we protected the zswap's LRU by this
+        * amount of pages, these disk swapins would not have happened.
          */
-       nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+       nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins);
+       do {
+               if (nr_freeable >= nr_disk_swapins_cur)
+                       nr_remain = 0;
+               else
+                       nr_remain = nr_disk_swapins_cur - nr_freeable;
+       } while (!atomic_long_try_cmpxchg(
+               nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));
+
+       nr_freeable -= nr_disk_swapins_cur - nr_remain;
+       if (!nr_freeable)
+               return 0;
  
         /*
          * Scale the number of freeable pages by the memory saving factor.
@@ -1506,6 +1523,7 @@ bool zswap_store(struct folio *folio)
  store_entry:
         entry->swpentry = swp;
         entry->objcg = objcg;
+       entry->referenced = true;
  
         old = xa_store(tree, offset, entry, GFP_KERNEL);
         if (xa_is_err(old)) {
author	Nhat Pham <nphamcs@gmail.com>
	Mon, 5 Aug 2024 23:22:42 +0000 (16:22 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 2 Sep 2024 03:26:02 +0000 (20:26 -0700)
include/linux/zswap.h		patch \| blob \| blame \| history
mm/zswap.c		patch \| blob \| blame \| history