From: Greg Kroah-Hartman Date: Fri, 21 Nov 2025 10:08:43 +0000 (+0100) Subject: 6.6-stable patches X-Git-Tag: v6.6.117~18 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ab2d042776444788a0d4cfd3529d7025efaa7cfb;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: mm-memcg-add-per-memcg-zswap-writeback-stat.patch mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch mm-memcg-change-flush_next_time-to-flush_last_time.patch mm-memcg-make-stats-flushing-threshold-per-memcg.patch mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch mm-memcg-restore-subtree-stats-flushing.patch mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch --- diff --git a/queue-6.6/mm-memcg-add-per-memcg-zswap-writeback-stat.patch b/queue-6.6/mm-memcg-add-per-memcg-zswap-writeback-stat.patch new file mode 100644 index 0000000000..3fc8279fd4 --- /dev/null +++ b/queue-6.6/mm-memcg-add-per-memcg-zswap-writeback-stat.patch @@ -0,0 +1,86 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:08 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:30 +0800 +Subject: mm: memcg: add per-memcg zswap writeback stat +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Bagas Sanjaya , Chris Li , Shuah Khan +Message-ID: <20251103075135.20254-3-leon.huangfu@shopee.com> + +From: Domenico Cerasuolo + +[ Upstream commit 7108cc3f765cafd48a6a35f8add140beaecfa75b ] + +Since zswap now writes back pages from memcg-specific LRUs, we now need a +new stat to show writebacks count for each memcg. 
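
[Editorial illustration, not part of the patch: the hunks below wire the new
ZSWPWB event into /proc/vmstat (as "zswpwb") and into the per-memcg event
table, next to the existing zswpin/zswpout counters, so the same key also
shows up per cgroup in memory.stat via the mm/memcontrol.c hunk. A minimal
userspace sketch that reads the three zswap counters from /proc/vmstat:]

/* Illustration only -- reads the zswap event counters ("zswpin",
 * "zswpout" and, after this patch, "zswpwb") from /proc/vmstat.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/vmstat", "r");
	char name[64];
	unsigned long long val;

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "zswpin") || !strcmp(name, "zswpout") ||
		    !strcmp(name, "zswpwb"))
			printf("%s = %llu\n", name, val);
	}
	fclose(f);
	return 0;
}
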
+ +[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB] + Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com +Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com +Suggested-by: Nhat Pham +Signed-off-by: Domenico Cerasuolo +Signed-off-by: Nhat Pham +Tested-by: Bagas Sanjaya +Reviewed-by: Yosry Ahmed +Cc: Chris Li +Cc: Dan Streetman +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Seth Jennings +Cc: Shakeel Butt +Cc: Shuah Khan +Cc: Vitaly Wool +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/vm_event_item.h | 1 + + mm/memcontrol.c | 1 + + mm/vmstat.c | 1 + + mm/zswap.c | 4 ++++ + 4 files changed, 7 insertions(+) + +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -145,6 +145,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS + #ifdef CONFIG_ZSWAP + ZSWPIN, + ZSWPOUT, ++ ZSWPWB, + #endif + #ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -700,6 +700,7 @@ static const unsigned int memcg_vm_event + #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, ++ ZSWPWB, + #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1397,6 +1397,7 @@ const char * const vmstat_text[] = { + #ifdef CONFIG_ZSWAP + "zswpin", + "zswpout", ++ "zswpwb", + #endif + #ifdef CONFIG_X86 + "direct_map_level2_splits", +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -674,6 +674,10 @@ static int zswap_reclaim_entry(struct zs + goto put_unlock; + } + ++ if (entry->objcg) ++ count_objcg_event(entry->objcg, ZSWPWB); ++ ++ count_vm_event(ZSWPWB); + /* + * Writeback started successfully, the page now belongs to the + * swapcache. Drop the entry from zswap - unless invalidate already diff --git a/queue-6.6/mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch b/queue-6.6/mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch new file mode 100644 index 0000000000..4cece193ea --- /dev/null +++ b/queue-6.6/mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch @@ -0,0 +1,113 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:02 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:29 +0800 +Subject: mm: memcg: add THP swap out info for anonymous reclaim +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Xin Hao , Michal Hocko , Muchun Song +Message-ID: <20251103075135.20254-2-leon.huangfu@shopee.com> + +From: Xin Hao + +[ Upstream commit 811244a501b967b00fecb1ae906d5dc6329c91e0 ] + +At present, we support per-memcg reclaim strategy, however we do not know +the number of transparent huge pages being reclaimed, as we know the +transparent huge pages need to be splited before reclaim them, and they +will bring some performance bottleneck effect. 
for example, when two +memcg (A & B) are doing reclaim for anonymous pages at same time, and 'A' +memcg is reclaiming a large number of transparent huge pages, we can +better analyze that the performance bottleneck will be caused by 'A' +memcg. therefore, in order to better analyze such problems, there add THP +swap out info for per-memcg. + +[akpm@linux-foundation.orgL fix swap_writepage_fs(), per Johannes] + Link: https://lkml.kernel.org/r/20230913213343.GB48476@cmpxchg.org +Link: https://lkml.kernel.org/r/20230913164938.16918-1-vernhao@tencent.com +Signed-off-by: Xin Hao +Suggested-by: Johannes Weiner +Acked-by: Johannes Weiner +Cc: Michal Hocko +Cc: Roman Gushchin +Cc: Shakeel Butt +Cc: Muchun Song +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ + mm/memcontrol.c | 2 ++ + mm/page_io.c | 8 ++++---- + mm/vmscan.c | 1 + + 4 files changed, 16 insertions(+), 4 deletions(-) + +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1532,6 +1532,15 @@ PAGE_SIZE multiple when read back. + collapsing an existing range of pages. This counter is not + present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + ++ thp_swpout (npn) ++ Number of transparent hugepages which are swapout in one piece ++ without splitting. ++ ++ thp_swpout_fallback (npn) ++ Number of transparent hugepages which were split before swapout. ++ Usually because failed to allocate some continuous swap space ++ for the huge page. ++ + memory.numa_stat + A read-only nested-keyed file which exists on non-root cgroups. + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -704,6 +704,8 @@ static const unsigned int memcg_vm_event + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, ++ THP_SWPOUT, ++ THP_SWPOUT_FALLBACK, + #endif + }; + +--- a/mm/page_io.c ++++ b/mm/page_io.c +@@ -208,8 +208,10 @@ int swap_writepage(struct page *page, st + static inline void count_swpout_vm_event(struct folio *folio) + { + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +- if (unlikely(folio_test_pmd_mappable(folio))) ++ if (unlikely(folio_test_pmd_mappable(folio))) { ++ count_memcg_folio_events(folio, THP_SWPOUT, 1); + count_vm_event(THP_SWPOUT); ++ } + #endif + count_vm_events(PSWPOUT, folio_nr_pages(folio)); + } +@@ -278,9 +280,6 @@ static void sio_write_complete(struct ki + set_page_dirty(page); + ClearPageReclaim(page); + } +- } else { +- for (p = 0; p < sio->pages; p++) +- count_swpout_vm_event(page_folio(sio->bvec[p].bv_page)); + } + + for (p = 0; p < sio->pages; p++) +@@ -296,6 +295,7 @@ static void swap_writepage_fs(struct pag + struct file *swap_file = sis->swap_file; + loff_t pos = page_file_offset(page); + ++ count_swpout_vm_event(page_folio(page)); + set_page_writeback(page); + unlock_page(page); + if (wbc->swap_plug) +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1922,6 +1922,7 @@ retry: + folio_list)) + goto activate_locked; + #ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + #endif + if (!add_to_swap(folio)) diff --git a/queue-6.6/mm-memcg-change-flush_next_time-to-flush_last_time.patch b/queue-6.6/mm-memcg-change-flush_next_time-to-flush_last_time.patch new file mode 100644 index 0000000000..9e35431a3f --- /dev/null +++ b/queue-6.6/mm-memcg-change-flush_next_time-to-flush_last_time.patch @@ -0,0 +1,88 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:16 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 
2025 15:51:31 +0800 +Subject: mm: memcg: change flush_next_time to flush_last_time +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li , Bagas Sanjaya , Greg Thelen , Ivan Babrou , Michal Koutny , Waiman Long , Wei Xu +Message-ID: <20251103075135.20254-4-leon.huangfu@shopee.com> + +From: Yosry Ahmed + +[ Upstream commit 508bed884767a8eb394640bae9edcdf082816c43 ] + +Patch series "mm: memcg: subtree stats flushing and thresholds", v4. + +This series attempts to address shortages in today's approach for memcg +stats flushing, namely occasionally stale or expensive stat reads. The +series does so by changing the threshold that we use to decide whether to +trigger a flush to be per memcg instead of global (patch 3), and then +changing flushing to be per memcg (i.e. subtree flushes) instead of +global (patch 5). + +This patch (of 5): + +flush_next_time is an inaccurate name. It's not the next time that +periodic flushing will happen, it's rather the next time that ratelimited +flushing can happen if the periodic flusher is late. + +Simplify its semantics by just storing the timestamp of the last flush +instead, flush_last_time. Move the 2*FLUSH_TIME addition to +mem_cgroup_flush_stats_ratelimited(), and add a comment explaining it. +This way, all the ratelimiting semantics live in one place. + +No functional change intended. 
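
[Editorial illustration, not part of the patch: a minimal userspace model of
the semantics described above. Record when the last flush happened and let
the ratelimited path flush only when the periodic flusher (period FLUSH_TIME)
is one full cycle late. The names are analogues of the kernel symbols, not
the kernel code itself:]

/* Illustration only -- a userspace model of "flush_last_time"
 * ratelimiting: remember when the last flush happened and only flush
 * again from the ratelimited path if the periodic flusher is one full
 * FLUSH_TIME cycle late.
 */
#include <stdio.h>
#include <time.h>

#define FLUSH_TIME 2.0			/* seconds, mirrors 2*HZ in the patch */

static double last_flush_time;		/* analogue of flush_last_time */

static double now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

static void do_flush(void)
{
	/* Record when the last flush happened, not the next deadline. */
	last_flush_time = now();
	printf("flushed at %.3f\n", last_flush_time);
}

static void flush_ratelimited(void)
{
	/* Only flush if the periodic flusher is one full cycle late. */
	if (now() > last_flush_time + 2 * FLUSH_TIME)
		do_flush();
}

int main(void)
{
	do_flush();			/* the periodic flusher just ran */
	flush_ratelimited();		/* too soon, so this is skipped */
	return 0;
}
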
+ +Link: https://lkml.kernel.org/r/20231129032154.3710765-1-yosryahmed@google.com +Link: https://lkml.kernel.org/r/20231129032154.3710765-2-yosryahmed@google.com +Signed-off-by: Yosry Ahmed +Tested-by: Domenico Cerasuolo +Acked-by: Shakeel Butt +Acked-by: Chris Li (Google) +Tested-by: Bagas Sanjaya +Cc: Greg Thelen +Cc: Ivan Babrou +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Michal Koutny +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Tejun Heo +Cc: Waiman Long +Cc: Wei Xu +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -590,7 +590,7 @@ static DECLARE_DEFERRABLE_WORK(stats_flu + static DEFINE_PER_CPU(unsigned int, stats_updates); + static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); + static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +-static u64 flush_next_time; ++static u64 flush_last_time; + + #define FLUSH_TIME (2UL*HZ) + +@@ -650,7 +650,7 @@ static void do_flush_stats(void) + atomic_xchg(&stats_flush_ongoing, 1)) + return; + +- WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); ++ WRITE_ONCE(flush_last_time, jiffies_64); + + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + +@@ -666,7 +666,8 @@ void mem_cgroup_flush_stats(void) + + void mem_cgroup_flush_stats_ratelimited(void) + { +- if (time_after64(jiffies_64, READ_ONCE(flush_next_time))) ++ /* Only flush if the periodic flusher is one full cycle late */ ++ if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) + mem_cgroup_flush_stats(); + } + diff --git a/queue-6.6/mm-memcg-make-stats-flushing-threshold-per-memcg.patch b/queue-6.6/mm-memcg-make-stats-flushing-threshold-per-memcg.patch new file mode 100644 index 0000000000..b34d1502b3 --- /dev/null +++ b/queue-6.6/mm-memcg-make-stats-flushing-threshold-per-memcg.patch @@ -0,0 +1,258 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:30 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:33 +0800 +Subject: mm: memcg: make stats flushing threshold per-memcg +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li , Greg Thelen , Ivan Babrou , Michal Koutny , Waiman Long , Wei Xu +Message-ID: <20251103075135.20254-6-leon.huangfu@shopee.com> + +From: Yosry Ahmed + +[ Upstream commit 8d59d2214c2362e7a9d185d80b613e632581af7b ] + +A global counter for the magnitude of memcg stats update is maintained on +the memcg side to avoid invoking rstat flushes when the pending updates +are not significant. This avoids unnecessary flushes, which are not very +cheap even if there isn't a lot of stats to flush. It also avoids +unnecessary lock contention on the underlying global rstat lock. + +Make this threshold per-memcg. The scheme is followed where percpu (now +also per-memcg) counters are incremented in the update path, and only +propagated to per-memcg atomics when they exceed a certain threshold. 
+ +This provides two benefits: (a) On large machines with a lot of memcgs, +the global threshold can be reached relatively fast, so guarding the +underlying lock becomes less effective. Making the threshold per-memcg +avoids this. + +(b) Having a global threshold makes it hard to do subtree flushes, as we +cannot reset the global counter except for a full flush. Per-memcg +counters removes this as a blocker from doing subtree flushes, which helps +avoid unnecessary work when the stats of a small subtree are needed. + +Nothing is free, of course. This comes at a cost: (a) A new per-cpu +counter per memcg, consuming NR_CPUS * NR_MEMCGS * 4 bytes. The extra +memory usage is insigificant. + +(b) More work on the update side, although in the common case it will only +be percpu counter updates. The amount of work scales with the number of +ancestors (i.e. tree depth). This is not a new concept, adding a cgroup +to the rstat tree involves a parent loop, so is charging. Testing results +below show no significant regressions. + +(c) The error margin in the stats for the system as a whole increases from +NR_CPUS * MEMCG_CHARGE_BATCH to NR_CPUS * MEMCG_CHARGE_BATCH * NR_MEMCGS. +This is probably fine because we have a similar per-memcg error in charges +coming from percpu stocks, and we have a periodic flusher that makes sure +we always flush all the stats every 2s anyway. + +This patch was tested to make sure no significant regressions are +introduced on the update path as follows. The following benchmarks were +ran in a cgroup that is 2 levels deep (/sys/fs/cgroup/a/b/): + +(1) Running 22 instances of netperf on a 44 cpu machine with +hyperthreading disabled. All instances are run in a level 2 cgroup, as +well as netserver: + # netserver -6 + # netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K + +Averaging 20 runs, the numbers are as follows: +Base: 40198.0 mbps +Patched: 38629.7 mbps (-3.9%) + +The regression is minimal, especially for 22 instances in the same +cgroup sharing all ancestors (so updating the same atomics). + +(2) will-it-scale page_fault tests. These tests (specifically +per_process_ops in page_fault3 test) detected a 25.9% regression before +for a change in the stats update path [1]. 
These are the +numbers from 10 runs (+ is good) on a machine with 256 cpus: + + LABEL | MEAN | MEDIAN | STDDEV | +------------------------------+-------------+-------------+------------- + page_fault1_per_process_ops | | | | + (A) base | 270249.164 | 265437.000 | 13451.836 | + (B) patched | 261368.709 | 255725.000 | 13394.767 | + | -3.29% | -3.66% | | + page_fault1_per_thread_ops | | | | + (A) base | 242111.345 | 239737.000 | 10026.031 | + (B) patched | 237057.109 | 235305.000 | 9769.687 | + | -2.09% | -1.85% | | + page_fault1_scalability | | | + (A) base | 0.034387 | 0.035168 | 0.0018283 | + (B) patched | 0.033988 | 0.034573 | 0.0018056 | + | -1.16% | -1.69% | | + page_fault2_per_process_ops | | | + (A) base | 203561.836 | 203301.000 | 2550.764 | + (B) patched | 197195.945 | 197746.000 | 2264.263 | + | -3.13% | -2.73% | | + page_fault2_per_thread_ops | | | + (A) base | 171046.473 | 170776.000 | 1509.679 | + (B) patched | 166626.327 | 166406.000 | 768.753 | + | -2.58% | -2.56% | | + page_fault2_scalability | | | + (A) base | 0.054026 | 0.053821 | 0.00062121 | + (B) patched | 0.053329 | 0.05306 | 0.00048394 | + | -1.29% | -1.41% | | + page_fault3_per_process_ops | | | + (A) base | 1295807.782 | 1297550.000 | 5907.585 | + (B) patched | 1275579.873 | 1273359.000 | 8759.160 | + | -1.56% | -1.86% | | + page_fault3_per_thread_ops | | | + (A) base | 391234.164 | 390860.000 | 1760.720 | + (B) patched | 377231.273 | 376369.000 | 1874.971 | + | -3.58% | -3.71% | | + page_fault3_scalability | | | + (A) base | 0.60369 | 0.60072 | 0.0083029 | + (B) patched | 0.61733 | 0.61544 | 0.009855 | + | +2.26% | +2.45% | | + +All regressions seem to be minimal, and within the normal variance for the +benchmark. The fix for [1] assumes that 3% is noise -- and there were no +further practical complaints), so hopefully this means that such +variations in these microbenchmarks do not reflect on practical workloads. + +(3) I also ran stress-ng in a nested cgroup and did not observe any +obvious regressions. 
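
[Editorial illustration, not part of the patch: a simplified single-threaded
model of the update-side scheme described above. Pending update counts are
accumulated along the parent walk and folded into a per-memcg total once they
reach a batch size, and a flush is considered worthwhile only when that total
exceeds batch * online CPUs. The real code uses per-CPU counters and 64-bit
atomics; CHARGE_BATCH, NR_CPUS and the struct fields below are stand-ins:]

/* Illustration only -- models memcg_rstat_updated()/memcg_should_flush_stats()
 * with plain fields instead of per-CPU counters and atomics.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH	64		/* stands in for MEMCG_CHARGE_BATCH */
#define NR_CPUS		8		/* stands in for num_online_cpus() */

struct memcg {
	struct memcg *parent;
	long pending;			/* models the per-CPU stats_updates */
	long stats_updates;		/* models the per-memcg atomic total */
};

static bool should_flush(struct memcg *memcg)
{
	return memcg->stats_updates > CHARGE_BATCH * NR_CPUS;
}

static void rstat_updated(struct memcg *memcg, long val)
{
	for (; memcg; memcg = memcg->parent) {
		memcg->pending += labs(val);
		if (memcg->pending < CHARGE_BATCH)
			continue;
		/* Fold into the per-memcg total unless it is already flush-able. */
		if (!should_flush(memcg))
			memcg->stats_updates += memcg->pending;
		memcg->pending = 0;
	}
}

int main(void)
{
	struct memcg root = { 0 }, child = { .parent = &root };
	int i;

	for (i = 0; i < 1000; i++)
		rstat_updated(&child, 1);
	printf("child flush needed: %d, root flush needed: %d\n",
	       should_flush(&child), should_flush(&root));
	return 0;
}
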
+ +[1]https://lore.kernel.org/all/20190520063534.GB19312@shao2-debian/ + +Link: https://lkml.kernel.org/r/20231129032154.3710765-4-yosryahmed@google.com +Signed-off-by: Yosry Ahmed +Suggested-by: Johannes Weiner +Tested-by: Domenico Cerasuolo +Acked-by: Shakeel Butt +Cc: Chris Li +Cc: Greg Thelen +Cc: Ivan Babrou +Cc: Michal Hocko +Cc: Michal Koutny +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Tejun Heo +Cc: Waiman Long +Cc: Wei Xu +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 50 ++++++++++++++++++++++++++++++++++---------------- + 1 file changed, 34 insertions(+), 16 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -628,6 +628,9 @@ struct memcg_vmstats_percpu { + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; ++ ++ /* Stats updates since the last flush */ ++ unsigned int stats_updates; + }; + + struct memcg_vmstats { +@@ -642,6 +645,9 @@ struct memcg_vmstats { + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_MEMCG_EVENTS]; ++ ++ /* Stats updates since the last flush */ ++ atomic64_t stats_updates; + }; + + /* +@@ -661,9 +667,7 @@ struct memcg_vmstats { + */ + static void flush_memcg_stats_dwork(struct work_struct *w); + static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +-static DEFINE_PER_CPU(unsigned int, stats_updates); + static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); +-static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + static u64 flush_last_time; + + #define FLUSH_TIME (2UL*HZ) +@@ -690,26 +694,37 @@ static void memcg_stats_unlock(void) + preempt_enable_nested(); + } + ++ ++static bool memcg_should_flush_stats(struct mem_cgroup *memcg) ++{ ++ return atomic64_read(&memcg->vmstats->stats_updates) > ++ MEMCG_CHARGE_BATCH * num_online_cpus(); ++} ++ + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) + { ++ int cpu = smp_processor_id(); + unsigned int x; + + if (!val) + return; + +- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); ++ cgroup_rstat_updated(memcg->css.cgroup, cpu); ++ ++ for (; memcg; memcg = parent_mem_cgroup(memcg)) { ++ x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, ++ abs(val)); ++ ++ if (x < MEMCG_CHARGE_BATCH) ++ continue; + +- x = __this_cpu_add_return(stats_updates, abs(val)); +- if (x > MEMCG_CHARGE_BATCH) { + /* +- * If stats_flush_threshold exceeds the threshold +- * (>num_online_cpus()), cgroup stats update will be triggered +- * in __mem_cgroup_flush_stats(). Increasing this var further +- * is redundant and simply adds overhead in atomic update. ++ * If @memcg is already flush-able, increasing stats_updates is ++ * redundant. Avoid the overhead of the atomic update. 
+ */ +- if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) +- atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); +- __this_cpu_write(stats_updates, 0); ++ if (!memcg_should_flush_stats(memcg)) ++ atomic64_add(x, &memcg->vmstats->stats_updates); ++ __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); + } + } + +@@ -728,13 +743,12 @@ static void do_flush_stats(void) + + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + +- atomic_set(&stats_flush_threshold, 0); + atomic_set(&stats_flush_ongoing, 0); + } + + void mem_cgroup_flush_stats(void) + { +- if (atomic_read(&stats_flush_threshold) > num_online_cpus()) ++ if (memcg_should_flush_stats(root_mem_cgroup)) + do_flush_stats(); + } + +@@ -748,8 +762,8 @@ void mem_cgroup_flush_stats_ratelimited( + static void flush_memcg_stats_dwork(struct work_struct *w) + { + /* +- * Always flush here so that flushing in latency-sensitive paths is +- * as cheap as possible. ++ * Deliberately ignore memcg_should_flush_stats() here so that flushing ++ * in latency-sensitive paths is as cheap as possible. + */ + do_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); +@@ -5658,6 +5672,10 @@ static void mem_cgroup_css_rstat_flush(s + } + } + } ++ statc->stats_updates = 0; ++ /* We are in a per-cpu loop here, only do the atomic write once */ ++ if (atomic64_read(&memcg->vmstats->stats_updates)) ++ atomic64_set(&memcg->vmstats->stats_updates, 0); + } + + #ifdef CONFIG_MMU diff --git a/queue-6.6/mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch b/queue-6.6/mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch new file mode 100644 index 0000000000..eb5783f46f --- /dev/null +++ b/queue-6.6/mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch @@ -0,0 +1,204 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:23 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:32 +0800 +Subject: mm: memcg: move vmstats structs definition above flushing code +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li , Greg Thelen , Ivan Babrou , Michal Koutny , Waiman Long , Wei Xu +Message-ID: <20251103075135.20254-5-leon.huangfu@shopee.com> + +From: Yosry Ahmed + +[ Upstream commit e0bf1dc859fdd08ef738824710770a30a8069433 ] + +The following patch will make use of those structs in the flushing code, +so move their definitions (and a few other dependencies) a little bit up +to reduce the diff noise in the following patch. + +No functional change intended. 
+ +Link: https://lkml.kernel.org/r/20231129032154.3710765-3-yosryahmed@google.com +Signed-off-by: Yosry Ahmed +Tested-by: Domenico Cerasuolo +Acked-by: Shakeel Butt +Cc: Chris Li +Cc: Greg Thelen +Cc: Ivan Babrou +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Michal Koutny +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Tejun Heo +Cc: Waiman Long +Cc: Wei Xu +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 148 ++++++++++++++++++++++++++++---------------------------- + 1 file changed, 74 insertions(+), 74 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -570,6 +570,80 @@ mem_cgroup_largest_soft_limit_node(struc + return mz; + } + ++/* Subset of vm_event_item to report for memcg event stats */ ++static const unsigned int memcg_vm_event_stat[] = { ++ PGPGIN, ++ PGPGOUT, ++ PGSCAN_KSWAPD, ++ PGSCAN_DIRECT, ++ PGSCAN_KHUGEPAGED, ++ PGSTEAL_KSWAPD, ++ PGSTEAL_DIRECT, ++ PGSTEAL_KHUGEPAGED, ++ PGFAULT, ++ PGMAJFAULT, ++ PGREFILL, ++ PGACTIVATE, ++ PGDEACTIVATE, ++ PGLAZYFREE, ++ PGLAZYFREED, ++#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ++ ZSWPIN, ++ ZSWPOUT, ++ ZSWPWB, ++#endif ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ THP_FAULT_ALLOC, ++ THP_COLLAPSE_ALLOC, ++ THP_SWPOUT, ++ THP_SWPOUT_FALLBACK, ++#endif ++}; ++ ++#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) ++static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; ++ ++static void init_memcg_events(void) ++{ ++ int i; ++ ++ for (i = 0; i < NR_MEMCG_EVENTS; ++i) ++ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; ++} ++ ++static inline int memcg_events_index(enum vm_event_item idx) ++{ ++ return mem_cgroup_events_index[idx] - 1; ++} ++ ++struct memcg_vmstats_percpu { ++ /* Local (CPU and cgroup) page state & events */ ++ long state[MEMCG_NR_STAT]; ++ unsigned long events[NR_MEMCG_EVENTS]; ++ ++ /* Delta calculation for lockless upward propagation */ ++ long state_prev[MEMCG_NR_STAT]; ++ unsigned long events_prev[NR_MEMCG_EVENTS]; ++ ++ /* Cgroup1: threshold notifications & softlimit tree updates */ ++ unsigned long nr_page_events; ++ unsigned long targets[MEM_CGROUP_NTARGETS]; ++}; ++ ++struct memcg_vmstats { ++ /* Aggregated (CPU and subtree) page state & events */ ++ long state[MEMCG_NR_STAT]; ++ unsigned long events[NR_MEMCG_EVENTS]; ++ ++ /* Non-hierarchical (CPU aggregated) page state & events */ ++ long state_local[MEMCG_NR_STAT]; ++ unsigned long events_local[NR_MEMCG_EVENTS]; ++ ++ /* Pending child counts during tree propagation */ ++ long state_pending[MEMCG_NR_STAT]; ++ unsigned long events_pending[NR_MEMCG_EVENTS]; ++}; ++ + /* + * memcg and lruvec stats flushing + * +@@ -681,80 +755,6 @@ static void flush_memcg_stats_dwork(stru + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); + } + +-/* Subset of vm_event_item to report for memcg event stats */ +-static const unsigned int memcg_vm_event_stat[] = { +- PGPGIN, +- PGPGOUT, +- PGSCAN_KSWAPD, +- PGSCAN_DIRECT, +- PGSCAN_KHUGEPAGED, +- PGSTEAL_KSWAPD, +- PGSTEAL_DIRECT, +- PGSTEAL_KHUGEPAGED, +- PGFAULT, +- PGMAJFAULT, +- PGREFILL, +- PGACTIVATE, +- PGDEACTIVATE, +- PGLAZYFREE, +- PGLAZYFREED, +-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +- ZSWPIN, +- ZSWPOUT, +- ZSWPWB, +-#endif +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE +- THP_FAULT_ALLOC, +- THP_COLLAPSE_ALLOC, +- THP_SWPOUT, +- THP_SWPOUT_FALLBACK, +-#endif +-}; +- +-#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +-static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] 
__read_mostly; +- +-static void init_memcg_events(void) +-{ +- int i; +- +- for (i = 0; i < NR_MEMCG_EVENTS; ++i) +- mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +-} +- +-static inline int memcg_events_index(enum vm_event_item idx) +-{ +- return mem_cgroup_events_index[idx] - 1; +-} +- +-struct memcg_vmstats_percpu { +- /* Local (CPU and cgroup) page state & events */ +- long state[MEMCG_NR_STAT]; +- unsigned long events[NR_MEMCG_EVENTS]; +- +- /* Delta calculation for lockless upward propagation */ +- long state_prev[MEMCG_NR_STAT]; +- unsigned long events_prev[NR_MEMCG_EVENTS]; +- +- /* Cgroup1: threshold notifications & softlimit tree updates */ +- unsigned long nr_page_events; +- unsigned long targets[MEM_CGROUP_NTARGETS]; +-}; +- +-struct memcg_vmstats { +- /* Aggregated (CPU and subtree) page state & events */ +- long state[MEMCG_NR_STAT]; +- unsigned long events[NR_MEMCG_EVENTS]; +- +- /* Non-hierarchical (CPU aggregated) page state & events */ +- long state_local[MEMCG_NR_STAT]; +- unsigned long events_local[NR_MEMCG_EVENTS]; +- +- /* Pending child counts during tree propagation */ +- long state_pending[MEMCG_NR_STAT]; +- unsigned long events_pending[NR_MEMCG_EVENTS]; +-}; +- + unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) + { + long x = READ_ONCE(memcg->vmstats->state[idx]); diff --git a/queue-6.6/mm-memcg-restore-subtree-stats-flushing.patch b/queue-6.6/mm-memcg-restore-subtree-stats-flushing.patch new file mode 100644 index 0000000000..067e475211 --- /dev/null +++ b/queue-6.6/mm-memcg-restore-subtree-stats-flushing.patch @@ -0,0 +1,321 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:44 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:35 +0800 +Subject: mm: memcg: restore subtree stats flushing +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li , Greg Thelen , Ivan Babrou , Michal Koutny , Waiman Long , Wei Xu +Message-ID: <20251103075135.20254-8-leon.huangfu@shopee.com> + +From: Yosry Ahmed + +[ Upstream commit 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 ] + +Stats flushing for memcg currently follows the following rules: +- Always flush the entire memcg hierarchy (i.e. flush the root). +- Only one flusher is allowed at a time. If someone else tries to flush + concurrently, they skip and return immediately. +- A periodic flusher flushes all the stats every 2 seconds. + +The reason this approach is followed is because all flushes are serialized +by a global rstat spinlock. On the memcg side, flushing is invoked from +userspace reads as well as in-kernel flushers (e.g. reclaim, refault, +etc). This approach aims to avoid serializing all flushers on the global +lock, which can cause a significant performance hit under high +concurrency. + +This approach has the following problems: +- Occasionally a userspace read of the stats of a non-root cgroup will + be too expensive as it has to flush the entire hierarchy [1]. 
+- Sometimes the stats accuracy are compromised if there is an ongoing + flush, and we skip and return before the subtree of interest is + actually flushed, yielding stale stats (by up to 2s due to periodic + flushing). This is more visible when reading stats from userspace, + but can also affect in-kernel flushers. + +The latter problem is particulary a concern when userspace reads stats +after an event occurs, but gets stats from before the event. Examples: +- When memory usage / pressure spikes, a userspace OOM handler may look + at the stats of different memcgs to select a victim based on various + heuristics (e.g. how much private memory will be freed by killing + this). Reading stale stats from before the usage spike in this case + may cause a wrongful OOM kill. +- A proactive reclaimer may read the stats after writing to + memory.reclaim to measure the success of the reclaim operation. Stale + stats from before reclaim may give a false negative. +- Reading the stats of a parent and a child memcg may be inconsistent + (child larger than parent), if the flush doesn't happen when the + parent is read, but happens when the child is read. + +As for in-kernel flushers, they will occasionally get stale stats. No +regressions are currently known from this, but if there are regressions, +they would be very difficult to debug and link to the source of the +problem. + +This patch aims to fix these problems by restoring subtree flushing, and +removing the unified/coalesced flushing logic that skips flushing if there +is an ongoing flush. This change would introduce a significant regression +with global stats flushing thresholds. With per-memcg stats flushing +thresholds, this seems to perform really well. The thresholds protect the +underlying lock from unnecessary contention. + +This patch was tested in two ways to ensure the latency of flushing is +up to par, on a machine with 384 cpus: + +- A synthetic test with 5000 concurrent workers in 500 cgroups doing + allocations and reclaim, as well as 1000 readers for memory.stat + (variation of [2]). No regressions were noticed in the total runtime. + Note that significant regressions in this test are observed with + global stats thresholds, but not with per-memcg thresholds. + +- A synthetic stress test for concurrently reading memcg stats while + memory allocation/freeing workers are running in the background, + provided by Wei Xu [3]. With 250k threads reading the stats every + 100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01% + of reads take more than 1ms, and no reads take more than 100ms. 
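
[Editorial illustration, not part of the patch: a small sketch that times a
single read of one cgroup's memory.stat, the operation whose latency the
numbers above describe. With this series such a read flushes at most that
cgroup's subtree, and only when enough updates are pending. The cgroup path
follows the /sys/fs/cgroup/a/b example used earlier in the series and must
exist on the test system:]

/* Illustration only -- times one read of a cgroup's memory.stat.
 * Adjust the path to an existing cgroup v2 directory.
 */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/a/b/memory.stat";	/* assumed path */
	char buf[8192];
	struct timespec t0, t1;
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	clock_gettime(CLOCK_MONOTONIC, &t0);
	n = read(fd, buf, sizeof(buf));	/* flushes at most this memcg's subtree */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	close(fd);

	printf("read %zd bytes in %ld us\n", n,
	       (long)((t1.tv_sec - t0.tv_sec) * 1000000L +
		      (t1.tv_nsec - t0.tv_nsec) / 1000));
	return 0;
}
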
+ +[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/ +[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/ +[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/ + +[akpm@linux-foundation.org: fix mm/zswap.c] +[yosryahmed@google.com: remove stats flushing mutex] + Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com +Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com +Signed-off-by: Yosry Ahmed +Tested-by: Domenico Cerasuolo +Acked-by: Shakeel Butt +Cc: Chris Li +Cc: Greg Thelen +Cc: Ivan Babrou +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Michal Koutny +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Tejun Heo +Cc: Waiman Long +Cc: Wei Xu +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/memcontrol.h | 8 ++--- + mm/memcontrol.c | 68 +++++++++++++++++++++++++-------------------- + mm/vmscan.c | 2 - + mm/workingset.c | 10 ++++-- + 4 files changed, 51 insertions(+), 37 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1039,8 +1039,8 @@ static inline unsigned long lruvec_page_ + return x; + } + +-void mem_cgroup_flush_stats(void); +-void mem_cgroup_flush_stats_ratelimited(void); ++void mem_cgroup_flush_stats(struct mem_cgroup *memcg); ++void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); + + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); +@@ -1515,11 +1515,11 @@ static inline unsigned long lruvec_page_ + return node_page_state(lruvec_pgdat(lruvec), idx); + } + +-static inline void mem_cgroup_flush_stats(void) ++static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) + { + } + +-static inline void mem_cgroup_flush_stats_ratelimited(void) ++static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) + { + } + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -667,7 +667,6 @@ struct memcg_vmstats { + */ + static void flush_memcg_stats_dwork(struct work_struct *w); + static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +-static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); + static u64 flush_last_time; + + #define FLUSH_TIME (2UL*HZ) +@@ -728,35 +727,40 @@ static inline void memcg_rstat_updated(s + } + } + +-static void do_flush_stats(void) ++static void do_flush_stats(struct mem_cgroup *memcg) + { +- /* +- * We always flush the entire tree, so concurrent flushers can just +- * skip. This avoids a thundering herd problem on the rstat global lock +- * from memcg flushers (e.g. reclaim, refault, etc). +- */ +- if (atomic_read(&stats_flush_ongoing) || +- atomic_xchg(&stats_flush_ongoing, 1)) +- return; +- +- WRITE_ONCE(flush_last_time, jiffies_64); +- +- cgroup_rstat_flush(root_mem_cgroup->css.cgroup); ++ if (mem_cgroup_is_root(memcg)) ++ WRITE_ONCE(flush_last_time, jiffies_64); + +- atomic_set(&stats_flush_ongoing, 0); ++ cgroup_rstat_flush(memcg->css.cgroup); + } + +-void mem_cgroup_flush_stats(void) ++/* ++ * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree ++ * @memcg: root of the subtree to flush ++ * ++ * Flushing is serialized by the underlying global rstat lock. There is also a ++ * minimum amount of work to be done even if there are no stat updates to flush. ++ * Hence, we only flush the stats if the updates delta exceeds a threshold. 
This ++ * avoids unnecessary work and contention on the underlying lock. ++ */ ++void mem_cgroup_flush_stats(struct mem_cgroup *memcg) + { +- if (memcg_should_flush_stats(root_mem_cgroup)) +- do_flush_stats(); ++ if (mem_cgroup_disabled()) ++ return; ++ ++ if (!memcg) ++ memcg = root_mem_cgroup; ++ ++ if (memcg_should_flush_stats(memcg)) ++ do_flush_stats(memcg); + } + +-void mem_cgroup_flush_stats_ratelimited(void) ++void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) + { + /* Only flush if the periodic flusher is one full cycle late */ + if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + } + + static void flush_memcg_stats_dwork(struct work_struct *w) +@@ -765,7 +769,7 @@ static void flush_memcg_stats_dwork(stru + * Deliberately ignore memcg_should_flush_stats() here so that flushing + * in latency-sensitive paths is as cheap as possible. + */ +- do_flush_stats(); ++ do_flush_stats(root_mem_cgroup); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); + } + +@@ -1597,7 +1601,7 @@ static void memcg_stat_format(struct mem + * + * Current memory state: + */ +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + u64 size; +@@ -4047,7 +4051,7 @@ static int memcg_numa_stat_show(struct s + int nid; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + seq_printf(m, "%s=%lu", stat->name, +@@ -4122,7 +4126,7 @@ static void memcg1_stat_format(struct me + + BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); + +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; +@@ -4624,7 +4628,7 @@ void mem_cgroup_wb_stats(struct bdi_writ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); +@@ -6704,7 +6708,7 @@ static int memory_numa_stat_show(struct + int i; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(memcg); + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + int nid; +@@ -7868,7 +7872,11 @@ bool obj_cgroup_may_zswap(struct obj_cgr + break; + } + +- cgroup_rstat_flush(memcg->css.cgroup); ++ /* ++ * mem_cgroup_flush_stats() ignores small changes. Use ++ * do_flush_stats() directly to get accurate stats for charging. ++ */ ++ do_flush_stats(memcg); + pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; + if (pages < max) + continue; +@@ -7933,8 +7941,10 @@ void obj_cgroup_uncharge_zswap(struct ob + static u64 zswap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) + { +- cgroup_rstat_flush(css->cgroup); +- return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); ++ struct mem_cgroup *memcg = mem_cgroup_from_css(css); ++ ++ mem_cgroup_flush_stats(memcg); ++ return memcg_page_state(memcg, MEMCG_ZSWAP_B); + } + + static int zswap_max_show(struct seq_file *m, void *v) +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2911,7 +2911,7 @@ static void prepare_scan_count(pg_data_t + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. 
+ */ +- mem_cgroup_flush_stats(); ++ mem_cgroup_flush_stats(sc->target_mem_cgroup); + + /* + * Determine the scan balance between anon and file LRUs. +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow + + rcu_read_unlock(); + +- /* Flush stats (and potentially sleep) outside the RCU read section */ +- mem_cgroup_flush_stats_ratelimited(); ++ /* ++ * Flush stats (and potentially sleep) outside the RCU read section. ++ * XXX: With per-memcg flushing and thresholding, is ratelimiting ++ * still needed here? ++ */ ++ mem_cgroup_flush_stats_ratelimited(eviction_memcg); + + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); +@@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes( + struct lruvec *lruvec; + int i; + +- mem_cgroup_flush_stats_ratelimited(); ++ mem_cgroup_flush_stats_ratelimited(sc->memcg); + lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state_local(lruvec, diff --git a/queue-6.6/mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch b/queue-6.6/mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch new file mode 100644 index 0000000000..f64f82936a --- /dev/null +++ b/queue-6.6/mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch @@ -0,0 +1,142 @@ +From leon.huangfu@shopee.com Mon Nov 3 08:53:37 2025 +From: Leon Huang Fu +Date: Mon, 3 Nov 2025 15:51:34 +0800 +Subject: mm: workingset: move the stats flush into workingset_test_recent() +To: stable@vger.kernel.org, greg@kroah.com +Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li , Greg Thelen , Ivan Babrou , Michal Koutny , Waiman Long , Wei Xu +Message-ID: <20251103075135.20254-7-leon.huangfu@shopee.com> + +From: Yosry Ahmed + +[ Upstream commit b006847222623ac3cda8589d15379eac86a2bcb7 ] + +The workingset code flushes the stats in workingset_refault() to get +accurate stats of the eviction memcg. In preparation for more scoped +flushed and passing the eviction memcg to the flush call, move the call to +workingset_test_recent() where we have a pointer to the eviction memcg. + +The flush call is sleepable, and cannot be made in an rcu read section. +Hence, minimize the rcu read section by also moving it into +workingset_test_recent(). Furthermore, instead of holding the rcu read +lock throughout workingset_test_recent(), only hold it briefly to get a +ref on the eviction memcg. This allows us to make the flush call after we +get the eviction memcg. + +As for workingset_refault(), nothing else there appears to be protected by +rcu. The memcg of the faulted folio (which is not necessarily the same as +the eviction memcg) is protected by the folio lock, which is held from all +callsites. Add a VM_BUG_ON() to make sure this doesn't change from under +us. + +No functional change intended. 
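
[Editorial illustration, not part of the patch: a userspace analogue of the
ordering described above. Pin the object inside a short non-sleeping critical
section (standing in for the RCU read section plus mem_cgroup_tryget()), drop
the lock, and only then do the work that may sleep (the stats flush),
releasing the reference afterwards. The mutex and refcount are plain pthread
stand-ins, not the kernel primitives:]

/* Illustration only -- take a reference under a short lock, then do
 * sleepable work outside the lock while holding only the reference.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct obj {
	int refcount;		/* models the memcg reference count */
	bool live;
};

static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj eviction_memcg = { .refcount = 1, .live = true };

static struct obj *lookup_and_tryget(void)
{
	struct obj *o = NULL;

	pthread_mutex_lock(&lookup_lock);	/* short, non-sleeping section */
	if (eviction_memcg.live) {
		eviction_memcg.refcount++;
		o = &eviction_memcg;
	}
	pthread_mutex_unlock(&lookup_lock);
	return o;
}

static void put(struct obj *o)
{
	pthread_mutex_lock(&lookup_lock);
	o->refcount--;
	pthread_mutex_unlock(&lookup_lock);
}

static void flush_stats(struct obj *o)
{
	(void)o;
	usleep(1000);		/* may sleep, so it must run outside the lock */
}

int main(void)
{
	struct obj *o = lookup_and_tryget();

	if (!o)
		return 1;
	flush_stats(o);		/* sleepable work done with only a reference held */
	put(o);
	return 0;
}
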
+ +Link: https://lkml.kernel.org/r/20231129032154.3710765-5-yosryahmed@google.com +Signed-off-by: Yosry Ahmed +Tested-by: Domenico Cerasuolo +Acked-by: Shakeel Butt +Cc: Chris Li +Cc: Greg Thelen +Cc: Ivan Babrou +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Michal Koutny +Cc: Muchun Song +Cc: Roman Gushchin +Cc: Tejun Heo +Cc: Waiman Long +Cc: Wei Xu +Signed-off-by: Andrew Morton +Signed-off-by: Leon Huang Fu +Signed-off-by: Greg Kroah-Hartman +--- + mm/workingset.c | 36 ++++++++++++++++++++++++------------ + 1 file changed, 24 insertions(+), 12 deletions(-) + +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -425,8 +425,16 @@ bool workingset_test_recent(void *shadow + struct pglist_data *pgdat; + unsigned long eviction; + +- if (lru_gen_enabled()) +- return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); ++ rcu_read_lock(); ++ ++ if (lru_gen_enabled()) { ++ bool recent = lru_gen_test_recent(shadow, file, ++ &eviction_lruvec, &eviction, workingset); ++ ++ rcu_read_unlock(); ++ return recent; ++ } ++ + + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); + eviction <<= bucket_order; +@@ -448,8 +456,16 @@ bool workingset_test_recent(void *shadow + * configurations instead. + */ + eviction_memcg = mem_cgroup_from_id(memcgid); +- if (!mem_cgroup_disabled() && !eviction_memcg) ++ if (!mem_cgroup_disabled() && ++ (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { ++ rcu_read_unlock(); + return false; ++ } ++ ++ rcu_read_unlock(); ++ ++ /* Flush stats (and potentially sleep) outside the RCU read section */ ++ mem_cgroup_flush_stats_ratelimited(); + + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); +@@ -493,6 +509,7 @@ bool workingset_test_recent(void *shadow + } + } + ++ mem_cgroup_put(eviction_memcg); + return refault_distance <= workingset_size; + } + +@@ -519,19 +536,16 @@ void workingset_refault(struct folio *fo + return; + } + +- /* Flush stats (and potentially sleep) before holding RCU read lock */ +- mem_cgroup_flush_stats_ratelimited(); +- +- rcu_read_lock(); +- + /* + * The activation decision for this folio is made at the level + * where the eviction occurred, as that is where the LRU order + * during folio reclaim is being determined. + * + * However, the cgroup that will own the folio is the one that +- * is actually experiencing the refault event. ++ * is actually experiencing the refault event. Make sure the folio is ++ * locked to guarantee folio_memcg() stability throughout. 
+ */ ++ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); +@@ -540,7 +554,7 @@ void workingset_refault(struct folio *fo + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); + + if (!workingset_test_recent(shadow, file, &workingset)) +- goto out; ++ return; + + folio_set_active(folio); + workingset_age_nonresident(lruvec, nr); +@@ -556,8 +570,6 @@ void workingset_refault(struct folio *fo + lru_note_cost_refault(folio); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); + } +-out: +- rcu_read_unlock(); + } + + /** diff --git a/queue-6.6/series b/queue-6.6/series index cec3e8c7d6..0bc28ad41b 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -508,3 +508,10 @@ scsi-ufs-core-add-ufshcd_quirk_keys_in_prdt.patch scsi-ufs-core-add-a-quirk-for-handling-broken-lsdbs-field-in-controller-capabilities-register.patch scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch +mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch +mm-memcg-add-per-memcg-zswap-writeback-stat.patch +mm-memcg-change-flush_next_time-to-flush_last_time.patch +mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch +mm-memcg-make-stats-flushing-threshold-per-memcg.patch +mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch +mm-memcg-restore-subtree-stats-flushing.patch