--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:08 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:30 +0800
+Subject: mm: memcg: add per-memcg zswap writeback stat
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Bagas Sanjaya <bagasdotme@gmail.com>, Chris Li <chrisl@kernel.org>, Shuah Khan <shuah@kernel.org>
+Message-ID: <20251103075135.20254-3-leon.huangfu@shopee.com>
+
+From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+
+[ Upstream commit 7108cc3f765cafd48a6a35f8add140beaecfa75b ]
+
+Since zswap now writes back pages from memcg-specific LRUs, we need a new
+stat to show the writeback count for each memcg.
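+
+As a rough usage sketch (not part of the upstream change): the new counter
+appears as a "zswpwb" line in each cgroup's memory.stat on cgroup v2, next
+to the existing zswpin/zswpout entries. A minimal userspace reader might
+look like the following; the cgroup path is only an example:
+
+  /* sketch: print the per-memcg zswpwb counter from memory.stat */
+  #include <stdio.h>
+  #include <string.h>
+
+  int main(void)
+  {
+      /* example path, adjust to the cgroup of interest */
+      FILE *f = fopen("/sys/fs/cgroup/example/memory.stat", "r");
+      char key[64];
+      unsigned long long val;
+
+      if (!f)
+          return 1;
+      while (fscanf(f, "%63s %llu", key, &val) == 2) {
+          if (!strcmp(key, "zswpwb"))
+              printf("zswpwb: %llu\n", val);
+      }
+      fclose(f);
+      return 0;
+  }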
+
+[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB]
+ Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com
+Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com
+Suggested-by: Nhat Pham <nphamcs@gmail.com>
+Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Signed-off-by: Nhat Pham <nphamcs@gmail.com>
+Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
+Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Seth Jennings <sjenning@redhat.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vitaly Wool <vitaly.wool@konsulko.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/vm_event_item.h | 1 +
+ mm/memcontrol.c | 1 +
+ mm/vmstat.c | 1 +
+ mm/zswap.c | 4 ++++
+ 4 files changed, 7 insertions(+)
+
+--- a/include/linux/vm_event_item.h
++++ b/include/linux/vm_event_item.h
+@@ -145,6 +145,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
+ #ifdef CONFIG_ZSWAP
+ ZSWPIN,
+ ZSWPOUT,
++ ZSWPWB,
+ #endif
+ #ifdef CONFIG_X86
+ DIRECT_MAP_LEVEL2_SPLIT,
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -700,6 +700,7 @@ static const unsigned int memcg_vm_event
+ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
++ ZSWPWB,
+ #endif
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1397,6 +1397,7 @@ const char * const vmstat_text[] = {
+ #ifdef CONFIG_ZSWAP
+ "zswpin",
+ "zswpout",
++ "zswpwb",
+ #endif
+ #ifdef CONFIG_X86
+ "direct_map_level2_splits",
+--- a/mm/zswap.c
++++ b/mm/zswap.c
+@@ -674,6 +674,10 @@ static int zswap_reclaim_entry(struct zs
+ goto put_unlock;
+ }
+
++ if (entry->objcg)
++ count_objcg_event(entry->objcg, ZSWPWB);
++
++ count_vm_event(ZSWPWB);
+ /*
+ * Writeback started successfully, the page now belongs to the
+ * swapcache. Drop the entry from zswap - unless invalidate already
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:02 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:29 +0800
+Subject: mm: memcg: add THP swap out info for anonymous reclaim
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Xin Hao <vernhao@tencent.com>, Michal Hocko <mhocko@suse.com>, Muchun Song <songmuchun@bytedance.com>
+Message-ID: <20251103075135.20254-2-leon.huangfu@shopee.com>
+
+From: Xin Hao <vernhao@tencent.com>
+
+[ Upstream commit 811244a501b967b00fecb1ae906d5dc6329c91e0 ]
+
+At present, we support a per-memcg reclaim strategy, but we do not know
+how many transparent huge pages are being reclaimed. Transparent huge
+pages need to be split before they are reclaimed, and that splitting can
+become a performance bottleneck. For example, when two memcgs (A and B)
+are reclaiming anonymous pages at the same time and memcg 'A' is
+reclaiming a large number of transparent huge pages, a per-memcg counter
+lets us attribute the bottleneck to 'A'. Therefore, to make such problems
+easier to analyze, add THP swap-out info for each memcg.
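+
+As a rough interpretation sketch (not part of the patch): assuming the
+common 2 MB THP on 4 KB base pages, each thp_swpout event accounts for 512
+pages in pswpout, so the share of swap-out traffic that left in one piece
+can be estimated from the two counters. The 512 factor is an assumption
+that only holds for that default geometry:
+
+  /* sketch: estimate how much swap-out traffic was whole THPs */
+  #include <stdio.h>
+
+  #define ASSUMED_THP_PAGES 512    /* 2 MB THP / 4 KB base pages */
+
+  static double thp_swpout_share(unsigned long long thp_swpout,
+                                 unsigned long long pswpout)
+  {
+      if (!pswpout)
+          return 0.0;
+      return (double)(thp_swpout * ASSUMED_THP_PAGES) / pswpout;
+  }
+
+  int main(void)
+  {
+      /* example numbers; the real counters come from /proc/vmstat */
+      printf("%.2f\n", thp_swpout_share(10, 10240));
+      return 0;
+  }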
+
+[akpm@linux-foundation.org: fix swap_writepage_fs(), per Johannes]
+ Link: https://lkml.kernel.org/r/20230913213343.GB48476@cmpxchg.org
+Link: https://lkml.kernel.org/r/20230913164938.16918-1-vernhao@tencent.com
+Signed-off-by: Xin Hao <vernhao@tencent.com>
+Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++
+ mm/memcontrol.c | 2 ++
+ mm/page_io.c | 8 ++++----
+ mm/vmscan.c | 1 +
+ 4 files changed, 16 insertions(+), 4 deletions(-)
+
+--- a/Documentation/admin-guide/cgroup-v2.rst
++++ b/Documentation/admin-guide/cgroup-v2.rst
+@@ -1532,6 +1532,15 @@ PAGE_SIZE multiple when read back.
+ collapsing an existing range of pages. This counter is not
+ present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
++ thp_swpout (npn)
++ Number of transparent hugepages which are swapout in one piece
++ without splitting.
++
++ thp_swpout_fallback (npn)
++ Number of transparent hugepages which were split before swapout.
++ Usually because failed to allocate some continuous swap space
++ for the huge page.
++
+ memory.numa_stat
+ A read-only nested-keyed file which exists on non-root cgroups.
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -704,6 +704,8 @@ static const unsigned int memcg_vm_event
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
++ THP_SWPOUT,
++ THP_SWPOUT_FALLBACK,
+ #endif
+ };
+
+--- a/mm/page_io.c
++++ b/mm/page_io.c
+@@ -208,8 +208,10 @@ int swap_writepage(struct page *page, st
+ static inline void count_swpout_vm_event(struct folio *folio)
+ {
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+- if (unlikely(folio_test_pmd_mappable(folio)))
++ if (unlikely(folio_test_pmd_mappable(folio))) {
++ count_memcg_folio_events(folio, THP_SWPOUT, 1);
+ count_vm_event(THP_SWPOUT);
++ }
+ #endif
+ count_vm_events(PSWPOUT, folio_nr_pages(folio));
+ }
+@@ -278,9 +280,6 @@ static void sio_write_complete(struct ki
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ }
+- } else {
+- for (p = 0; p < sio->pages; p++)
+- count_swpout_vm_event(page_folio(sio->bvec[p].bv_page));
+ }
+
+ for (p = 0; p < sio->pages; p++)
+@@ -296,6 +295,7 @@ static void swap_writepage_fs(struct pag
+ struct file *swap_file = sis->swap_file;
+ loff_t pos = page_file_offset(page);
+
++ count_swpout_vm_event(page_folio(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ if (wbc->swap_plug)
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1922,6 +1922,7 @@ retry:
+ folio_list))
+ goto activate_locked;
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
+ #endif
+ if (!add_to_swap(folio))
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:16 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:31 +0800
+Subject: mm: memcg: change flush_next_time to flush_last_time
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li <chrisl@kernel.org>, Bagas Sanjaya <bagasdotme@gmail.com>, Greg Thelen <gthelen@google.com>, Ivan Babrou <ivan@cloudflare.com>, Michal Koutny <mkoutny@suse.com>, Waiman Long <longman@redhat.com>, Wei Xu <weixugc@google.com>
+Message-ID: <20251103075135.20254-4-leon.huangfu@shopee.com>
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+[ Upstream commit 508bed884767a8eb394640bae9edcdf082816c43 ]
+
+Patch series "mm: memcg: subtree stats flushing and thresholds", v4.
+
+This series attempts to address shortcomings in today's approach for memcg
+stats flushing, namely occasionally stale or expensive stat reads. The
+series does so by changing the threshold that we use to decide whether to
+trigger a flush to be per memcg instead of global (patch 3), and then
+changing flushing to be per memcg (i.e. subtree flushes) instead of
+global (patch 5).
+
+This patch (of 5):
+
+flush_next_time is an inaccurate name. It's not the next time that
+periodic flushing will happen; rather, it's the next time that ratelimited
+flushing can happen if the periodic flusher is late.
+
+Simplify its semantics by just storing the timestamp of the last flush
+instead, flush_last_time. Move the 2*FLUSH_TIME addition to
+mem_cgroup_flush_stats_ratelimited(), and add a comment explaining it.
+This way, all the ratelimiting semantics live in one place.
+
+No functional change intended.
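+
+A tiny standalone sketch (illustration only, not the kernel code) showing
+that the two formulations trigger at the same point, since both reduce to
+comparing "now" against "last flush + 2*FLUSH_TIME":
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define FLUSH_TIME 2000ULL    /* stand-in for 2*HZ worth of jiffies */
+
+  int main(void)
+  {
+      uint64_t now = 100000, last_flush = 99000;
+
+      /* old scheme: precompute the deadline when flushing */
+      uint64_t flush_next_time = last_flush + 2 * FLUSH_TIME;
+      int old_late = now > flush_next_time;
+
+      /* new scheme: store the last flush, add the slack when checking */
+      int new_late = now > last_flush + 2 * FLUSH_TIME;
+
+      printf("old=%d new=%d\n", old_late, new_late);    /* identical */
+      return 0;
+  }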
+
+Link: https://lkml.kernel.org/r/20231129032154.3710765-1-yosryahmed@google.com
+Link: https://lkml.kernel.org/r/20231129032154.3710765-2-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Chris Li <chrisl@kernel.org> (Google)
+Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Koutny <mkoutny@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Wei Xu <weixugc@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -590,7 +590,7 @@ static DECLARE_DEFERRABLE_WORK(stats_flu
+ static DEFINE_PER_CPU(unsigned int, stats_updates);
+ static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
+ static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+-static u64 flush_next_time;
++static u64 flush_last_time;
+
+ #define FLUSH_TIME (2UL*HZ)
+
+@@ -650,7 +650,7 @@ static void do_flush_stats(void)
+ atomic_xchg(&stats_flush_ongoing, 1))
+ return;
+
+- WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
++ WRITE_ONCE(flush_last_time, jiffies_64);
+
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+
+@@ -666,7 +666,8 @@ void mem_cgroup_flush_stats(void)
+
+ void mem_cgroup_flush_stats_ratelimited(void)
+ {
+- if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
++ /* Only flush if the periodic flusher is one full cycle late */
++ if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
+ mem_cgroup_flush_stats();
+ }
+
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:30 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:33 +0800
+Subject: mm: memcg: make stats flushing threshold per-memcg
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li <chrisl@kernel.org>, Greg Thelen <gthelen@google.com>, Ivan Babrou <ivan@cloudflare.com>, Michal Koutny <mkoutny@suse.com>, Waiman Long <longman@redhat.com>, Wei Xu <weixugc@google.com>
+Message-ID: <20251103075135.20254-6-leon.huangfu@shopee.com>
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+[ Upstream commit 8d59d2214c2362e7a9d185d80b613e632581af7b ]
+
+A global counter for the magnitude of memcg stats update is maintained on
+the memcg side to avoid invoking rstat flushes when the pending updates
+are not significant. This avoids unnecessary flushes, which are not very
+cheap even if there aren't many stats to flush. It also avoids
+unnecessary lock contention on the underlying global rstat lock.
+
+Make this threshold per-memcg. The same scheme is followed: percpu (now
+also per-memcg) counters are incremented in the update path, and only
+propagated to per-memcg atomics when they exceed a certain threshold.
+
+This provides two benefits: (a) On large machines with a lot of memcgs,
+the global threshold can be reached relatively fast, so guarding the
+underlying lock becomes less effective. Making the threshold per-memcg
+avoids this.
+
+(b) Having a global threshold makes it hard to do subtree flushes, as we
+cannot reset the global counter except for a full flush. Per-memcg
+counters removes this as a blocker from doing subtree flushes, which helps
+avoid unnecessary work when the stats of a small subtree are needed.
+
+Nothing is free, of course. This comes at a cost: (a) A new per-cpu
+counter per memcg, consuming NR_CPUS * NR_MEMCGS * 4 bytes. The extra
+memory usage is insignificant.
+
+(b) More work on the update side, although in the common case it will only
+be percpu counter updates. The amount of work scales with the number of
+ancestors (i.e. tree depth). This is not a new concept; adding a cgroup to
+the rstat tree involves a parent loop, and so does charging. Testing results
+below show no significant regressions.
+
+(c) The error margin in the stats for the system as a whole increases from
+NR_CPUS * MEMCG_CHARGE_BATCH to NR_CPUS * MEMCG_CHARGE_BATCH * NR_MEMCGS.
+This is probably fine because we have a similar per-memcg error in charges
+coming from percpu stocks, and we have a periodic flusher that makes sure
+we always flush all the stats every 2s anyway.
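+
+To put rough example numbers on that bound (an illustration with assumed
+values, not taken from the patch; MEMCG_CHARGE_BATCH of 64 is assumed):
+
+  /* sketch: worst-case unflushed stat updates before and after */
+  #include <stdio.h>
+
+  int main(void)
+  {
+      unsigned long nr_cpus = 256;        /* example machine */
+      unsigned long charge_batch = 64;    /* assumed MEMCG_CHARGE_BATCH */
+      unsigned long nr_memcgs = 1000;     /* example cgroup count */
+
+      /* old global threshold: one system-wide bound */
+      printf("global bound:    %lu updates\n", nr_cpus * charge_batch);
+      /* new per-memcg threshold: the same bound, but per memcg */
+      printf("per-memcg bound: %lu each, %lu system-wide\n",
+             nr_cpus * charge_batch,
+             nr_cpus * charge_batch * nr_memcgs);
+      return 0;
+  }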
+
+This patch was tested to make sure no significant regressions are
+introduced on the update path as follows. The following benchmarks were
+run in a cgroup that is 2 levels deep (/sys/fs/cgroup/a/b/):
+
+(1) Running 22 instances of netperf on a 44 cpu machine with
+hyperthreading disabled. All instances are run in a level 2 cgroup, as
+well as netserver:
+ # netserver -6
+ # netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
+
+Averaging 20 runs, the numbers are as follows:
+Base: 40198.0 mbps
+Patched: 38629.7 mbps (-3.9%)
+
+The regression is minimal, especially for 22 instances in the same
+cgroup sharing all ancestors (so updating the same atomics).
+
+(2) will-it-scale page_fault tests. These tests (specifically
+per_process_ops in the page_fault3 test) previously detected a 25.9%
+regression for a change in the stats update path [1]. These are the
+numbers from 10 runs (+ is good) on a machine with 256 cpus:
+
+ LABEL | MEAN | MEDIAN | STDDEV |
+------------------------------+-------------+-------------+-------------
+ page_fault1_per_process_ops | | | |
+ (A) base | 270249.164 | 265437.000 | 13451.836 |
+ (B) patched | 261368.709 | 255725.000 | 13394.767 |
+ | -3.29% | -3.66% | |
+ page_fault1_per_thread_ops | | | |
+ (A) base | 242111.345 | 239737.000 | 10026.031 |
+ (B) patched | 237057.109 | 235305.000 | 9769.687 |
+ | -2.09% | -1.85% | |
+ page_fault1_scalability | | |
+ (A) base | 0.034387 | 0.035168 | 0.0018283 |
+ (B) patched | 0.033988 | 0.034573 | 0.0018056 |
+ | -1.16% | -1.69% | |
+ page_fault2_per_process_ops | | |
+ (A) base | 203561.836 | 203301.000 | 2550.764 |
+ (B) patched | 197195.945 | 197746.000 | 2264.263 |
+ | -3.13% | -2.73% | |
+ page_fault2_per_thread_ops | | |
+ (A) base | 171046.473 | 170776.000 | 1509.679 |
+ (B) patched | 166626.327 | 166406.000 | 768.753 |
+ | -2.58% | -2.56% | |
+ page_fault2_scalability | | |
+ (A) base | 0.054026 | 0.053821 | 0.00062121 |
+ (B) patched | 0.053329 | 0.05306 | 0.00048394 |
+ | -1.29% | -1.41% | |
+ page_fault3_per_process_ops | | |
+ (A) base | 1295807.782 | 1297550.000 | 5907.585 |
+ (B) patched | 1275579.873 | 1273359.000 | 8759.160 |
+ | -1.56% | -1.86% | |
+ page_fault3_per_thread_ops | | |
+ (A) base | 391234.164 | 390860.000 | 1760.720 |
+ (B) patched | 377231.273 | 376369.000 | 1874.971 |
+ | -3.58% | -3.71% | |
+ page_fault3_scalability | | |
+ (A) base | 0.60369 | 0.60072 | 0.0083029 |
+ (B) patched | 0.61733 | 0.61544 | 0.009855 |
+ | +2.26% | +2.45% | |
+
+All regressions seem to be minimal, and within the normal variance for the
+benchmark. The fix for [1] assumes that 3% is noise (and there were no
+further practical complaints), so hopefully this means that such
+variations in these microbenchmarks do not reflect on practical workloads.
+
+(3) I also ran stress-ng in a nested cgroup and did not observe any
+obvious regressions.
+
+[1]https://lore.kernel.org/all/20190520063534.GB19312@shao2-debian/
+
+Link: https://lkml.kernel.org/r/20231129032154.3710765-4-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
+Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Koutny <mkoutny@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Wei Xu <weixugc@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c | 50 ++++++++++++++++++++++++++++++++++----------------
+ 1 file changed, 34 insertions(+), 16 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -628,6 +628,9 @@ struct memcg_vmstats_percpu {
+ /* Cgroup1: threshold notifications & softlimit tree updates */
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
++
++ /* Stats updates since the last flush */
++ unsigned int stats_updates;
+ };
+
+ struct memcg_vmstats {
+@@ -642,6 +645,9 @@ struct memcg_vmstats {
+ /* Pending child counts during tree propagation */
+ long state_pending[MEMCG_NR_STAT];
+ unsigned long events_pending[NR_MEMCG_EVENTS];
++
++ /* Stats updates since the last flush */
++ atomic64_t stats_updates;
+ };
+
+ /*
+@@ -661,9 +667,7 @@ struct memcg_vmstats {
+ */
+ static void flush_memcg_stats_dwork(struct work_struct *w);
+ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+-static DEFINE_PER_CPU(unsigned int, stats_updates);
+ static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
+-static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+ static u64 flush_last_time;
+
+ #define FLUSH_TIME (2UL*HZ)
+@@ -690,26 +694,37 @@ static void memcg_stats_unlock(void)
+ preempt_enable_nested();
+ }
+
++
++static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
++{
++ return atomic64_read(&memcg->vmstats->stats_updates) >
++ MEMCG_CHARGE_BATCH * num_online_cpus();
++}
++
+ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+ {
++ int cpu = smp_processor_id();
+ unsigned int x;
+
+ if (!val)
+ return;
+
+- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
++ cgroup_rstat_updated(memcg->css.cgroup, cpu);
++
++ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
++ x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
++ abs(val));
++
++ if (x < MEMCG_CHARGE_BATCH)
++ continue;
+
+- x = __this_cpu_add_return(stats_updates, abs(val));
+- if (x > MEMCG_CHARGE_BATCH) {
+ /*
+- * If stats_flush_threshold exceeds the threshold
+- * (>num_online_cpus()), cgroup stats update will be triggered
+- * in __mem_cgroup_flush_stats(). Increasing this var further
+- * is redundant and simply adds overhead in atomic update.
++ * If @memcg is already flush-able, increasing stats_updates is
++ * redundant. Avoid the overhead of the atomic update.
+ */
+- if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
+- atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+- __this_cpu_write(stats_updates, 0);
++ if (!memcg_should_flush_stats(memcg))
++ atomic64_add(x, &memcg->vmstats->stats_updates);
++ __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
+ }
+ }
+
+@@ -728,13 +743,12 @@ static void do_flush_stats(void)
+
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+
+- atomic_set(&stats_flush_threshold, 0);
+ atomic_set(&stats_flush_ongoing, 0);
+ }
+
+ void mem_cgroup_flush_stats(void)
+ {
+- if (atomic_read(&stats_flush_threshold) > num_online_cpus())
++ if (memcg_should_flush_stats(root_mem_cgroup))
+ do_flush_stats();
+ }
+
+@@ -748,8 +762,8 @@ void mem_cgroup_flush_stats_ratelimited(
+ static void flush_memcg_stats_dwork(struct work_struct *w)
+ {
+ /*
+- * Always flush here so that flushing in latency-sensitive paths is
+- * as cheap as possible.
++ * Deliberately ignore memcg_should_flush_stats() here so that flushing
++ * in latency-sensitive paths is as cheap as possible.
+ */
+ do_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+@@ -5658,6 +5672,10 @@ static void mem_cgroup_css_rstat_flush(s
+ }
+ }
+ }
++ statc->stats_updates = 0;
++ /* We are in a per-cpu loop here, only do the atomic write once */
++ if (atomic64_read(&memcg->vmstats->stats_updates))
++ atomic64_set(&memcg->vmstats->stats_updates, 0);
+ }
+
+ #ifdef CONFIG_MMU
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:23 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:32 +0800
+Subject: mm: memcg: move vmstats structs definition above flushing code
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li <chrisl@kernel.org>, Greg Thelen <gthelen@google.com>, Ivan Babrou <ivan@cloudflare.com>, Michal Koutny <mkoutny@suse.com>, Waiman Long <longman@redhat.com>, Wei Xu <weixugc@google.com>
+Message-ID: <20251103075135.20254-5-leon.huangfu@shopee.com>
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+[ Upstream commit e0bf1dc859fdd08ef738824710770a30a8069433 ]
+
+The following patch will make use of those structs in the flushing code,
+so move their definitions (and a few other dependencies) a little bit up
+to reduce the diff noise in the following patch.
+
+No functional change intended.
+
+Link: https://lkml.kernel.org/r/20231129032154.3710765-3-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Koutny <mkoutny@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Wei Xu <weixugc@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c | 148 ++++++++++++++++++++++++++++----------------------------
+ 1 file changed, 74 insertions(+), 74 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -570,6 +570,80 @@ mem_cgroup_largest_soft_limit_node(struc
+ return mz;
+ }
+
++/* Subset of vm_event_item to report for memcg event stats */
++static const unsigned int memcg_vm_event_stat[] = {
++ PGPGIN,
++ PGPGOUT,
++ PGSCAN_KSWAPD,
++ PGSCAN_DIRECT,
++ PGSCAN_KHUGEPAGED,
++ PGSTEAL_KSWAPD,
++ PGSTEAL_DIRECT,
++ PGSTEAL_KHUGEPAGED,
++ PGFAULT,
++ PGMAJFAULT,
++ PGREFILL,
++ PGACTIVATE,
++ PGDEACTIVATE,
++ PGLAZYFREE,
++ PGLAZYFREED,
++#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
++ ZSWPIN,
++ ZSWPOUT,
++ ZSWPWB,
++#endif
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ THP_FAULT_ALLOC,
++ THP_COLLAPSE_ALLOC,
++ THP_SWPOUT,
++ THP_SWPOUT_FALLBACK,
++#endif
++};
++
++#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
++static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
++
++static void init_memcg_events(void)
++{
++ int i;
++
++ for (i = 0; i < NR_MEMCG_EVENTS; ++i)
++ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
++}
++
++static inline int memcg_events_index(enum vm_event_item idx)
++{
++ return mem_cgroup_events_index[idx] - 1;
++}
++
++struct memcg_vmstats_percpu {
++ /* Local (CPU and cgroup) page state & events */
++ long state[MEMCG_NR_STAT];
++ unsigned long events[NR_MEMCG_EVENTS];
++
++ /* Delta calculation for lockless upward propagation */
++ long state_prev[MEMCG_NR_STAT];
++ unsigned long events_prev[NR_MEMCG_EVENTS];
++
++ /* Cgroup1: threshold notifications & softlimit tree updates */
++ unsigned long nr_page_events;
++ unsigned long targets[MEM_CGROUP_NTARGETS];
++};
++
++struct memcg_vmstats {
++ /* Aggregated (CPU and subtree) page state & events */
++ long state[MEMCG_NR_STAT];
++ unsigned long events[NR_MEMCG_EVENTS];
++
++ /* Non-hierarchical (CPU aggregated) page state & events */
++ long state_local[MEMCG_NR_STAT];
++ unsigned long events_local[NR_MEMCG_EVENTS];
++
++ /* Pending child counts during tree propagation */
++ long state_pending[MEMCG_NR_STAT];
++ unsigned long events_pending[NR_MEMCG_EVENTS];
++};
++
+ /*
+ * memcg and lruvec stats flushing
+ *
+@@ -681,80 +755,6 @@ static void flush_memcg_stats_dwork(stru
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+ }
+
+-/* Subset of vm_event_item to report for memcg event stats */
+-static const unsigned int memcg_vm_event_stat[] = {
+- PGPGIN,
+- PGPGOUT,
+- PGSCAN_KSWAPD,
+- PGSCAN_DIRECT,
+- PGSCAN_KHUGEPAGED,
+- PGSTEAL_KSWAPD,
+- PGSTEAL_DIRECT,
+- PGSTEAL_KHUGEPAGED,
+- PGFAULT,
+- PGMAJFAULT,
+- PGREFILL,
+- PGACTIVATE,
+- PGDEACTIVATE,
+- PGLAZYFREE,
+- PGLAZYFREED,
+-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+- ZSWPIN,
+- ZSWPOUT,
+- ZSWPWB,
+-#endif
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+- THP_FAULT_ALLOC,
+- THP_COLLAPSE_ALLOC,
+- THP_SWPOUT,
+- THP_SWPOUT_FALLBACK,
+-#endif
+-};
+-
+-#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
+-static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+-
+-static void init_memcg_events(void)
+-{
+- int i;
+-
+- for (i = 0; i < NR_MEMCG_EVENTS; ++i)
+- mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+-}
+-
+-static inline int memcg_events_index(enum vm_event_item idx)
+-{
+- return mem_cgroup_events_index[idx] - 1;
+-}
+-
+-struct memcg_vmstats_percpu {
+- /* Local (CPU and cgroup) page state & events */
+- long state[MEMCG_NR_STAT];
+- unsigned long events[NR_MEMCG_EVENTS];
+-
+- /* Delta calculation for lockless upward propagation */
+- long state_prev[MEMCG_NR_STAT];
+- unsigned long events_prev[NR_MEMCG_EVENTS];
+-
+- /* Cgroup1: threshold notifications & softlimit tree updates */
+- unsigned long nr_page_events;
+- unsigned long targets[MEM_CGROUP_NTARGETS];
+-};
+-
+-struct memcg_vmstats {
+- /* Aggregated (CPU and subtree) page state & events */
+- long state[MEMCG_NR_STAT];
+- unsigned long events[NR_MEMCG_EVENTS];
+-
+- /* Non-hierarchical (CPU aggregated) page state & events */
+- long state_local[MEMCG_NR_STAT];
+- unsigned long events_local[NR_MEMCG_EVENTS];
+-
+- /* Pending child counts during tree propagation */
+- long state_pending[MEMCG_NR_STAT];
+- unsigned long events_pending[NR_MEMCG_EVENTS];
+-};
+-
+ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+ {
+ long x = READ_ONCE(memcg->vmstats->state[idx]);
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:44 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:35 +0800
+Subject: mm: memcg: restore subtree stats flushing
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li <chrisl@kernel.org>, Greg Thelen <gthelen@google.com>, Ivan Babrou <ivan@cloudflare.com>, Michal Koutny <mkoutny@suse.com>, Waiman Long <longman@redhat.com>, Wei Xu <weixugc@google.com>
+Message-ID: <20251103075135.20254-8-leon.huangfu@shopee.com>
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+[ Upstream commit 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 ]
+
+Stats flushing for memcg currently follows the following rules:
+- Always flush the entire memcg hierarchy (i.e. flush the root).
+- Only one flusher is allowed at a time. If someone else tries to flush
+ concurrently, they skip and return immediately.
+- A periodic flusher flushes all the stats every 2 seconds.
+
+This approach is followed because all flushes are serialized
+by a global rstat spinlock. On the memcg side, flushing is invoked from
+userspace reads as well as in-kernel flushers (e.g. reclaim, refault,
+etc). This approach aims to avoid serializing all flushers on the global
+lock, which can cause a significant performance hit under high
+concurrency.
+
+This approach has the following problems:
+- Occasionally a userspace read of the stats of a non-root cgroup will
+ be too expensive as it has to flush the entire hierarchy [1].
+- Sometimes the stats accuracy is compromised if there is an ongoing
+ flush, and we skip and return before the subtree of interest is
+ actually flushed, yielding stale stats (by up to 2s due to periodic
+ flushing). This is more visible when reading stats from userspace,
+ but can also affect in-kernel flushers.
+
+The latter problem is particularly a concern when userspace reads stats
+after an event occurs, but gets stats from before the event. Examples:
+- When memory usage / pressure spikes, a userspace OOM handler may look
+ at the stats of different memcgs to select a victim based on various
+ heuristics (e.g. how much private memory will be freed by killing
+ this). Reading stale stats from before the usage spike in this case
+ may cause a wrongful OOM kill.
+- A proactive reclaimer may read the stats after writing to
+ memory.reclaim to measure the success of the reclaim operation. Stale
+ stats from before reclaim may give a false negative.
+- Reading the stats of a parent and a child memcg may be inconsistent
+ (child larger than parent), if the flush doesn't happen when the
+ parent is read, but happens when the child is read.
+
+As for in-kernel flushers, they will occasionally get stale stats. No
+regressions are currently known from this, but if there are regressions,
+they would be very difficult to debug and link to the source of the
+problem.
+
+This patch aims to fix these problems by restoring subtree flushing, and
+removing the unified/coalesced flushing logic that skips flushing if there
+is an ongoing flush. This change would introduce a significant regression
+with global stats flushing thresholds. With per-memcg stats flushing
+thresholds, this seems to perform really well. The thresholds protect the
+underlying lock from unnecessary contention.
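+
+The resulting behaviour can be summarized with a toy userspace model (a
+simplification, not kernel code): updates age the target memcg and all of
+its ancestors, and a reader flushes only the subtree it cares about, and
+only if that subtree has accumulated enough pending updates. The names and
+thresholds below are stand-ins:
+
+  #include <stdio.h>
+
+  #define BATCH 64    /* stand-in for MEMCG_CHARGE_BATCH */
+  #define NCPUS 4     /* stand-in for num_online_cpus() */
+
+  struct memcg {
+      struct memcg *parent;
+      long pending;       /* stand-in for vmstats->stats_updates */
+      const char *name;
+  };
+
+  /* update path: charge the change to the memcg and all its ancestors */
+  static void rstat_updated(struct memcg *memcg, long val)
+  {
+      for (; memcg; memcg = memcg->parent)
+          memcg->pending += (val < 0) ? -val : val;
+  }
+
+  /* flush path: only flush a subtree that accumulated enough updates */
+  static void flush_subtree(struct memcg *memcg)
+  {
+      if (memcg->pending <= (long)BATCH * NCPUS)
+          return;
+      printf("flushing subtree of %s (pending=%ld)\n",
+             memcg->name, memcg->pending);
+      memcg->pending = 0;    /* the real reset happens during rstat flush */
+  }
+
+  int main(void)
+  {
+      struct memcg root = { NULL, 0, "root" };
+      struct memcg a = { &root, 0, "a" };
+      struct memcg b = { &a, 0, "a/b" };
+
+      rstat_updated(&b, 300);    /* updates in a/b also age a and root */
+      flush_subtree(&a);         /* reading a/ flushes only that subtree */
+      flush_subtree(&root);
+      return 0;
+  }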
+
+This patch was tested in two ways to ensure the latency of flushing is
+up to par, on a machine with 384 cpus:
+
+- A synthetic test with 5000 concurrent workers in 500 cgroups doing
+ allocations and reclaim, as well as 1000 readers for memory.stat
+ (variation of [2]). No regressions were noticed in the total runtime.
+ Note that significant regressions in this test are observed with
+ global stats thresholds, but not with per-memcg thresholds.
+
+- A synthetic stress test for concurrently reading memcg stats while
+ memory allocation/freeing workers are running in the background,
+ provided by Wei Xu [3]. With 250k threads reading the stats every
+ 100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
+ of reads take more than 1ms, and no reads take more than 100ms.
+
+[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
+[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
+[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/
+
+[akpm@linux-foundation.org: fix mm/zswap.c]
+[yosryahmed@google.com: remove stats flushing mutex]
+ Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
+Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Koutny <mkoutny@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Wei Xu <weixugc@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 8 ++---
+ mm/memcontrol.c | 68 +++++++++++++++++++++++++--------------------
+ mm/vmscan.c | 2 -
+ mm/workingset.c | 10 ++++--
+ 4 files changed, 51 insertions(+), 37 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -1039,8 +1039,8 @@ static inline unsigned long lruvec_page_
+ return x;
+ }
+
+-void mem_cgroup_flush_stats(void);
+-void mem_cgroup_flush_stats_ratelimited(void);
++void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
++void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
+
+ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
+@@ -1515,11 +1515,11 @@ static inline unsigned long lruvec_page_
+ return node_page_state(lruvec_pgdat(lruvec), idx);
+ }
+
+-static inline void mem_cgroup_flush_stats(void)
++static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
+ {
+ }
+
+-static inline void mem_cgroup_flush_stats_ratelimited(void)
++static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
+ {
+ }
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -667,7 +667,6 @@ struct memcg_vmstats {
+ */
+ static void flush_memcg_stats_dwork(struct work_struct *w);
+ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+-static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
+ static u64 flush_last_time;
+
+ #define FLUSH_TIME (2UL*HZ)
+@@ -728,35 +727,40 @@ static inline void memcg_rstat_updated(s
+ }
+ }
+
+-static void do_flush_stats(void)
++static void do_flush_stats(struct mem_cgroup *memcg)
+ {
+- /*
+- * We always flush the entire tree, so concurrent flushers can just
+- * skip. This avoids a thundering herd problem on the rstat global lock
+- * from memcg flushers (e.g. reclaim, refault, etc).
+- */
+- if (atomic_read(&stats_flush_ongoing) ||
+- atomic_xchg(&stats_flush_ongoing, 1))
+- return;
+-
+- WRITE_ONCE(flush_last_time, jiffies_64);
+-
+- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
++ if (mem_cgroup_is_root(memcg))
++ WRITE_ONCE(flush_last_time, jiffies_64);
+
+- atomic_set(&stats_flush_ongoing, 0);
++ cgroup_rstat_flush(memcg->css.cgroup);
+ }
+
+-void mem_cgroup_flush_stats(void)
++/*
++ * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
++ * @memcg: root of the subtree to flush
++ *
++ * Flushing is serialized by the underlying global rstat lock. There is also a
++ * minimum amount of work to be done even if there are no stat updates to flush.
++ * Hence, we only flush the stats if the updates delta exceeds a threshold. This
++ * avoids unnecessary work and contention on the underlying lock.
++ */
++void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
+ {
+- if (memcg_should_flush_stats(root_mem_cgroup))
+- do_flush_stats();
++ if (mem_cgroup_disabled())
++ return;
++
++ if (!memcg)
++ memcg = root_mem_cgroup;
++
++ if (memcg_should_flush_stats(memcg))
++ do_flush_stats(memcg);
+ }
+
+-void mem_cgroup_flush_stats_ratelimited(void)
++void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
+ {
+ /* Only flush if the periodic flusher is one full cycle late */
+ if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+ }
+
+ static void flush_memcg_stats_dwork(struct work_struct *w)
+@@ -765,7 +769,7 @@ static void flush_memcg_stats_dwork(stru
+ * Deliberately ignore memcg_should_flush_stats() here so that flushing
+ * in latency-sensitive paths is as cheap as possible.
+ */
+- do_flush_stats();
++ do_flush_stats(root_mem_cgroup);
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+ }
+
+@@ -1597,7 +1601,7 @@ static void memcg_stat_format(struct mem
+ *
+ * Current memory state:
+ */
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
+@@ -4047,7 +4051,7 @@ static int memcg_numa_stat_show(struct s
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ seq_printf(m, "%s=%lu", stat->name,
+@@ -4122,7 +4126,7 @@ static void memcg1_stat_format(struct me
+
+ BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+@@ -4624,7 +4628,7 @@ void mem_cgroup_wb_stats(struct bdi_writ
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+@@ -6704,7 +6708,7 @@ static int memory_numa_stat_show(struct
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(memcg);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+@@ -7868,7 +7872,11 @@ bool obj_cgroup_may_zswap(struct obj_cgr
+ break;
+ }
+
+- cgroup_rstat_flush(memcg->css.cgroup);
++ /*
++ * mem_cgroup_flush_stats() ignores small changes. Use
++ * do_flush_stats() directly to get accurate stats for charging.
++ */
++ do_flush_stats(memcg);
+ pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ if (pages < max)
+ continue;
+@@ -7933,8 +7941,10 @@ void obj_cgroup_uncharge_zswap(struct ob
+ static u64 zswap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+- cgroup_rstat_flush(css->cgroup);
+- return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
++ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
++
++ mem_cgroup_flush_stats(memcg);
++ return memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ }
+
+ static int zswap_max_show(struct seq_file *m, void *v)
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2911,7 +2911,7 @@ static void prepare_scan_count(pg_data_t
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+- mem_cgroup_flush_stats();
++ mem_cgroup_flush_stats(sc->target_mem_cgroup);
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow
+
+ rcu_read_unlock();
+
+- /* Flush stats (and potentially sleep) outside the RCU read section */
+- mem_cgroup_flush_stats_ratelimited();
++ /*
++ * Flush stats (and potentially sleep) outside the RCU read section.
++ * XXX: With per-memcg flushing and thresholding, is ratelimiting
++ * still needed here?
++ */
++ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
+@@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes(
+ struct lruvec *lruvec;
+ int i;
+
+- mem_cgroup_flush_stats_ratelimited();
++ mem_cgroup_flush_stats_ratelimited(sc->memcg);
+ lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state_local(lruvec,
--- /dev/null
+From leon.huangfu@shopee.com Mon Nov 3 08:53:37 2025
+From: Leon Huang Fu <leon.huangfu@shopee.com>
+Date: Mon, 3 Nov 2025 15:51:34 +0800
+Subject: mm: workingset: move the stats flush into workingset_test_recent()
+To: stable@vger.kernel.org, greg@kroah.com
+Cc: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org, corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, akpm@linux-foundation.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, lance.yang@linux.dev, leon.huangfu@shopee.com, shy828301@gmail.com, yosryahmed@google.com, sashal@kernel.org, vishal.moola@gmail.com, cerasuolodomenico@gmail.com, nphamcs@gmail.com, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, Chris Li <chrisl@kernel.org>, Greg Thelen <gthelen@google.com>, Ivan Babrou <ivan@cloudflare.com>, Michal Koutny <mkoutny@suse.com>, Waiman Long <longman@redhat.com>, Wei Xu <weixugc@google.com>
+Message-ID: <20251103075135.20254-7-leon.huangfu@shopee.com>
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+[ Upstream commit b006847222623ac3cda8589d15379eac86a2bcb7 ]
+
+The workingset code flushes the stats in workingset_refault() to get
+accurate stats of the eviction memcg. In preparation for more scoped
+flushing and passing the eviction memcg to the flush call, move the call to
+workingset_test_recent() where we have a pointer to the eviction memcg.
+
+The flush call is sleepable, and cannot be made in an rcu read section.
+Hence, minimize the rcu read section by also moving it into
+workingset_test_recent(). Furthermore, instead of holding the rcu read
+lock throughout workingset_test_recent(), only hold it briefly to get a
+ref on the eviction memcg. This allows us to make the flush call after we
+get the eviction memcg.
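+
+The locking pattern being applied here, in a plain pthreads analogy (RCU
+replaced by a lock, the memcg by a refcounted object; illustration only):
+pin the object while the lookup lock is held, drop the lock, then do the
+sleepable work against the pinned object, and finally drop the reference.
+
+  #include <pthread.h>
+  #include <stdatomic.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  struct obj {
+      atomic_int refs;
+      int id;
+  };
+
+  static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
+  static struct obj global_obj = { 1, 42 };
+
+  static struct obj *lookup_and_get(void)
+  {
+      struct obj *o;
+
+      pthread_mutex_lock(&lookup_lock);     /* ~ rcu_read_lock() */
+      o = &global_obj;
+      atomic_fetch_add(&o->refs, 1);        /* ~ mem_cgroup_tryget() */
+      pthread_mutex_unlock(&lookup_lock);   /* ~ rcu_read_unlock() */
+      return o;
+  }
+
+  int main(void)
+  {
+      struct obj *o = lookup_and_get();
+
+      /* sleepable work outside the critical section, like the flush */
+      usleep(1000);
+      printf("worked on obj %d\n", o->id);
+      atomic_fetch_sub(&o->refs, 1);        /* ~ mem_cgroup_put() */
+      return 0;
+  }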
+
+As for workingset_refault(), nothing else there appears to be protected by
+rcu. The memcg of the faulted folio (which is not necessarily the same as
+the eviction memcg) is protected by the folio lock, which is held from all
+callsites. Add a VM_BUG_ON() to make sure this doesn't change from under
+us.
+
+No functional change intended.
+
+Link: https://lkml.kernel.org/r/20231129032154.3710765-5-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michal Koutny <mkoutny@suse.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Wei Xu <weixugc@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Leon Huang Fu <leon.huangfu@shopee.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/workingset.c | 36 ++++++++++++++++++++++++------------
+ 1 file changed, 24 insertions(+), 12 deletions(-)
+
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -425,8 +425,16 @@ bool workingset_test_recent(void *shadow
+ struct pglist_data *pgdat;
+ unsigned long eviction;
+
+- if (lru_gen_enabled())
+- return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
++ rcu_read_lock();
++
++ if (lru_gen_enabled()) {
++ bool recent = lru_gen_test_recent(shadow, file,
++ &eviction_lruvec, &eviction, workingset);
++
++ rcu_read_unlock();
++ return recent;
++ }
++
+
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
+ eviction <<= bucket_order;
+@@ -448,8 +456,16 @@ bool workingset_test_recent(void *shadow
+ * configurations instead.
+ */
+ eviction_memcg = mem_cgroup_from_id(memcgid);
+- if (!mem_cgroup_disabled() && !eviction_memcg)
++ if (!mem_cgroup_disabled() &&
++ (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
++ rcu_read_unlock();
+ return false;
++ }
++
++ rcu_read_unlock();
++
++ /* Flush stats (and potentially sleep) outside the RCU read section */
++ mem_cgroup_flush_stats_ratelimited();
+
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
+@@ -493,6 +509,7 @@ bool workingset_test_recent(void *shadow
+ }
+ }
+
++ mem_cgroup_put(eviction_memcg);
+ return refault_distance <= workingset_size;
+ }
+
+@@ -519,19 +536,16 @@ void workingset_refault(struct folio *fo
+ return;
+ }
+
+- /* Flush stats (and potentially sleep) before holding RCU read lock */
+- mem_cgroup_flush_stats_ratelimited();
+-
+- rcu_read_lock();
+-
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+- * is actually experiencing the refault event.
++ * is actually experiencing the refault event. Make sure the folio is
++ * locked to guarantee folio_memcg() stability throughout.
+ */
++ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+@@ -540,7 +554,7 @@ void workingset_refault(struct folio *fo
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset))
+- goto out;
++ return;
+
+ folio_set_active(folio);
+ workingset_age_nonresident(lruvec, nr);
+@@ -556,8 +570,6 @@ void workingset_refault(struct folio *fo
+ lru_note_cost_refault(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
+ }
+-out:
+- rcu_read_unlock();
+ }
+
+ /**
scsi-ufs-core-add-a-quirk-for-handling-broken-lsdbs-field-in-controller-capabilities-register.patch
scsi-ufs-core-add-a-quirk-to-suppress-link_startup_again.patch
scsi-ufs-ufs-pci-set-ufshcd_quirk_perform_link_startup_once-for-intel-adl.patch
+mm-memcg-add-thp-swap-out-info-for-anonymous-reclaim.patch
+mm-memcg-add-per-memcg-zswap-writeback-stat.patch
+mm-memcg-change-flush_next_time-to-flush_last_time.patch
+mm-memcg-move-vmstats-structs-definition-above-flushing-code.patch
+mm-memcg-make-stats-flushing-threshold-per-memcg.patch
+mm-workingset-move-the-stats-flush-into-workingset_test_recent.patch
+mm-memcg-restore-subtree-stats-flushing.patch