git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 21 Nov 2025 11:14:32 +0000 (12:14 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 21 Nov 2025 11:14:32 +0000 (12:14 +0100)
added patches:
acpi-hmat-remove-register-of-memory-node-for-generic-target.patch
cachestat-do-not-flush-stats-in-recency-check.patch
memory-tiers-use-default_dram_perf_ref_source-in-log-message.patch
mm-memcg-optimize-parent-iteration-in-memcg_rstat_updated.patch
mm-memory-tier-fix-abstract-distance-calculation-overflow.patch

queue-6.6/acpi-hmat-remove-register-of-memory-node-for-generic-target.patch [new file with mode: 0644]
queue-6.6/cachestat-do-not-flush-stats-in-recency-check.patch [new file with mode: 0644]
queue-6.6/memory-tiers-use-default_dram_perf_ref_source-in-log-message.patch [new file with mode: 0644]
queue-6.6/mm-memcg-optimize-parent-iteration-in-memcg_rstat_updated.patch [new file with mode: 0644]
queue-6.6/mm-memory-tier-fix-abstract-distance-calculation-overflow.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/acpi-hmat-remove-register-of-memory-node-for-generic-target.patch b/queue-6.6/acpi-hmat-remove-register-of-memory-node-for-generic-target.patch
new file mode 100644 (file)
index 0000000..bffb4ff
--- /dev/null
@@ -0,0 +1,57 @@
+From 54b9460b0a28c4c76a7b455ec1b3b61a13e97291 Mon Sep 17 00:00:00 2001
+From: Dave Jiang <dave.jiang@intel.com>
+Date: Fri, 8 Mar 2024 14:59:20 -0700
+Subject: ACPI: HMAT: Remove register of memory node for generic target
+
+From: Dave Jiang <dave.jiang@intel.com>
+
+commit 54b9460b0a28c4c76a7b455ec1b3b61a13e97291 upstream.
+
+For generic targets, there's no reason to call
+register_memory_node_under_compute_node() with the access levels that are
+only visible to HMAT handling code. Only update the attributes and rename
+hmat_register_generic_target_initiators() to hmat_update_generic_target().
+
+The original call path ends up triggering register_memory_node_under_compute_node().
+Although the access level would be "3" and would not impact any current node arrays,
+it introduces unwanted data into the NUMA node access_coordinate array.
+
+Fixes: a3a3e341f169 ("acpi: numa: Add setting of generic port system locality attributes")
+Cc: Rafael J. Wysocki <rafael@kernel.org>
+Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Signed-off-by: Dave Jiang <dave.jiang@intel.com>
+Link: https://lore.kernel.org/r/20240308220055.2172956-2-dave.jiang@intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/acpi/numa/hmat.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/acpi/numa/hmat.c
++++ b/drivers/acpi/numa/hmat.c
+@@ -729,12 +729,12 @@ static void __hmat_register_target_initi
+       }
+ }
+-static void hmat_register_generic_target_initiators(struct memory_target *target)
++static void hmat_update_generic_target(struct memory_target *target)
+ {
+       static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
+-      __hmat_register_target_initiators(target, p_nodes,
+-                                        NODE_ACCESS_CLASS_GENPORT_SINK);
++      hmat_update_target_attrs(target, p_nodes,
++                               NODE_ACCESS_CLASS_GENPORT_SINK);
+ }
+ static void hmat_register_target_initiators(struct memory_target *target)
+@@ -818,7 +818,7 @@ static void hmat_register_target(struct
+        */
+       mutex_lock(&target_lock);
+       if (*(u16 *)target->gen_port_device_handle) {
+-              hmat_register_generic_target_initiators(target);
++              hmat_update_generic_target(target);
+               target->registered = true;
+       }
+       mutex_unlock(&target_lock);
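
For context, a minimal user-space sketch of the pattern the patch above applies: keep a narrow helper that only refreshes cached attributes, separate from the full registration path that also publishes the node into shared arrays. All identifiers below are illustrative stand-ins, not the kernel's HMAT code.

/* Sketch only: a "full registration" path with a visible side effect
 * (publishing the target into a shared array) versus a narrower
 * attribute-only update, which is all a generic target needs. */
#include <stdbool.h>
#include <stdio.h>

struct target {
    int attrs;        /* cached performance attributes */
    bool published;   /* whether the target was registered system-wide */
};

static void update_attrs(struct target *t)     /* role of hmat_update_target_attrs() */
{
    t->attrs = 42;                             /* recompute cached attributes only */
}

static void register_full(struct target *t)    /* role of the old registration path */
{
    update_attrs(t);
    t->published = true;                       /* the side effect generic targets must avoid */
}

int main(void)
{
    struct target generic = { 0, false };
    struct target normal  = { 0, false };

    update_attrs(&generic);                    /* generic target: attributes only, nothing published */
    printf("generic: attrs=%d published=%d\n", generic.attrs, generic.published);

    register_full(&normal);                    /* ordinary memory target: full registration */
    printf("normal:  attrs=%d published=%d\n", normal.attrs, normal.published);
    return 0;
}
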
diff --git a/queue-6.6/cachestat-do-not-flush-stats-in-recency-check.patch b/queue-6.6/cachestat-do-not-flush-stats-in-recency-check.patch
new file mode 100644 (file)
index 0000000..c8739f1
--- /dev/null
@@ -0,0 +1,119 @@
+From 5a4d8944d6b1e1aaaa83ea42c116b520b4ed0394 Mon Sep 17 00:00:00 2001
+From: Nhat Pham <nphamcs@gmail.com>
+Date: Thu, 27 Jun 2024 13:17:37 -0700
+Subject: cachestat: do not flush stats in recency check
+
+From: Nhat Pham <nphamcs@gmail.com>
+
+commit 5a4d8944d6b1e1aaaa83ea42c116b520b4ed0394 upstream.
+
+syzbot detects that cachestat() is flushing stats, which can sleep, in its
+RCU read section (see [1]).  This is done in the workingset_test_recent()
+step (which checks if the folio's eviction is recent).
+
+Move the stat flushing step to before the RCU read section of cachestat,
+and skip stat flushing during the recency check.
+
+[1]: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/
+
+Link: https://lkml.kernel.org/r/20240627201737.3506959-1-nphamcs@gmail.com
+Fixes: b00684722262 ("mm: workingset: move the stats flush into workingset_test_recent()")
+Signed-off-by: Nhat Pham <nphamcs@gmail.com>
+Reported-by: syzbot+b7f13b2d0cc156edf61a@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/
+Debugged-by: Johannes Weiner <hannes@cmpxchg.org>
+Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kairui Song <kasong@tencent.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Yosry Ahmed <yosryahmed@google.com>
+Cc: <stable@vger.kernel.org>   [6.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/swap.h |    3 ++-
+ mm/filemap.c         |    5 ++++-
+ mm/workingset.c      |   14 +++++++++++---
+ 3 files changed, 17 insertions(+), 5 deletions(-)
+
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -343,7 +343,8 @@ static inline swp_entry_t page_swap_entr
+ }
+ /* linux/mm/workingset.c */
+-bool workingset_test_recent(void *shadow, bool file, bool *workingset);
++bool workingset_test_recent(void *shadow, bool file, bool *workingset,
++                              bool flush);
+ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
+ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
+ void workingset_refault(struct folio *folio, void *shadow);
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -4210,6 +4210,9 @@ static void filemap_cachestat(struct add
+       XA_STATE(xas, &mapping->i_pages, first_index);
+       struct folio *folio;
++      /* Flush stats (and potentially sleep) outside the RCU read section. */
++      mem_cgroup_flush_stats_ratelimited(NULL);
++
+       rcu_read_lock();
+       xas_for_each(&xas, folio, last_index) {
+               int order;
+@@ -4273,7 +4276,7 @@ static void filemap_cachestat(struct add
+                                       goto resched;
+                       }
+ #endif
+-                      if (workingset_test_recent(shadow, true, &workingset))
++                      if (workingset_test_recent(shadow, true, &workingset, false))
+                               cs->nr_recently_evicted += nr_pages;
+                       goto resched;
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -411,10 +411,12 @@ void *workingset_eviction(struct folio *
+  * @file: whether the corresponding folio is from the file lru.
+  * @workingset: where the workingset value unpacked from shadow should
+  * be stored.
++ * @flush: whether to flush cgroup rstat.
+  *
+  * Return: true if the shadow is for a recently evicted folio; false otherwise.
+  */
+-bool workingset_test_recent(void *shadow, bool file, bool *workingset)
++bool workingset_test_recent(void *shadow, bool file, bool *workingset,
++                              bool flush)
+ {
+       struct mem_cgroup *eviction_memcg;
+       struct lruvec *eviction_lruvec;
+@@ -466,10 +468,16 @@ bool workingset_test_recent(void *shadow
+       /*
+        * Flush stats (and potentially sleep) outside the RCU read section.
++       *
++       * Note that workingset_test_recent() itself might be called in RCU read
++       * section (for e.g, in cachestat) - these callers need to skip flushing
++       * stats (via the flush argument).
++       *
+        * XXX: With per-memcg flushing and thresholding, is ratelimiting
+        * still needed here?
+        */
+-      mem_cgroup_flush_stats_ratelimited(eviction_memcg);
++      if (flush)
++              mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+       eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+       refault = atomic_long_read(&eviction_lruvec->nonresident_age);
+@@ -557,7 +565,7 @@ void workingset_refault(struct folio *fo
+       mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+-      if (!workingset_test_recent(shadow, file, &workingset))
++      if (!workingset_test_recent(shadow, file, &workingset, true))
+               return;
+       folio_set_active(folio);
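
A minimal user-space sketch of the calling convention introduced by the patch above: the potentially sleeping flush is done once, before the non-sleepable (RCU-like) section, and the shared recency helper takes a flush flag so callers already inside such a section can skip it. All names below are illustrative stand-ins, not the kernel functions.

/* Sketch only: hoist the sleeping work out of the non-sleepable section
 * and let the shared helper skip it on request. */
#include <stdbool.h>
#include <stdio.h>

static void flush_stats(void)                 /* role of mem_cgroup_flush_stats_ratelimited() */
{
    puts("flushing stats (may sleep in the kernel)");
}

static bool test_recent(bool flush)           /* role of workingset_test_recent() */
{
    if (flush)
        flush_stats();                        /* only when the caller is allowed to sleep */
    return true;                              /* pretend the entry was recently evicted */
}

int main(void)
{
    bool recent;

    /* cachestat-style caller: flush up front, then enter the
     * non-sleepable section and pass flush=false. */
    flush_stats();
    /* rcu_read_lock(); */
    recent = test_recent(false);
    /* rcu_read_unlock(); */

    /* refault-style caller: not inside an RCU section, may flush itself. */
    recent = test_recent(true);
    return recent ? 0 : 1;
}
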
diff --git a/queue-6.6/memory-tiers-use-default_dram_perf_ref_source-in-log-message.patch b/queue-6.6/memory-tiers-use-default_dram_perf_ref_source-in-log-message.patch
new file mode 100644 (file)
index 0000000..ecc3410
--- /dev/null
@@ -0,0 +1,62 @@
+From a530bbc53826c607f64e8ee466c3351efaf6aea5 Mon Sep 17 00:00:00 2001
+From: Huang Ying <ying.huang@intel.com>
+Date: Fri, 20 Sep 2024 09:47:40 +0800
+Subject: memory tiers: use default_dram_perf_ref_source in log message
+
+From: Ying Huang <ying.huang@intel.com>
+
+commit a530bbc53826c607f64e8ee466c3351efaf6aea5 upstream.
+
+Commit 3718c02dbd4c ("acpi, hmat: calculate abstract distance with HMAT")
+added a default_dram_perf_ref_source variable that was initialized but
+never used.  This causes kmemleak to report the following memory leak:
+
+unreferenced object 0xff11000225a47b60 (size 16):
+  comm "swapper/0", pid 1, jiffies 4294761654
+  hex dump (first 16 bytes):
+    41 43 50 49 20 48 4d 41 54 00 c1 4b 7d b7 75 7c  ACPI HMAT..K}.u|
+  backtrace (crc e6d0e7b2):
+    [<ffffffff95d5afdb>] __kmalloc_node_track_caller_noprof+0x36b/0x440
+    [<ffffffff95c276d6>] kstrdup+0x36/0x60
+    [<ffffffff95dfabfa>] mt_set_default_dram_perf+0x23a/0x2c0
+    [<ffffffff9ad64733>] hmat_init+0x2b3/0x660
+    [<ffffffff95203cec>] do_one_initcall+0x11c/0x5c0
+    [<ffffffff9ac9cfc4>] do_initcalls+0x1b4/0x1f0
+    [<ffffffff9ac9d52e>] kernel_init_freeable+0x4ae/0x520
+    [<ffffffff97c789cc>] kernel_init+0x1c/0x150
+    [<ffffffff952aecd1>] ret_from_fork+0x31/0x70
+    [<ffffffff9520b18a>] ret_from_fork_asm+0x1a/0x30
+
+This reminds us that we forgot to use the performance data source
+information.  So, use the variable in the error log message to help
+identify the root cause of an inconsistent performance number.
+
+Link: https://lkml.kernel.org/r/87y13mvo0n.fsf@yhuang6-desk2.ccr.corp.intel.com
+Fixes: 3718c02dbd4c ("acpi, hmat: calculate abstract distance with HMAT")
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reported-by: Waiman Long <longman@redhat.com>
+Acked-by: Waiman Long <longman@redhat.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Dave Jiang <dave.jiang@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory-tiers.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/memory-tiers.c
++++ b/mm/memory-tiers.c
+@@ -649,10 +649,10 @@ int mt_set_default_dram_perf(int nid, st
+               pr_info(
+ "memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
+ "DRAM node %d.\n", nid, default_dram_perf_ref_nid);
+-              pr_info("  performance of reference DRAM node %d:\n",
+-                      default_dram_perf_ref_nid);
++              pr_info("  performance of reference DRAM node %d from %s:\n",
++                      default_dram_perf_ref_nid, default_dram_perf_ref_source);
+               dump_hmem_attrs(&default_dram_perf, "    ");
+-              pr_info("  performance of DRAM node %d:\n", nid);
++              pr_info("  performance of DRAM node %d from %s:\n", nid, source);
+               dump_hmem_attrs(perf, "    ");
+               pr_info(
+ "  disable default DRAM node performance based abstract distance algorithm.\n");
diff --git a/queue-6.6/mm-memcg-optimize-parent-iteration-in-memcg_rstat_updated.patch b/queue-6.6/mm-memcg-optimize-parent-iteration-in-memcg_rstat_updated.patch
new file mode 100644 (file)
index 0000000..1777283
--- /dev/null
@@ -0,0 +1,202 @@
+From 9cee7e8ef3e31ca25b40ca52b8585dc6935deff2 Mon Sep 17 00:00:00 2001
+From: Yosry Ahmed <yosryahmed@google.com>
+Date: Wed, 24 Jan 2024 10:00:22 +0000
+Subject: mm: memcg: optimize parent iteration in memcg_rstat_updated()
+
+From: Yosry Ahmed <yosryahmed@google.com>
+
+commit 9cee7e8ef3e31ca25b40ca52b8585dc6935deff2 upstream.
+
+In memcg_rstat_updated(), we iterate the memcg being updated and its
+parents to update memcg->vmstats_percpu->stats_updates in the fast path
+(i.e. no atomic updates). According to my math, this is 3 memory loads
+(and potentially 3 cache misses) per memcg:
+- Load the address of memcg->vmstats_percpu.
+- Load vmstats_percpu->stats_updates (based on some percpu calculation).
+- Load the address of the parent memcg.
+
+Avoid most of the cache misses by caching a pointer from each struct
+memcg_vmstats_percpu to its parent on the corresponding CPU. In this
+case, for the first memcg we have 2 memory loads (same as above):
+- Load the address of memcg->vmstats_percpu.
+- Load vmstats_percpu->stats_updates (based on some percpu calculation).
+
+Then for each additional memcg, we need a single load to get the
+parent's stats_updates directly. This reduces the number of loads from
+O(3N) to O(2+N) -- where N is the number of memcgs we need to iterate.
+
+Additionally, stash a pointer to memcg->vmstats in each struct
+memcg_vmstats_percpu such that we can access the atomic counter that all
+CPUs fold into, memcg->vmstats->stats_updates.
+memcg_should_flush_stats() is changed to memcg_vmstats_needs_flush() to
+accept a struct memcg_vmstats pointer accordingly.
+
+In struct memcg_vmstats_percpu, make sure both pointers together with
+stats_updates live on the same cacheline. Finally, update
+mem_cgroup_alloc() to take in a parent pointer and initialize the new
+cache pointers on each CPU. The percpu loop in mem_cgroup_alloc() may
+look concerning, but there are multiple similar loops in the cgroup
+creation path (e.g. cgroup_rstat_init()), most of which are hidden
+within alloc_percpu().
+
+According to Oliver's testing [1], this fixes multiple 30-38%
+regressions in vm-scalability, will-it-scale-tlb_flush2, and
+will-it-scale-fallocate1. This comes at a cost of 2 more pointers per
+CPU (<2KB on a machine with 128 CPUs).
+
+[1] https://lore.kernel.org/lkml/ZbDJsfsZt2ITyo61@xsang-OptiPlex-9020/
+
+[yosryahmed@google.com: fix struct memcg_vmstats_percpu size and alignment]
+  Link: https://lkml.kernel.org/r/20240203044612.1234216-1-yosryahmed@google.com
+Link: https://lkml.kernel.org/r/20240124100023.660032-1-yosryahmed@google.com
+Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
+Fixes: 8d59d2214c23 ("mm: memcg: make stats flushing threshold per-memcg")
+Tested-by: kernel test robot <oliver.sang@intel.com>
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202401221624.cb53a8ca-oliver.sang@intel.com
+Acked-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Greg Thelen <gthelen@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |   56 +++++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 35 insertions(+), 21 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -617,6 +617,15 @@ static inline int memcg_events_index(enu
+ }
+ struct memcg_vmstats_percpu {
++      /* Stats updates since the last flush */
++      unsigned int                    stats_updates;
++
++      /* Cached pointers for fast iteration in memcg_rstat_updated() */
++      struct memcg_vmstats_percpu     *parent;
++      struct memcg_vmstats            *vmstats;
++
++      /* The above should fit a single cacheline for memcg_rstat_updated() */
++
+       /* Local (CPU and cgroup) page state & events */
+       long                    state[MEMCG_NR_STAT];
+       unsigned long           events[NR_MEMCG_EVENTS];
+@@ -628,10 +637,7 @@ struct memcg_vmstats_percpu {
+       /* Cgroup1: threshold notifications & softlimit tree updates */
+       unsigned long           nr_page_events;
+       unsigned long           targets[MEM_CGROUP_NTARGETS];
+-
+-      /* Stats updates since the last flush */
+-      unsigned int            stats_updates;
+-};
++} ____cacheline_aligned;
+ struct memcg_vmstats {
+       /* Aggregated (CPU and subtree) page state & events */
+@@ -694,36 +700,35 @@ static void memcg_stats_unlock(void)
+ }
+-static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
++static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
+ {
+-      return atomic64_read(&memcg->vmstats->stats_updates) >
++      return atomic64_read(&vmstats->stats_updates) >
+               MEMCG_CHARGE_BATCH * num_online_cpus();
+ }
+ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+ {
++      struct memcg_vmstats_percpu *statc;
+       int cpu = smp_processor_id();
+-      unsigned int x;
+       if (!val)
+               return;
+       cgroup_rstat_updated(memcg->css.cgroup, cpu);
+-
+-      for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+-              x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
+-                                        abs(val));
+-
+-              if (x < MEMCG_CHARGE_BATCH)
++      statc = this_cpu_ptr(memcg->vmstats_percpu);
++      for (; statc; statc = statc->parent) {
++              statc->stats_updates += abs(val);
++              if (statc->stats_updates < MEMCG_CHARGE_BATCH)
+                       continue;
+               /*
+                * If @memcg is already flush-able, increasing stats_updates is
+                * redundant. Avoid the overhead of the atomic update.
+                */
+-              if (!memcg_should_flush_stats(memcg))
+-                      atomic64_add(x, &memcg->vmstats->stats_updates);
+-              __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
++              if (!memcg_vmstats_needs_flush(statc->vmstats))
++                      atomic64_add(statc->stats_updates,
++                                   &statc->vmstats->stats_updates);
++              statc->stats_updates = 0;
+       }
+ }
+@@ -752,7 +757,7 @@ void mem_cgroup_flush_stats(struct mem_c
+       if (!memcg)
+               memcg = root_mem_cgroup;
+-      if (memcg_should_flush_stats(memcg))
++      if (memcg_vmstats_needs_flush(memcg->vmstats))
+               do_flush_stats(memcg);
+ }
+@@ -766,7 +771,7 @@ void mem_cgroup_flush_stats_ratelimited(
+ static void flush_memcg_stats_dwork(struct work_struct *w)
+ {
+       /*
+-       * Deliberately ignore memcg_should_flush_stats() here so that flushing
++       * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
+        * in latency-sensitive paths is as cheap as possible.
+        */
+       do_flush_stats(root_mem_cgroup);
+@@ -5328,10 +5333,11 @@ static void mem_cgroup_free(struct mem_c
+       __mem_cgroup_free(memcg);
+ }
+-static struct mem_cgroup *mem_cgroup_alloc(void)
++static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
+ {
++      struct memcg_vmstats_percpu *statc, *pstatc;
+       struct mem_cgroup *memcg;
+-      int node;
++      int node, cpu;
+       int __maybe_unused i;
+       long error = -ENOMEM;
+@@ -5354,6 +5360,14 @@ static struct mem_cgroup *mem_cgroup_all
+       if (!memcg->vmstats_percpu)
+               goto fail;
++      for_each_possible_cpu(cpu) {
++              if (parent)
++                      pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
++              statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
++              statc->parent = parent ? pstatc : NULL;
++              statc->vmstats = memcg->vmstats;
++      }
++
+       for_each_node(node)
+               if (alloc_mem_cgroup_per_node_info(memcg, node))
+                       goto fail;
+@@ -5399,7 +5413,7 @@ mem_cgroup_css_alloc(struct cgroup_subsy
+       struct mem_cgroup *memcg, *old_memcg;
+       old_memcg = set_active_memcg(parent);
+-      memcg = mem_cgroup_alloc();
++      memcg = mem_cgroup_alloc(parent);
+       set_active_memcg(old_memcg);
+       if (IS_ERR(memcg))
+               return ERR_CAST(memcg);
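
A minimal user-space sketch of the iteration scheme the patch above introduces: each per-CPU stats structure carries a direct pointer to its parent's per-CPU structure, so walking up the hierarchy is one pointer chase per level instead of re-deriving the parent memcg and its per-CPU address at every step. The names and the batch value below are illustrative stand-ins.

/* Sketch only: cached-parent iteration with a per-level batch threshold. */
#include <stdio.h>
#include <stdlib.h>

#define BATCH 64                              /* role of MEMCG_CHARGE_BATCH */

struct pcpu_stats {
    unsigned int stats_updates;               /* updates since the last fold */
    struct pcpu_stats *parent;                /* cached at creation time, NULL at the root */
};

static void rstat_updated(struct pcpu_stats *statc, int val)
{
    for (; statc; statc = statc->parent) {    /* one load per level, no re-derivation */
        statc->stats_updates += abs(val);
        if (statc->stats_updates < BATCH)
            continue;
        /* in the kernel this folds into an atomic64 counter; here we just report it */
        printf("folding %u pending updates\n", statc->stats_updates);
        statc->stats_updates = 0;
    }
}

int main(void)
{
    struct pcpu_stats root  = { 0, NULL };
    struct pcpu_stats child = { 0, &root };   /* parent pointer wired up as in mem_cgroup_alloc() */

    for (int i = 0; i < 100; i++)
        rstat_updated(&child, 1);
    return 0;
}
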
diff --git a/queue-6.6/mm-memory-tier-fix-abstract-distance-calculation-overflow.patch b/queue-6.6/mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
new file mode 100644 (file)
index 0000000..3ab13a3
--- /dev/null
@@ -0,0 +1,65 @@
+From cce35103135c7ffc7bebc32ebfc74fe1f2c3cb5d Mon Sep 17 00:00:00 2001
+From: Li Zhijian <lizhijian@fujitsu.com>
+Date: Tue, 10 Jun 2025 14:27:51 +0800
+Subject: mm/memory-tier: fix abstract distance calculation overflow
+
+From: Li Zhijian <lizhijian@fujitsu.com>
+
+commit cce35103135c7ffc7bebc32ebfc74fe1f2c3cb5d upstream.
+
+In mt_perf_to_adistance(), the calculation of abstract distance (adist)
+involves multiplying several int values including
+MEMTIER_ADISTANCE_DRAM.
+
+*adist = MEMTIER_ADISTANCE_DRAM *
+               (perf->read_latency + perf->write_latency) /
+               (default_dram_perf.read_latency + default_dram_perf.write_latency) *
+               (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
+               (perf->read_bandwidth + perf->write_bandwidth);
+
+Since these values can be large, the multiplication may exceed the
+maximum value of an int (INT_MAX) and overflow (our platform hit this),
+leading to an incorrect adist.
+
+User-visible impact:
+The memory tiering subsystem will misinterpret slow memory (like CXL)
+as faster than DRAM, causing inappropriate demotion of pages from
+CXL (slow memory) to DRAM (fast memory).
+
+For example, we will see the following demotion chains in the dmesg, where
+Nodes 0-1 are DRAM and Nodes 2-3 are CXL nodes:
+ Demotion targets for Node 0: null
+ Demotion targets for Node 1: null
+ Demotion targets for Node 2: preferred: 0-1, fallback: 0-1
+ Demotion targets for Node 3: preferred: 0-1, fallback: 0-1
+
+Change MEMTIER_ADISTANCE_DRAM to be a long constant by writing it with
+the 'L' suffix.  This prevents the overflow because the multiplication
+will then be done in the long type which has a larger range.
+
+Link: https://lkml.kernel.org/r/20250611023439.2845785-1-lizhijian@fujitsu.com
+Link: https://lkml.kernel.org/r/20250610062751.2365436-1-lizhijian@fujitsu.com
+Fixes: 3718c02dbd4c ("acpi, hmat: calculate abstract distance with HMAT")
+Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
+Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
+Acked-by: Balbir Singh <balbirs@nvidia.com>
+Reviewed-by: Donet Tom <donettom@linux.ibm.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memory-tiers.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/memory-tiers.h
++++ b/include/linux/memory-tiers.h
+@@ -18,7 +18,7 @@
+  * adistance value (slightly faster) than default DRAM adistance to be part of
+  * the same memory tier.
+  */
+-#define MEMTIER_ADISTANCE_DRAM        ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
++#define MEMTIER_ADISTANCE_DRAM        ((4L * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
+ struct memory_tier;
+ struct memory_dev_type {
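
A stand-alone demonstration of the overflow described in the patch above, using made-up performance figures; only the '4' versus '4L' in the constant differs between the two calculations.

/* Sketch only: the same left-to-right expression evaluated in int versus
 * long arithmetic. The performance numbers are invented for illustration. */
#include <stdio.h>

#define MEMTIER_CHUNK_SIZE   (1 << 8)   /* 256, as in include/linux/memory-tiers.h */
#define ADIST_DRAM_INT   ((4  * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))  /* int, 1152 */
#define ADIST_DRAM_LONG  ((4L * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))  /* long, 1152 */

int main(void)
{
    int lat = 1200, dram_lat = 200;       /* read+write latency sums, ns (hypothetical) */
    int bw = 40000, dram_bw = 400000;     /* read+write bandwidth sums, MB/s (hypothetical) */

    /* 1152 * 1200 / 200 = 6912; 6912 * 400000 = 2764800000 > INT_MAX, so the
     * int version overflows (formally undefined, typically wraps to a bogus
     * negative value). */
    int  adist_int  = ADIST_DRAM_INT  * lat / dram_lat * dram_bw / bw;
    /* With the 4L constant every step is done in long (64-bit on LP64),
     * giving 69120: correctly much slower than DRAM's 1152. */
    long adist_long = ADIST_DRAM_LONG * lat / dram_lat * dram_bw / bw;

    printf("int arithmetic:  %d\n", adist_int);
    printf("long arithmetic: %ld\n", adist_long);
    return 0;
}
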
diff --git a/queue-6.6/series b/queue-6.6/series
index 9f8c27234627d048b1725e1bc4ad755db6b784da..c95a5fac44e1a448b1d46537395d78b4ea5238bf 100644 (file)
--- a/queue-6.6/series
@@ -521,3 +521,8 @@ mm-truncate-unmap-large-folio-on-split-failure.patch
 mm-secretmem-fix-use-after-free-race-in-fault-handler.patch
 isdn-misdn-hfcsusb-fix-memory-leak-in-hfcsusb_probe.patch
 net-netpoll-ensure-skb_pool-list-is-always-initialized.patch
+cachestat-do-not-flush-stats-in-recency-check.patch
+memory-tiers-use-default_dram_perf_ref_source-in-log-message.patch
+mm-memory-tier-fix-abstract-distance-calculation-overflow.patch
+mm-memcg-optimize-parent-iteration-in-memcg_rstat_updated.patch
+acpi-hmat-remove-register-of-memory-node-for-generic-target.patch