git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.6
author Sasha Levin <sashal@kernel.org>
Mon, 29 Apr 2024 01:53:52 +0000 (21:53 -0400)
committer Sasha Levin <sashal@kernel.org>
Mon, 29 Apr 2024 01:53:52 +0000 (21:53 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
12 files changed:
queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch [new file with mode: 0644]
queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch [new file with mode: 0644]
queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch [new file with mode: 0644]
queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch [new file with mode: 0644]
queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch [new file with mode: 0644]
queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch [new file with mode: 0644]
queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch [new file with mode: 0644]
queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch [new file with mode: 0644]
queue-6.6/mm-treewide-introduce-nr_page_orders.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch [new file with mode: 0644]
queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch [new file with mode: 0644]

diff --git a/queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch b/queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch
new file mode 100644 (file)
index 0000000..692c36c
--- /dev/null
@@ -0,0 +1,59 @@
+From 20d98716f2ddccaf0c08ef3edeb921836b32d8d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 12 Feb 2024 16:04:24 -0500
+Subject: drm: add drm_gem_object_is_shared_for_memory_stats() helper
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+[ Upstream commit b31f5eba32ae8cc28e7cfa5a55ec8670d8c718e2 ]
+
+Add a helper so that drm drivers can consistently report
+shared status via the fdinfo shared memory stats interface.
+
+In addition to handle count, show buffers as shared if they
+are shared via dma-buf as well (e.g., shared with v4l or some
+other subsystem).
+
+v2: switch to inline function
+
+Link: https://lore.kernel.org/all/20231207180225.439482-1-alexander.deucher@amd.com/
+Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v1)
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-by: Christian König <christian.keonig@amd.com>
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Stable-dep-of: a6ff969fe9cb ("drm/amdgpu: fix visible VRAM handling during faults")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/drm/drm_gem.h | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
+index bc9f6aa2f3fec..7c2ec139c464a 100644
+--- a/include/drm/drm_gem.h
++++ b/include/drm/drm_gem.h
+@@ -544,6 +544,19 @@ unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru,
+ int drm_gem_evict(struct drm_gem_object *obj);
++/**
++ * drm_gem_object_is_shared_for_memory_stats - helper for shared memory stats
++ *
++ * This helper should only be used for fdinfo shared memory stats to determine
++ * if a GEM object is shared.
++ *
++ * @obj: obj in question
++ */
++static inline bool drm_gem_object_is_shared_for_memory_stats(struct drm_gem_object *obj)
++{
++      return (obj->handle_count > 1) || obj->dma_buf;
++}
++
+ #ifdef CONFIG_LOCKDEP
+ /**
+  * drm_gem_gpuva_set_lock() - Set the lock protecting accesses to the gpuva list.
+-- 
+2.43.0
+
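The helper above is intentionally cheap: a buffer counts as shared once it has more than one GEM handle or has been exported as a dma-buf. A minimal sketch of how a driver's fdinfo accounting might consume it (the my_driver_* names are hypothetical and not taken from these patches):

#include <drm/drm_gem.h>

struct my_driver_stats {
        u64 shared;     /* bytes reachable from other clients/devices */
        u64 private;    /* bytes only reachable through this file */
};

static void my_driver_bo_account(struct drm_gem_object *obj,
                                 struct my_driver_stats *stats)
{
        /* Shared: more than one userspace handle, or exported via dma-buf. */
        if (drm_gem_object_is_shared_for_memory_stats(obj))
                stats->shared += obj->size;
        else
                stats->private += obj->size;
}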
diff --git a/queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch b/queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch
new file mode 100644 (file)
index 0000000..9eb645b
--- /dev/null
@@ -0,0 +1,112 @@
+From 414f410b10b6323ad6499db3bafb65b241dfa7c6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 12 Feb 2024 16:04:26 -0500
+Subject: drm/amdgpu: add shared fdinfo stats
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+[ Upstream commit ba1a58d5b907bdf1814f8f57434aebc86233430f ]
+
+Add shared stats.  Useful for seeing shared memory.
+
+v2: take dma-buf into account as well
+v3: use the new gem helper
+
+Link: https://lore.kernel.org/all/20231207180225.439482-1-alexander.deucher@amd.com/
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: Rob Clark <robdclark@gmail.com>
+Reviewed-by: Christian König <christian.keonig@amd.com>
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Stable-dep-of: a6ff969fe9cb ("drm/amdgpu: fix visible VRAM handling during faults")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c |  4 ++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 11 +++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  6 ++++++
+ 3 files changed, 21 insertions(+)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
+index 6038b5021b27b..792c059ff7b35 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
+@@ -105,6 +105,10 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file)
+                  stats.requested_visible_vram/1024UL);
+       drm_printf(p, "amd-requested-gtt:\t%llu KiB\n",
+                  stats.requested_gtt/1024UL);
++      drm_printf(p, "drm-shared-vram:\t%llu KiB\n", stats.vram_shared/1024UL);
++      drm_printf(p, "drm-shared-gtt:\t%llu KiB\n", stats.gtt_shared/1024UL);
++      drm_printf(p, "drm-shared-cpu:\t%llu KiB\n", stats.cpu_shared/1024UL);
++
+       for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
+               if (!usage[hw_ip])
+                       continue;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+index 173b43a5aa13b..394f475877e3b 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+@@ -1281,25 +1281,36 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
+                         struct amdgpu_mem_stats *stats)
+ {
+       uint64_t size = amdgpu_bo_size(bo);
++      struct drm_gem_object *obj;
+       unsigned int domain;
++      bool shared;
+       /* Abort if the BO doesn't currently have a backing store */
+       if (!bo->tbo.resource)
+               return;
++      obj = &bo->tbo.base;
++      shared = drm_gem_object_is_shared_for_memory_stats(obj);
++
+       domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
+       switch (domain) {
+       case AMDGPU_GEM_DOMAIN_VRAM:
+               stats->vram += size;
+               if (amdgpu_bo_in_cpu_visible_vram(bo))
+                       stats->visible_vram += size;
++              if (shared)
++                      stats->vram_shared += size;
+               break;
+       case AMDGPU_GEM_DOMAIN_GTT:
+               stats->gtt += size;
++              if (shared)
++                      stats->gtt_shared += size;
+               break;
+       case AMDGPU_GEM_DOMAIN_CPU:
+       default:
+               stats->cpu += size;
++              if (shared)
++                      stats->cpu_shared += size;
+               break;
+       }
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+index a3ea8a82db23a..be679c42b0b8c 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+@@ -138,12 +138,18 @@ struct amdgpu_bo_vm {
+ struct amdgpu_mem_stats {
+       /* current VRAM usage, includes visible VRAM */
+       uint64_t vram;
++      /* current shared VRAM usage, includes visible VRAM */
++      uint64_t vram_shared;
+       /* current visible VRAM usage */
+       uint64_t visible_vram;
+       /* current GTT usage */
+       uint64_t gtt;
++      /* current shared GTT usage */
++      uint64_t gtt_shared;
+       /* current system memory usage */
+       uint64_t cpu;
++      /* current shared system memory usage */
++      uint64_t cpu_shared;
+       /* sum of evicted buffers, includes visible VRAM */
+       uint64_t evicted_vram;
+       /* sum of evicted buffers due to CPU access */
+-- 
+2.43.0
+
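With these keys in place, per-client shared usage shows up in /proc/<pid>/fdinfo/<fd> next to the existing drm-* and amd-* lines, reported in KiB. A rough userspace sketch of reading them (illustrative only; the function is not part of any existing tool):

#include <stdio.h>
#include <string.h>

static void print_shared_stats(int pid, int fd)
{
        char path[64], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/fdinfo/%d", pid, fd);
        f = fopen(path, "r");
        if (!f)
                return;

        while (fgets(line, sizeof(line), f)) {
                /* e.g. "drm-shared-vram:\t1024 KiB" */
                if (!strncmp(line, "drm-shared-", 11))
                        fputs(line, stdout);
        }
        fclose(f);
}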
diff --git a/queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch b/queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch
new file mode 100644 (file)
index 0000000..4f64baf
--- /dev/null
@@ -0,0 +1,283 @@
+From 18ea34ca16ae5fcde5f879e412836c1a81d28914 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 4 Apr 2024 16:25:40 +0200
+Subject: drm/amdgpu: fix visible VRAM handling during faults
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Christian König <christian.koenig@amd.com>
+
+[ Upstream commit a6ff969fe9cbf369e3cd0ac54261fec1122682ec ]
+
+When we removed the hacky start code check we actually didn't take into
+account that *all* VRAM pages need to be CPU accessible.
+
+Clean up the code and unify the handling into a single helper which
+checks if the whole resource is CPU accessible.
+
+The only place where a partial check would make sense is during
+eviction, but that is negligible.
+
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2")
+Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+CC: stable@vger.kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 ++++----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 --------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 61 ++++++++++++++--------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h    |  3 ++
+ 5 files changed, 53 insertions(+), 57 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+index c0a3afe81bb1a..4294f5e7bff9a 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+@@ -819,7 +819,7 @@ static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
+       p->bytes_moved += ctx.bytes_moved;
+       if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
+-          amdgpu_bo_in_cpu_visible_vram(bo))
++          amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+               p->bytes_moved_vis += ctx.bytes_moved;
+       if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+index 394f475877e3b..361f2cc94e8e5 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+@@ -625,8 +625,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
+               return r;
+       if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
+-          bo->tbo.resource->mem_type == TTM_PL_VRAM &&
+-          amdgpu_bo_in_cpu_visible_vram(bo))
++          amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+               amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved,
+                                            ctx.bytes_moved);
+       else
+@@ -1280,23 +1279,25 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
+ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
+                         struct amdgpu_mem_stats *stats)
+ {
++      struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
++      struct ttm_resource *res = bo->tbo.resource;
+       uint64_t size = amdgpu_bo_size(bo);
+       struct drm_gem_object *obj;
+       unsigned int domain;
+       bool shared;
+       /* Abort if the BO doesn't currently have a backing store */
+-      if (!bo->tbo.resource)
++      if (!res)
+               return;
+       obj = &bo->tbo.base;
+       shared = drm_gem_object_is_shared_for_memory_stats(obj);
+-      domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
++      domain = amdgpu_mem_type_to_domain(res->mem_type);
+       switch (domain) {
+       case AMDGPU_GEM_DOMAIN_VRAM:
+               stats->vram += size;
+-              if (amdgpu_bo_in_cpu_visible_vram(bo))
++              if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+                       stats->visible_vram += size;
+               if (shared)
+                       stats->vram_shared += size;
+@@ -1395,10 +1396,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo)
+       /* Remember that this BO was accessed by the CPU */
+       abo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+-      if (bo->resource->mem_type != TTM_PL_VRAM)
+-              return 0;
+-
+-      if (amdgpu_bo_in_cpu_visible_vram(abo))
++      if (amdgpu_res_cpu_visible(adev, bo->resource))
+               return 0;
+       /* Can't move a pinned BO to visible VRAM */
+@@ -1422,7 +1420,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo)
+       /* this should never happen */
+       if (bo->resource->mem_type == TTM_PL_VRAM &&
+-          !amdgpu_bo_in_cpu_visible_vram(abo))
++          !amdgpu_res_cpu_visible(adev, bo->resource))
+               return VM_FAULT_SIGBUS;
+       ttm_bo_move_to_lru_tail_unlocked(bo);
+@@ -1582,6 +1580,7 @@ uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
+  */
+ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m)
+ {
++      struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+       struct dma_buf_attachment *attachment;
+       struct dma_buf *dma_buf;
+       const char *placement;
+@@ -1590,10 +1589,11 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m)
+       if (dma_resv_trylock(bo->tbo.base.resv)) {
+               unsigned int domain;
++
+               domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
+               switch (domain) {
+               case AMDGPU_GEM_DOMAIN_VRAM:
+-                      if (amdgpu_bo_in_cpu_visible_vram(bo))
++                      if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+                               placement = "VRAM VISIBLE";
+                       else
+                               placement = "VRAM";
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+index be679c42b0b8c..fa03d9e4874cc 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+@@ -250,28 +250,6 @@ static inline u64 amdgpu_bo_mmap_offset(struct amdgpu_bo *bo)
+       return drm_vma_node_offset_addr(&bo->tbo.base.vma_node);
+ }
+-/**
+- * amdgpu_bo_in_cpu_visible_vram - check if BO is (partly) in visible VRAM
+- */
+-static inline bool amdgpu_bo_in_cpu_visible_vram(struct amdgpu_bo *bo)
+-{
+-      struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+-      struct amdgpu_res_cursor cursor;
+-
+-      if (!bo->tbo.resource || bo->tbo.resource->mem_type != TTM_PL_VRAM)
+-              return false;
+-
+-      amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
+-      while (cursor.remaining) {
+-              if (cursor.start < adev->gmc.visible_vram_size)
+-                      return true;
+-
+-              amdgpu_res_next(&cursor, cursor.size);
+-      }
+-
+-      return false;
+-}
+-
+ /**
+  * amdgpu_bo_explicit_sync - return whether the bo is explicitly synced
+  */
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+index 1124e2d4f8530..d1687b5725693 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+@@ -137,7 +137,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo,
+                       amdgpu_bo_placement_from_domain(abo, AMDGPU_GEM_DOMAIN_CPU);
+               } else if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
+                          !(abo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) &&
+-                         amdgpu_bo_in_cpu_visible_vram(abo)) {
++                         amdgpu_res_cpu_visible(adev, bo->resource)) {
+                       /* Try evicting to the CPU inaccessible part of VRAM
+                        * first, but only set GTT as busy placement, so this
+@@ -408,40 +408,55 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
+       return r;
+ }
+-/*
+- * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy
++/**
++ * amdgpu_res_cpu_visible - Check that resource can be accessed by CPU
++ * @adev: amdgpu device
++ * @res: the resource to check
+  *
+- * Called by amdgpu_bo_move()
++ * Returns: true if the full resource is CPU visible, false otherwise.
+  */
+-static bool amdgpu_mem_visible(struct amdgpu_device *adev,
+-                             struct ttm_resource *mem)
++bool amdgpu_res_cpu_visible(struct amdgpu_device *adev,
++                          struct ttm_resource *res)
+ {
+-      u64 mem_size = (u64)mem->size;
+       struct amdgpu_res_cursor cursor;
+-      u64 end;
+-      if (mem->mem_type == TTM_PL_SYSTEM ||
+-          mem->mem_type == TTM_PL_TT)
++      if (!res)
++              return false;
++
++      if (res->mem_type == TTM_PL_SYSTEM || res->mem_type == TTM_PL_TT ||
++          res->mem_type == AMDGPU_PL_PREEMPT)
+               return true;
+-      if (mem->mem_type != TTM_PL_VRAM)
++
++      if (res->mem_type != TTM_PL_VRAM)
+               return false;
+-      amdgpu_res_first(mem, 0, mem_size, &cursor);
+-      end = cursor.start + cursor.size;
++      amdgpu_res_first(res, 0, res->size, &cursor);
+       while (cursor.remaining) {
++              if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size)
++                      return false;
+               amdgpu_res_next(&cursor, cursor.size);
++      }
+-              if (!cursor.remaining)
+-                      break;
++      return true;
++}
+-              /* ttm_resource_ioremap only supports contiguous memory */
+-              if (end != cursor.start)
+-                      return false;
++/*
++ * amdgpu_res_copyable - Check that memory can be accessed by ttm_bo_move_memcpy
++ *
++ * Called by amdgpu_bo_move()
++ */
++static bool amdgpu_res_copyable(struct amdgpu_device *adev,
++                              struct ttm_resource *mem)
++{
++      if (!amdgpu_res_cpu_visible(adev, mem))
++              return false;
+-              end = cursor.start + cursor.size;
+-      }
++      /* ttm_resource_ioremap only supports contiguous memory */
++      if (mem->mem_type == TTM_PL_VRAM &&
++          !(mem->placement & TTM_PL_FLAG_CONTIGUOUS))
++              return false;
+-      return end <= adev->gmc.visible_vram_size;
++      return true;
+ }
+ /*
+@@ -534,8 +549,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
+       if (r) {
+               /* Check that all memory is CPU accessible */
+-              if (!amdgpu_mem_visible(adev, old_mem) ||
+-                  !amdgpu_mem_visible(adev, new_mem)) {
++              if (!amdgpu_res_copyable(adev, old_mem) ||
++                  !amdgpu_res_copyable(adev, new_mem)) {
+                       pr_err("Move buffer fallback to memcpy unavailable\n");
+                       return r;
+               }
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+index 65ec82141a8e0..32cf6b6f6efd9 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+@@ -139,6 +139,9 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,
+ int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr,
+                                     uint64_t start);
++bool amdgpu_res_cpu_visible(struct amdgpu_device *adev,
++                          struct ttm_resource *res);
++
+ int amdgpu_ttm_init(struct amdgpu_device *adev);
+ void amdgpu_ttm_fini(struct amdgpu_device *adev);
+ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
+-- 
+2.43.0
+
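The behavioural point of the new helper: the removed amdgpu_bo_in_cpu_visible_vram() returned true as soon as any backing chunk fell inside the CPU-visible window, whereas amdgpu_res_cpu_visible() only returns true when every chunk does. A simplified, self-contained model of that invariant (the extent type and function name are hypothetical), mirroring the >= check in the hunk above:

#include <stdbool.h>

struct extent {
        unsigned long start;
        unsigned long size;
};

static bool all_extents_cpu_visible(const struct extent *e, int n,
                                    unsigned long visible_vram_size)
{
        int i;

        for (i = 0; i < n; i++) {
                /* One chunk outside the visible window fails the whole BO. */
                if (e[i].start + e[i].size >= visible_vram_size)
                        return false;
        }
        return true;
}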
diff --git a/queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch b/queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch
new file mode 100644 (file)
index 0000000..4097b8e
--- /dev/null
@@ -0,0 +1,112 @@
+From 1c9adb22902b02073346490edd1c27feb4ff0eb3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Apr 2024 15:48:21 +0200
+Subject: drm/ttm: stop pooling cached NUMA pages v2
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Christian König <ckoenig.leichtzumerken@gmail.com>
+
+[ Upstream commit b6976f323a8687cc0d55bc92c2086fd934324ed5 ]
+
+We only pool write combined and uncached allocations because they
+require extra overhead on allocation and release.
+
+If we also pool cached NUMA pages it not only means some extra unnecessary
+overhead, but also that under memory pressure it can happen that
+pages from the wrong NUMA node enter the pool and are re-used
+over and over again.
+
+This can lead to performance reduction after running into memory
+pressure.
+
+v2: restructure and cleanup the code a bit from the internal hack to
+    test this.
+
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Fixes: 4482d3c94d7f ("drm/ttm: add NUMA node id to the pool")
+CC: stable@vger.kernel.org
+Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20240415134821.1919-1-christian.koenig@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/ttm/ttm_pool.c | 38 +++++++++++++++++++++++++---------
+ 1 file changed, 28 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
+index c8ec6a2cac5d4..37c08fac7e7d0 100644
+--- a/drivers/gpu/drm/ttm/ttm_pool.c
++++ b/drivers/gpu/drm/ttm/ttm_pool.c
+@@ -287,17 +287,23 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
+                                                 enum ttm_caching caching,
+                                                 unsigned int order)
+ {
+-      if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE)
++      if (pool->use_dma_alloc)
+               return &pool->caching[caching].orders[order];
+ #ifdef CONFIG_X86
+       switch (caching) {
+       case ttm_write_combined:
++              if (pool->nid != NUMA_NO_NODE)
++                      return &pool->caching[caching].orders[order];
++
+               if (pool->use_dma32)
+                       return &global_dma32_write_combined[order];
+               return &global_write_combined[order];
+       case ttm_uncached:
++              if (pool->nid != NUMA_NO_NODE)
++                      return &pool->caching[caching].orders[order];
++
+               if (pool->use_dma32)
+                       return &global_dma32_uncached[order];
+@@ -563,11 +569,17 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
+       pool->use_dma_alloc = use_dma_alloc;
+       pool->use_dma32 = use_dma32;
+-      if (use_dma_alloc || nid != NUMA_NO_NODE) {
+-              for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
+-                      for (j = 0; j < NR_PAGE_ORDERS; ++j)
+-                              ttm_pool_type_init(&pool->caching[i].orders[j],
+-                                                 pool, i, j);
++      for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
++              for (j = 0; j < NR_PAGE_ORDERS; ++j) {
++                      struct ttm_pool_type *pt;
++
++                      /* Initialize only pool types which are actually used */
++                      pt = ttm_pool_select_type(pool, i, j);
++                      if (pt != &pool->caching[i].orders[j])
++                              continue;
++
++                      ttm_pool_type_init(pt, pool, i, j);
++              }
+       }
+ }
+ EXPORT_SYMBOL(ttm_pool_init);
+@@ -584,10 +596,16 @@ void ttm_pool_fini(struct ttm_pool *pool)
+ {
+       unsigned int i, j;
+-      if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) {
+-              for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
+-                      for (j = 0; j < NR_PAGE_ORDERS; ++j)
+-                              ttm_pool_type_fini(&pool->caching[i].orders[j]);
++      for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
++              for (j = 0; j < NR_PAGE_ORDERS; ++j) {
++                      struct ttm_pool_type *pt;
++
++                      pt = ttm_pool_select_type(pool, i, j);
++                      if (pt != &pool->caching[i].orders[j])
++                              continue;
++
++                      ttm_pool_type_fini(pt);
++              }
+       }
+       /* We removed the pool types from the LRU, but we need to also make sure
+-- 
+2.43.0
+
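The net effect on pool selection can be summarised in one place. A condensed model of the decision on the x86 path (illustrative only; the helper name is made up and the dma32 distinction is dropped):

#include <linux/types.h>
#include <drm/ttm/ttm_caching.h>

enum pool_kind { POOL_PER_DEVICE, POOL_GLOBAL, POOL_NONE };

static enum pool_kind pick_pool(bool use_dma_alloc, bool has_numa_node,
                                enum ttm_caching caching)
{
        if (use_dma_alloc)
                return POOL_PER_DEVICE;

        switch (caching) {
        case ttm_write_combined:
        case ttm_uncached:
                /* WC/UC stay pooled; NUMA-aware pools live on the device. */
                return has_numa_node ? POOL_PER_DEVICE : POOL_GLOBAL;
        default:
                /* Cached pages, NUMA or not, are no longer pooled at all. */
                return POOL_NONE;
        }
}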
diff --git a/queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch b/queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch
new file mode 100644 (file)
index 0000000..1f7d5a2
--- /dev/null
@@ -0,0 +1,135 @@
+From e53f791282b6567256f6c19d43cf78cd053ff7bb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Mar 2024 17:36:40 -0800
+Subject: KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at
+ "RESET"
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit de120e1d692d73c7eefa3278837b1eb68f90728a ]
+
+Set the enable bits for general purpose counters in IA32_PERF_GLOBAL_CTRL
+when refreshing the PMU to emulate the MSR's architecturally defined
+post-RESET behavior.  Per Intel's SDM:
+
+  IA32_PERF_GLOBAL_CTRL:  Sets bits n-1:0 and clears the upper bits.
+
+and
+
+  Where "n" is the number of general-purpose counters available in the processor.
+
+AMD also documents this behavior for PerfMonV2 CPUs in one of AMD's many
+PPRs.
+
+Do not set any PERF_GLOBAL_CTRL bits if there are no general purpose
+counters, although a literal reading of the SDM would require the CPU to
+set either bits 63:0 or 31:0.  The intent of the behavior is to globally
+enable all GP counters; honor the intent, if not the letter of the law.
+
+Leaving PERF_GLOBAL_CTRL '0' effectively breaks PMU usage in guests that
+haven't been updated to work with PMUs that support PERF_GLOBAL_CTRL.
+This bug was recently exposed when KVM added supported for AMD's
+PerfMonV2, i.e. when KVM started exposing a vPMU with PERF_GLOBAL_CTRL to
+guest software that only knew how to program v1 PMUs (that don't support
+PERF_GLOBAL_CTRL).
+
+Failure to emulate the post-RESET behavior results in such guests
+unknowingly leaving all general purpose counters globally disabled (the
+entire reason the post-RESET value sets the GP counter enable bits is to
+maintain backwards compatibility).
+
+The bug has likely gone unnoticed because PERF_GLOBAL_CTRL has been
+supported on Intel CPUs for as long as KVM has existed, i.e. hardly anyone
+is running guest software that isn't aware of PERF_GLOBAL_CTRL on Intel
+PMUs.  And because up until v6.0, KVM _did_ emulate the behavior for Intel
+CPUs, although the old behavior was likely dumb luck.
+
+Because (a) that old code was also broken in its own way (the history of
+this code is a comedy of errors), and (b) PERF_GLOBAL_CTRL was documented
+as having a value of '0' post-RESET in all SDMs before March 2023.
+
+Initial vPMU support in commit f5132b01386b ("KVM: Expose a version 2
+architectural PMU to a guests") *almost* got it right (again likely by
+dumb luck), but for some reason only set the bits if the guest PMU was
+advertised as v1:
+
+        if (pmu->version == 1) {
+                pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
+                return;
+        }
+
+Commit f19a0c2c2e6a ("KVM: PMU emulation: GLOBAL_CTRL MSR should be
+enabled on reset") then tried to remedy that goof, presumably because
+guest PMUs were leaving PERF_GLOBAL_CTRL '0', i.e. weren't enabling
+counters.
+
+        pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+                (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED);
+        pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+That was KVM's behavior up until commit c49467a45fe0 ("KVM: x86/pmu:
+Don't overwrite the pmu->global_ctrl when refreshing") removed
+*everything*.  However, it did so based on the behavior defined by the
+SDM, which at the time stated that "Global Perf Counter Controls" is
+'0' at Power-Up and RESET.
+
+But then the March 2023 SDM (325462-079US), stealthily changed its
+"IA-32 and Intel 64 Processor States Following Power-up, Reset, or INIT"
+table to say:
+
+  IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits.
+
+Note, kvm_pmu_refresh() can be invoked multiple times, i.e. it's not a
+"pure" RESET flow.  But it can only be called prior to the first KVM_RUN,
+i.e. the guest will only ever observe the final value.
+
+Note #2, KVM has always cleared global_ctrl during refresh (see commit
+f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")),
+i.e. there is no danger of breaking existing setups by clobbering a value
+set by userspace.
+
+Reported-by: Babu Moger <babu.moger@amd.com>
+Cc: Sandipan Das <sandipan.das@amd.com>
+Cc: Like Xu <like.xu.linux@gmail.com>
+Cc: Mingwei Zhang <mizhang@google.com>
+Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Tested-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Link: https://lore.kernel.org/r/20240309013641.1413400-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/pmu.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
+index fa6f5cd70d4c8..da2d82e3a8735 100644
+--- a/arch/x86/kvm/pmu.c
++++ b/arch/x86/kvm/pmu.c
+@@ -716,8 +716,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+       pmu->pebs_data_cfg_mask = ~0ull;
+       bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+-      if (vcpu->kvm->arch.enable_pmu)
+-              static_call(kvm_x86_pmu_refresh)(vcpu);
++      if (!vcpu->kvm->arch.enable_pmu)
++              return;
++
++      static_call(kvm_x86_pmu_refresh)(vcpu);
++
++      /*
++       * At RESET, both Intel and AMD CPUs set all enable bits for general
++       * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
++       * was written for v1 PMUs don't unknowingly leave GP counters disabled
++       * in the global controls).  Emulate that behavior when refreshing the
++       * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
++       */
++      if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
++              pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
+ }
+ void kvm_pmu_init(struct kvm_vcpu *vcpu)
+-- 
+2.43.0
+
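The emulated post-RESET value is simply a mask of the low n bits, where n is the number of general-purpose counters. A worked example (not taken from the patch; the function name is hypothetical):

#include <linux/bits.h>
#include <linux/types.h>

static u64 reset_global_ctrl(unsigned int nr_gp_counters)
{
        /* No GP counters: honor the intent and leave the MSR at 0. */
        if (!nr_gp_counters)
                return 0;

        /* Bits n-1:0 set, upper bits clear: 6 counters -> 0x3f, 8 -> 0xff. */
        return GENMASK_ULL(nr_gp_counters - 1, 0);
}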
diff --git a/queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch b/queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch
new file mode 100644 (file)
index 0000000..4c4e17c
--- /dev/null
@@ -0,0 +1,109 @@
+From ca1df8e3df9e6041412863834209ce5cdef70af9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Nov 2023 18:28:48 -0800
+Subject: KVM: x86/pmu: Zero out PMU metadata on AMD if PMU is disabled
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit f933b88e20150f15787390e2a1754a7e412754ed ]
+
+Move the purging of common PMU metadata from intel_pmu_refresh() to
+kvm_pmu_refresh(), and invoke the vendor refresh() hook if and only if
+the VM is supposed to have a vPMU.
+
+KVM already denies access to the PMU based on kvm->arch.enable_pmu, as
+get_gp_pmc_amd() returns NULL for all PMCs in that case, i.e. KVM already
+violates AMD's architecture by not virtualizing a PMU (kernels have long
+since learned to not panic when the PMU is unavailable).  But configuring
+the PMU as if it were enabled causes unwanted side effects, e.g. calls to
+kvm_pmu_trigger_event() waste an absurd number of cycles due to the
+all_valid_pmc_idx bitmap being non-zero.
+
+Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization")
+Reported-by: Konstantin Khorenko <khorenko@virtuozzo.com>
+Closes: https://lore.kernel.org/all/20231109180646.2963718-2-khorenko@virtuozzo.com
+Link: https://lore.kernel.org/r/20231110022857.1273836-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Stable-dep-of: de120e1d692d ("KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at "RESET"")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/pmu.c           | 20 ++++++++++++++++++--
+ arch/x86/kvm/vmx/pmu_intel.c | 16 ++--------------
+ 2 files changed, 20 insertions(+), 16 deletions(-)
+
+diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
+index dc8e8e907cfbf..fa6f5cd70d4c8 100644
+--- a/arch/x86/kvm/pmu.c
++++ b/arch/x86/kvm/pmu.c
+@@ -691,6 +691,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+  */
+ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+ {
++      struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
++
+       if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+               return;
+@@ -700,8 +702,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+        */
+       kvm_pmu_reset(vcpu);
+-      bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+-      static_call(kvm_x86_pmu_refresh)(vcpu);
++      pmu->version = 0;
++      pmu->nr_arch_gp_counters = 0;
++      pmu->nr_arch_fixed_counters = 0;
++      pmu->counter_bitmask[KVM_PMC_GP] = 0;
++      pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
++      pmu->reserved_bits = 0xffffffff00200000ull;
++      pmu->raw_event_mask = X86_RAW_EVENT_MASK;
++      pmu->global_ctrl_mask = ~0ull;
++      pmu->global_status_mask = ~0ull;
++      pmu->fixed_ctr_ctrl_mask = ~0ull;
++      pmu->pebs_enable_mask = ~0ull;
++      pmu->pebs_data_cfg_mask = ~0ull;
++      bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
++
++      if (vcpu->kvm->arch.enable_pmu)
++              static_call(kvm_x86_pmu_refresh)(vcpu);
+ }
+ void kvm_pmu_init(struct kvm_vcpu *vcpu)
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 1549461fa42b7..48a2f77f62ef3 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -493,19 +493,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+       u64 counter_mask;
+       int i;
+-      pmu->nr_arch_gp_counters = 0;
+-      pmu->nr_arch_fixed_counters = 0;
+-      pmu->counter_bitmask[KVM_PMC_GP] = 0;
+-      pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+-      pmu->version = 0;
+-      pmu->reserved_bits = 0xffffffff00200000ull;
+-      pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+-      pmu->global_ctrl_mask = ~0ull;
+-      pmu->global_status_mask = ~0ull;
+-      pmu->fixed_ctr_ctrl_mask = ~0ull;
+-      pmu->pebs_enable_mask = ~0ull;
+-      pmu->pebs_data_cfg_mask = ~0ull;
+-
+       memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+       /*
+@@ -517,8 +504,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+               return;
+       entry = kvm_find_cpuid_entry(vcpu, 0xa);
+-      if (!entry || !vcpu->kvm->arch.enable_pmu)
++      if (!entry)
+               return;
++
+       eax.full = entry->eax;
+       edx.full = entry->edx;
+-- 
+2.43.0
+
diff --git a/queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch b/queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch
new file mode 100644 (file)
index 0000000..112b4c4
--- /dev/null
@@ -0,0 +1,79 @@
+From d64d15556b61253f6109226a4c039ace736a218f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 Oct 2023 00:14:52 +0100
+Subject: mm/gup: explicitly define and check internal GUP flags, disallow
+ FOLL_TOUCH
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+[ Upstream commit 0f20bba1688bdf3b32df0162511a67d4eda15790 ]
+
+Rather than open-coding a list of internal GUP flags in
+is_valid_gup_args(), define which ones are internal.
+
+In addition, explicitly check to see if the user passed in FOLL_TOUCH
+somehow, as this appears to have been accidentally excluded.
+
+Link: https://lkml.kernel.org/r/971e013dfe20915612ea8b704e801d7aef9a66b6.1696288092.git.lstoakes@gmail.com
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Reviewed-by: Arnd Bergmann <arnd@arndb.de>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
+Cc: Adrian Hunter <adrian.hunter@intel.com>
+Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Ian Rogers <irogers@google.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jiri Olsa <jolsa@kernel.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Richard Cochran <richardcochran@gmail.com>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 631426ba1d45 ("mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/gup.c      | 5 ++---
+ mm/internal.h | 3 +++
+ 2 files changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/mm/gup.c b/mm/gup.c
+index 2f8a2d89fde19..b21b33d1787e1 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -2227,12 +2227,11 @@ static bool is_valid_gup_args(struct page **pages, int *locked,
+       /*
+        * These flags not allowed to be specified externally to the gup
+        * interfaces:
+-       * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
++       * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
+        * - FOLL_REMOTE is internal only and used on follow_page()
+        * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
+        */
+-      if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
+-                                    FOLL_REMOTE | FOLL_FAST_ONLY)))
++      if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
+               return false;
+       gup_flags |= to_set;
+diff --git a/mm/internal.h b/mm/internal.h
+index 30cf724ddbce3..50cf76d30a88f 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -964,6 +964,9 @@ enum {
+       FOLL_UNLOCKABLE = 1 << 21,
+ };
++#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
++                          FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
++
+ /*
+  * Indicates for which pages that are write-protected in the page table,
+  * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
+-- 
+2.43.0
+
diff --git a/queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch b/queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch
new file mode 100644 (file)
index 0000000..2b15ba1
--- /dev/null
@@ -0,0 +1,244 @@
+From 7d3472c1c4d4a10f6720d5894d2efd29400d4ea4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Mar 2024 17:12:59 +0100
+Subject: mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY
+ properly
+
+From: David Hildenbrand <david@redhat.com>
+
+[ Upstream commit 631426ba1d45a8672b177ee85ad4cabe760dd131 ]
+
+Darrick reports that in some cases where pread() would fail with -EIO and
+mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ /
+MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT.
+
+While the madvise() call can be interrupted by a signal, this is not the
+desired behavior.  MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave
+like page faults in that case: fail and not retry forever.
+
+A reproducer can be found at [1].
+
+The reason is that __get_user_pages(), as called by
+faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way:
+it will simply return 0 when VM_FAULT_RETRY happened, making
+madvise_populate()->faultin_vma_page_range() retry again and again, never
+setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages().
+
+__get_user_pages_locked() does what we want, but duplicating that logic in
+faultin_vma_page_range() feels wrong.
+
+So let's use __get_user_pages_locked() instead, that will detect
+VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler
+return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT
+from faultin_page() to __get_user_pages(), all the way to
+madvise_populate().
+
+But, there is an issue: __get_user_pages_locked() will end up re-taking
+the MM lock and then __get_user_pages() will do another VMA lookup.  In
+the meantime, the VMA layout could have changed and we'd fail with
+different error codes than we'd want to.
+
+As __get_user_pages() will currently do a new VMA lookup either way, let
+it do the VMA handling in a different way, controlled by a new
+FOLL_MADV_POPULATE flag, effectively moving these checks from
+madvise_populate() + faultin_page_range() in there.
+
+With this change, Darrick's reproducer properly fails with -EFAULT, as
+documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE.
+
+[1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/
+
+Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com
+Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com
+Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reported-by: Darrick J. Wong <djwong@kernel.org>
+Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/
+Cc: Darrick J. Wong <djwong@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Jason Gunthorpe <jgg@nvidia.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/gup.c      | 54 ++++++++++++++++++++++++++++++---------------------
+ mm/internal.h | 10 ++++++----
+ mm/madvise.c  | 17 ++--------------
+ 3 files changed, 40 insertions(+), 41 deletions(-)
+
+diff --git a/mm/gup.c b/mm/gup.c
+index b21b33d1787e1..cfc0a66d951b9 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1204,6 +1204,22 @@ static long __get_user_pages(struct mm_struct *mm,
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
++                      /*
++                       * MADV_POPULATE_(READ|WRITE) wants to handle VMA
++                       * lookups+error reporting differently.
++                       */
++                      if (gup_flags & FOLL_MADV_POPULATE) {
++                              vma = vma_lookup(mm, start);
++                              if (!vma) {
++                                      ret = -ENOMEM;
++                                      goto out;
++                              }
++                              if (check_vma_flags(vma, gup_flags)) {
++                                      ret = -EINVAL;
++                                      goto out;
++                              }
++                              goto retry;
++                      }
+                       vma = gup_vma_lookup(mm, start);
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+@@ -1670,35 +1686,35 @@ long populate_vma_page_range(struct vm_area_struct *vma,
+ }
+ /*
+- * faultin_vma_page_range() - populate (prefault) page tables inside the
+- *                          given VMA range readable/writable
++ * faultin_page_range() - populate (prefault) page tables inside the
++ *                      given range readable/writable
+  *
+  * This takes care of mlocking the pages, too, if VM_LOCKED is set.
+  *
+- * @vma: target vma
++ * @mm: the mm to populate page tables in
+  * @start: start address
+  * @end: end address
+  * @write: whether to prefault readable or writable
+  * @locked: whether the mmap_lock is still held
+  *
+- * Returns either number of processed pages in the vma, or a negative error
+- * code on error (see __get_user_pages()).
++ * Returns either number of processed pages in the MM, or a negative error
++ * code on error (see __get_user_pages()). Note that this function reports
++ * errors related to VMAs, such as incompatible mappings, as expected by
++ * MADV_POPULATE_(READ|WRITE).
+  *
+- * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
+- * covered by the VMA. If it's released, *@locked will be set to 0.
++ * The range must be page-aligned.
++ *
++ * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
+  */
+-long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+-                          unsigned long end, bool write, int *locked)
++long faultin_page_range(struct mm_struct *mm, unsigned long start,
++                      unsigned long end, bool write, int *locked)
+ {
+-      struct mm_struct *mm = vma->vm_mm;
+       unsigned long nr_pages = (end - start) / PAGE_SIZE;
+       int gup_flags;
+       long ret;
+       VM_BUG_ON(!PAGE_ALIGNED(start));
+       VM_BUG_ON(!PAGE_ALIGNED(end));
+-      VM_BUG_ON_VMA(start < vma->vm_start, vma);
+-      VM_BUG_ON_VMA(end > vma->vm_end, vma);
+       mmap_assert_locked(mm);
+       /*
+@@ -1710,19 +1726,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+        *                a poisoned page.
+        * !FOLL_FORCE: Require proper access permissions.
+        */
+-      gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
++      gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
++                  FOLL_MADV_POPULATE;
+       if (write)
+               gup_flags |= FOLL_WRITE;
+-      /*
+-       * We want to report -EINVAL instead of -EFAULT for any permission
+-       * problems or incompatible mappings.
+-       */
+-      if (check_vma_flags(vma, gup_flags))
+-              return -EINVAL;
+-
+-      ret = __get_user_pages(mm, start, nr_pages, gup_flags,
+-                             NULL, locked);
++      ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
++                                    gup_flags);
+       lru_add_drain();
+       return ret;
+ }
+diff --git a/mm/internal.h b/mm/internal.h
+index 50cf76d30a88f..abed947f784b7 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -581,9 +581,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio);
+ void unmap_mapping_folio(struct folio *folio);
+ extern long populate_vma_page_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end, int *locked);
+-extern long faultin_vma_page_range(struct vm_area_struct *vma,
+-                                 unsigned long start, unsigned long end,
+-                                 bool write, int *locked);
++extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
++              unsigned long end, bool write, int *locked);
+ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+                              unsigned long bytes);
+ /*
+@@ -962,10 +961,13 @@ enum {
+       FOLL_FAST_ONLY = 1 << 20,
+       /* allow unlocking the mmap lock */
+       FOLL_UNLOCKABLE = 1 << 21,
++      /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
++      FOLL_MADV_POPULATE = 1 << 22,
+ };
+ #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
+-                          FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
++                          FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
++                          FOLL_MADV_POPULATE)
+ /*
+  * Indicates for which pages that are write-protected in the page table,
+diff --git a/mm/madvise.c b/mm/madvise.c
+index 4dded5d27e7ea..98fdb9288a68a 100644
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -917,27 +917,14 @@ static long madvise_populate(struct vm_area_struct *vma,
+ {
+       const bool write = behavior == MADV_POPULATE_WRITE;
+       struct mm_struct *mm = vma->vm_mm;
+-      unsigned long tmp_end;
+       int locked = 1;
+       long pages;
+       *prev = vma;
+       while (start < end) {
+-              /*
+-               * We might have temporarily dropped the lock. For example,
+-               * our VMA might have been split.
+-               */
+-              if (!vma || start >= vma->vm_end) {
+-                      vma = vma_lookup(mm, start);
+-                      if (!vma)
+-                              return -ENOMEM;
+-              }
+-
+-              tmp_end = min_t(unsigned long, end, vma->vm_end);
+               /* Populate (prefault) page tables readable/writable. */
+-              pages = faultin_vma_page_range(vma, start, tmp_end, write,
+-                                             &locked);
++              pages = faultin_page_range(mm, start, end, write, &locked);
+               if (!locked) {
+                       mmap_read_lock(mm);
+                       locked = 1;
+@@ -958,7 +945,7 @@ static long madvise_populate(struct vm_area_struct *vma,
+                               pr_warn_once("%s: unhandled return value: %ld\n",
+                                            __func__, pages);
+                               fallthrough;
+-                      case -ENOMEM:
++                      case -ENOMEM: /* No VMA or out of memory. */
+                               return -ENOMEM;
+                       }
+               }
+-- 
+2.43.0
+
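From userspace the fix is visible purely in the return value: MADV_POPULATE_READ / MADV_POPULATE_WRITE now fail with -EFAULT when the backing store raises an error, instead of retrying forever. A small sketch of a caller relying on the documented behaviour (not part of the patch):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ 22   /* available since Linux 5.14 */
#endif

static int prefault_readable(void *addr, size_t len)
{
        if (madvise(addr, len, MADV_POPULATE_READ) == 0)
                return 0;

        if (errno == EFAULT)
                fprintf(stderr, "prefault failed: backing store error\n");
        else
                perror("madvise(MADV_POPULATE_READ)");
        return -1;
}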
diff --git a/queue-6.6/mm-treewide-introduce-nr_page_orders.patch b/queue-6.6/mm-treewide-introduce-nr_page_orders.patch
new file mode 100644 (file)
index 0000000..e6c4728
--- /dev/null
@@ -0,0 +1,441 @@
+From c781c72f054614cc0c84bd00d0e44ed387a86b8c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Dec 2023 17:47:03 +0300
+Subject: mm, treewide: introduce NR_PAGE_ORDERS
+
+From: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+
+[ Upstream commit fd37721803c6e73619108f76ad2e12a9aa5fafaf ]
+
+NR_PAGE_ORDERS defines the number of page orders supported by the page
+allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total.
+
+NR_PAGE_ORDERS assists in defining arrays of page orders and allows for
+more natural iteration over them.
+
+[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning]
+  Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box
+Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: b6976f323a86 ("drm/ttm: stop pooling cached NUMA pages v2")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../admin-guide/kdump/vmcoreinfo.rst          |  6 +++---
+ arch/arm64/kvm/hyp/include/nvhe/gfp.h         |  2 +-
+ arch/sparc/kernel/traps_64.c                  |  2 +-
+ drivers/gpu/drm/ttm/tests/ttm_device_test.c   |  2 +-
+ drivers/gpu/drm/ttm/ttm_pool.c                | 20 +++++++++----------
+ include/drm/ttm/ttm_pool.h                    |  2 +-
+ include/linux/mmzone.h                        |  6 ++++--
+ kernel/crash_core.c                           |  2 +-
+ lib/test_meminit.c                            |  2 +-
+ mm/compaction.c                               |  2 +-
+ mm/kmsan/init.c                               |  2 +-
+ mm/page_alloc.c                               | 13 ++++++------
+ mm/page_reporting.c                           |  2 +-
+ mm/show_mem.c                                 |  8 ++++----
+ mm/vmstat.c                                   | 12 +++++------
+ 15 files changed, 42 insertions(+), 41 deletions(-)
+
+diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
+index 599e8d3bcbc31..9235cf4fbabff 100644
+--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
++++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
+@@ -172,7 +172,7 @@ variables.
+ Offset of the free_list's member. This value is used to compute the number
+ of free pages.
+-Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
++Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS].
+ The free_list represents a linked list of free page blocks.
+ (list_head, next|prev)
+@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
+ information. Makedumpfile gets the start address of the vmalloc region
+ from this.
+-(zone.free_area, MAX_ORDER + 1)
+--------------------------------
++(zone.free_area, NR_PAGE_ORDERS)
++--------------------------------
+ Free areas descriptor. User-space tools use this value to iterate the
+ free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+index fe5472a184a37..97c527ef53c2a 100644
+--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
++++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+@@ -16,7 +16,7 @@ struct hyp_pool {
+        * API at EL2.
+        */
+       hyp_spinlock_t lock;
+-      struct list_head free_area[MAX_ORDER + 1];
++      struct list_head free_area[NR_PAGE_ORDERS];
+       phys_addr_t range_start;
+       phys_addr_t range_end;
+       unsigned short max_order;
+diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
+index 08ffd17d5ec34..523a6e5ee9251 100644
+--- a/arch/sparc/kernel/traps_64.c
++++ b/arch/sparc/kernel/traps_64.c
+@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
+       /* Now allocate error trap reporting scoreboard. */
+       sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info));
+-      for (order = 0; order <= MAX_ORDER; order++) {
++      for (order = 0; order < NR_PAGE_ORDERS; order++) {
+               if ((PAGE_SIZE << order) >= sz)
+                       break;
+       }
+diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
+index b1b423b68cdf1..19eaff22e6ae0 100644
+--- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c
++++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
+@@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test)
+       if (params->pools_init_expected) {
+               for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
+-                      for (int j = 0; j <= MAX_ORDER; ++j) {
++                      for (int j = 0; j < NR_PAGE_ORDERS; ++j) {
+                               pt = pool->caching[i].orders[j];
+                               KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool);
+                               KUNIT_EXPECT_EQ(test, pt.caching, i);
+diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
+index 9b60222511d65..c8ec6a2cac5d4 100644
+--- a/drivers/gpu/drm/ttm/ttm_pool.c
++++ b/drivers/gpu/drm/ttm/ttm_pool.c
+@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
+ static atomic_long_t allocated_pages;
+-static struct ttm_pool_type global_write_combined[MAX_ORDER + 1];
+-static struct ttm_pool_type global_uncached[MAX_ORDER + 1];
++static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
++static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
+-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1];
+-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
++static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS];
++static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS];
+ static spinlock_t shrinker_lock;
+ static struct list_head shrinker_list;
+@@ -565,7 +565,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
+       if (use_dma_alloc || nid != NUMA_NO_NODE) {
+               for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
+-                      for (j = 0; j <= MAX_ORDER; ++j)
++                      for (j = 0; j < NR_PAGE_ORDERS; ++j)
+                               ttm_pool_type_init(&pool->caching[i].orders[j],
+                                                  pool, i, j);
+       }
+@@ -586,7 +586,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
+       if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) {
+               for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
+-                      for (j = 0; j <= MAX_ORDER; ++j)
++                      for (j = 0; j < NR_PAGE_ORDERS; ++j)
+                               ttm_pool_type_fini(&pool->caching[i].orders[j]);
+       }
+@@ -641,7 +641,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
+       unsigned int i;
+       seq_puts(m, "\t ");
+-      for (i = 0; i <= MAX_ORDER; ++i)
++      for (i = 0; i < NR_PAGE_ORDERS; ++i)
+               seq_printf(m, " ---%2u---", i);
+       seq_puts(m, "\n");
+ }
+@@ -652,7 +652,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
+ {
+       unsigned int i;
+-      for (i = 0; i <= MAX_ORDER; ++i)
++      for (i = 0; i < NR_PAGE_ORDERS; ++i)
+               seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
+       seq_puts(m, "\n");
+ }
+@@ -761,7 +761,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
+       spin_lock_init(&shrinker_lock);
+       INIT_LIST_HEAD(&shrinker_list);
+-      for (i = 0; i <= MAX_ORDER; ++i) {
++      for (i = 0; i < NR_PAGE_ORDERS; ++i) {
+               ttm_pool_type_init(&global_write_combined[i], NULL,
+                                  ttm_write_combined, i);
+               ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
+@@ -794,7 +794,7 @@ void ttm_pool_mgr_fini(void)
+ {
+       unsigned int i;
+-      for (i = 0; i <= MAX_ORDER; ++i) {
++      for (i = 0; i < NR_PAGE_ORDERS; ++i) {
+               ttm_pool_type_fini(&global_write_combined[i]);
+               ttm_pool_type_fini(&global_uncached[i]);
+diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
+index 30a347e5aa114..4490d43c63e33 100644
+--- a/include/drm/ttm/ttm_pool.h
++++ b/include/drm/ttm/ttm_pool.h
+@@ -74,7 +74,7 @@ struct ttm_pool {
+       bool use_dma32;
+       struct {
+-              struct ttm_pool_type orders[MAX_ORDER + 1];
++              struct ttm_pool_type orders[NR_PAGE_ORDERS];
+       } caching[TTM_NUM_CACHING_TYPES];
+ };
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 0f62786269d0c..1acbc6ce1fe43 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -34,6 +34,8 @@
+ #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
++#define NR_PAGE_ORDERS (MAX_ORDER + 1)
++
+ /*
+  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
+  * costly to service.  That is between allocation orders which should
+@@ -95,7 +97,7 @@ static inline bool migratetype_is_mergeable(int mt)
+ }
+ #define for_each_migratetype_order(order, type) \
+-      for (order = 0; order <= MAX_ORDER; order++) \
++      for (order = 0; order < NR_PAGE_ORDERS; order++) \
+               for (type = 0; type < MIGRATE_TYPES; type++)
+ extern int page_group_by_mobility_disabled;
+@@ -929,7 +931,7 @@ struct zone {
+       CACHELINE_PADDING(_pad1_);
+       /* free areas of different sizes */
+-      struct free_area        free_area[MAX_ORDER + 1];
++      struct free_area        free_area[NR_PAGE_ORDERS];
+ #ifdef CONFIG_UNACCEPTED_MEMORY
+       /* Pages to be accepted. All pages on the list are MAX_ORDER */
+diff --git a/kernel/crash_core.c b/kernel/crash_core.c
+index 2f675ef045d40..b685e94605841 100644
+--- a/kernel/crash_core.c
++++ b/kernel/crash_core.c
+@@ -660,7 +660,7 @@ static int __init crash_save_vmcoreinfo_init(void)
+       VMCOREINFO_OFFSET(list_head, prev);
+       VMCOREINFO_OFFSET(vmap_area, va_start);
+       VMCOREINFO_OFFSET(vmap_area, list);
+-      VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
++      VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
+       log_buf_vmcoreinfo_setup();
+       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+       VMCOREINFO_NUMBER(NR_FREE_PAGES);
+diff --git a/lib/test_meminit.c b/lib/test_meminit.c
+index 0ae35223d7733..0dc173849a542 100644
+--- a/lib/test_meminit.c
++++ b/lib/test_meminit.c
+@@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures)
+       int failures = 0, num_tests = 0;
+       int i;
+-      for (i = 0; i <= MAX_ORDER; i++)
++      for (i = 0; i < NR_PAGE_ORDERS; i++)
+               num_tests += do_alloc_pages_order(i, &failures);
+       REPORT_FAILURES_IN_FN();
+diff --git a/mm/compaction.c b/mm/compaction.c
+index 5a3c644c978e2..61c741f11e9bb 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -2225,7 +2225,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
+       /* Direct compactor: Is a suitable page free? */
+       ret = COMPACT_NO_SUITABLE_PAGE;
+-      for (order = cc->order; order <= MAX_ORDER; order++) {
++      for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
+               struct free_area *area = &cc->zone->free_area[order];
+               bool can_steal;
+diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
+index ffedf4dbc49d7..103e2e88ea033 100644
+--- a/mm/kmsan/init.c
++++ b/mm/kmsan/init.c
+@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void)
+ struct metadata_page_pair {
+       struct page *shadow, *origin;
+ };
+-static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
++static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata;
+ /*
+  * Eager metadata allocation. When the memblock allocator is freeing pages to
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index ab71417350127..6b4c30fcae1c9 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1570,7 +1570,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+       struct page *page;
+       /* Find a page of the appropriate size in the preferred list */
+-      for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
++      for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
+               area = &(zone->free_area[current_order]);
+               page = get_page_from_free_area(area, migratetype);
+               if (!page)
+@@ -1940,7 +1940,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+                       continue;
+               spin_lock_irqsave(&zone->lock, flags);
+-              for (order = 0; order <= MAX_ORDER; order++) {
++              for (order = 0; order < NR_PAGE_ORDERS; order++) {
+                       struct free_area *area = &(zone->free_area[order]);
+                       page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+@@ -2050,8 +2050,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+       return false;
+ find_smallest:
+-      for (current_order = order; current_order <= MAX_ORDER;
+-                                                      current_order++) {
++      for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
+               area = &(zone->free_area[current_order]);
+               fallback_mt = find_suitable_fallback(area, current_order,
+                               start_migratetype, false, &can_steal);
+@@ -2884,7 +2883,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+               return true;
+       /* For a high-order request, check at least one suitable page is free */
+-      for (o = order; o <= MAX_ORDER; o++) {
++      for (o = order; o < NR_PAGE_ORDERS; o++) {
+               struct free_area *area = &z->free_area[o];
+               int mt;
+@@ -6442,7 +6441,7 @@ bool is_free_buddy_page(struct page *page)
+       unsigned long pfn = page_to_pfn(page);
+       unsigned int order;
+-      for (order = 0; order <= MAX_ORDER; order++) {
++      for (order = 0; order < NR_PAGE_ORDERS; order++) {
+               struct page *page_head = page - (pfn & ((1 << order) - 1));
+               if (PageBuddy(page_head) &&
+@@ -6501,7 +6500,7 @@ bool take_page_off_buddy(struct page *page)
+       bool ret = false;
+       spin_lock_irqsave(&zone->lock, flags);
+-      for (order = 0; order <= MAX_ORDER; order++) {
++      for (order = 0; order < NR_PAGE_ORDERS; order++) {
+               struct page *page_head = page - (pfn & ((1 << order) - 1));
+               int page_order = buddy_order(page_head);
+diff --git a/mm/page_reporting.c b/mm/page_reporting.c
+index b021f482a4cb3..66369cc5279bf 100644
+--- a/mm/page_reporting.c
++++ b/mm/page_reporting.c
+@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
+               return err;
+       /* Process each free list starting from lowest order/mt */
+-      for (order = page_reporting_order; order <= MAX_ORDER; order++) {
++      for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
+               for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+                       /* We do not pull pages from the isolate free list */
+                       if (is_migrate_isolate(mt))
+diff --git a/mm/show_mem.c b/mm/show_mem.c
+index 4b888b18bddea..b896e54e3a26c 100644
+--- a/mm/show_mem.c
++++ b/mm/show_mem.c
+@@ -355,8 +355,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
+       for_each_populated_zone(zone) {
+               unsigned int order;
+-              unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+-              unsigned char types[MAX_ORDER + 1];
++              unsigned long nr[NR_PAGE_ORDERS], flags, total = 0;
++              unsigned char types[NR_PAGE_ORDERS];
+               if (zone_idx(zone) > max_zone_idx)
+                       continue;
+@@ -366,7 +366,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
+               printk(KERN_CONT "%s: ", zone->name);
+               spin_lock_irqsave(&zone->lock, flags);
+-              for (order = 0; order <= MAX_ORDER; order++) {
++              for (order = 0; order < NR_PAGE_ORDERS; order++) {
+                       struct free_area *area = &zone->free_area[order];
+                       int type;
+@@ -380,7 +380,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
+                       }
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+-              for (order = 0; order <= MAX_ORDER; order++) {
++              for (order = 0; order < NR_PAGE_ORDERS; order++) {
+                       printk(KERN_CONT "%lu*%lukB ",
+                              nr[order], K(1UL) << order);
+                       if (nr[order])
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 00e81e99c6ee2..e9616c4ca12db 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone,
+       info->free_blocks_total = 0;
+       info->free_blocks_suitable = 0;
+-      for (order = 0; order <= MAX_ORDER; order++) {
++      for (order = 0; order < NR_PAGE_ORDERS; order++) {
+               unsigned long blocks;
+               /*
+@@ -1471,7 +1471,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
+       int order;
+       seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+-      for (order = 0; order <= MAX_ORDER; ++order)
++      for (order = 0; order < NR_PAGE_ORDERS; ++order)
+               /*
+                * Access to nr_free is lockless as nr_free is used only for
+                * printing purposes. Use data_race to avoid KCSAN warning.
+@@ -1500,7 +1500,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
+                                       pgdat->node_id,
+                                       zone->name,
+                                       migratetype_names[mtype]);
+-              for (order = 0; order <= MAX_ORDER; ++order) {
++              for (order = 0; order < NR_PAGE_ORDERS; ++order) {
+                       unsigned long freecount = 0;
+                       struct free_area *area;
+                       struct list_head *curr;
+@@ -1540,7 +1540,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
+       /* Print header */
+       seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
+-      for (order = 0; order <= MAX_ORDER; ++order)
++      for (order = 0; order < NR_PAGE_ORDERS; ++order)
+               seq_printf(m, "%6d ", order);
+       seq_putc(m, '\n');
+@@ -2176,7 +2176,7 @@ static void unusable_show_print(struct seq_file *m,
+       seq_printf(m, "Node %d, zone %8s ",
+                               pgdat->node_id,
+                               zone->name);
+-      for (order = 0; order <= MAX_ORDER; ++order) {
++      for (order = 0; order < NR_PAGE_ORDERS; ++order) {
+               fill_contig_page_info(zone, order, &info);
+               index = unusable_free_index(order, &info);
+               seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+@@ -2228,7 +2228,7 @@ static void extfrag_show_print(struct seq_file *m,
+       seq_printf(m, "Node %d, zone %8s ",
+                               pgdat->node_id,
+                               zone->name);
+-      for (order = 0; order <= MAX_ORDER; ++order) {
++      for (order = 0; order < NR_PAGE_ORDERS; ++order) {
+               fill_contig_page_info(zone, order, &info);
+               index = __fragmentation_index(order, &info);
+               seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
+-- 
+2.43.0
+
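The patch above sizes every per-order array with NR_PAGE_ORDERS and rewrites every "order <= MAX_ORDER" loop bound as "order < NR_PAGE_ORDERS"; the two forms are equivalent because the define added to mmzone.h is MAX_ORDER + 1. A minimal standalone sketch of the idiom follows; the EXAMPLE_* names are hypothetical and are not part of the queued patch or the kernel.

/* Illustrative only: NR_PAGE_ORDERS-style counting of orders 0..MAX_ORDER. */
#define EXAMPLE_MAX_ORDER	10
#define EXAMPLE_NR_PAGE_ORDERS	(EXAMPLE_MAX_ORDER + 1)

static unsigned long example_free_count[EXAMPLE_NR_PAGE_ORDERS];

static unsigned long example_total_free(void)
{
	unsigned long total = 0;
	unsigned int order;

	/* Same iteration space as "order <= EXAMPLE_MAX_ORDER". */
	for (order = 0; order < EXAMPLE_NR_PAGE_ORDERS; order++)
		total += example_free_count[order];

	return total;
}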
diff --git a/queue-6.6/series b/queue-6.6/series
index 7a1566dbe4764459a4b630d57a440d75b352f7e8..6788759bebb82251388a6fe72a8279009a991d36 100644 (file)
@@ -93,3 +93,14 @@ ice-fix-lag-and-vf-lock-dependency-in-ice_reset_vf.patch
 net-ethernet-ti-am65-cpts-fix-ptpv1-message-type-on-.patch
 tls-fix-lockless-read-of-strp-msg_ready-in-poll.patch
 af_unix-suppress-false-positive-lockdep-splat-for-sp.patch
+kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch
+kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch
+mm-gup-explicitly-define-and-check-internal-gup-flag.patch
+mm-madvise-make-madv_populate_-read-write-handle-vm_.patch
+drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch
+drm-amdgpu-add-shared-fdinfo-stats.patch
+drm-amdgpu-fix-visible-vram-handling-during-faults.patch
+mm-treewide-introduce-nr_page_orders.patch
+drm-ttm-stop-pooling-cached-numa-pages-v2.patch
+squashfs-convert-to-new-timestamp-accessors.patch
+squashfs-check-the-inode-number-is-not-the-invalid-v.patch
diff --git a/queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch b/queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch
new file mode 100644 (file)
index 0000000..0e277d3
--- /dev/null
@@ -0,0 +1,72 @@
+From 2c44ece9d52e9e48b8d9813cb44beb104cf8e60b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Apr 2024 23:02:06 +0100
+Subject: Squashfs: check the inode number is not the invalid value of zero
+
+From: Phillip Lougher <phillip@squashfs.org.uk>
+
+[ Upstream commit 9253c54e01b6505d348afbc02abaa4d9f8a01395 ]
+
+Syskiller has produced an out of bounds access in fill_meta_index().
+
+That out of bounds access ultimately occurs because the inode has an
+inode number with the invalid value of zero, which was not checked.
+
+The reason this causes the out of bounds access is the following
+sequence of events:
+
+1. Fill_meta_index() is called to allocate (via empty_meta_index())
+   and fill a metadata index.  It however suffers a data read error
+   and aborts, invalidating the newly returned empty metadata index.
+   It does this by setting the inode number of the index to zero,
+   which means unused (zero is not a valid inode number).
+
+2. When fill_meta_index() is subsequently called again on another
+   read operation, locate_meta_index() returns the previous index
+   because it matches the inode number of 0.  Because this index has
+   been returned, it is expected to have been filled; since it has
+   not been, an out of bounds access is performed.
+
+This patch adds a sanity check that the inode number is not zero
+when the inode is created, returning -EINVAL if it is.
+
+[phillip@squashfs.org.uk: whitespace fix]
+  Link: https://lkml.kernel.org/r/20240409204723.446925-1-phillip@squashfs.org.uk
+Link: https://lkml.kernel.org/r/20240408220206.435788-1-phillip@squashfs.org.uk
+Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
+Reported-by: "Ubisectech Sirius" <bugreport@ubisectech.com>
+Closes: https://lore.kernel.org/lkml/87f5c007-b8a5-41ae-8b57-431e924c5915.bugreport@ubisectech.com/
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/squashfs/inode.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
+index aa3411354e66d..16bd693d0b3aa 100644
+--- a/fs/squashfs/inode.c
++++ b/fs/squashfs/inode.c
+@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+       gid_t i_gid;
+       int err;
++      inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
++      if (inode->i_ino == 0)
++              return -EINVAL;
++
+       err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
+       if (err)
+               return err;
+@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+       i_uid_write(inode, i_uid);
+       i_gid_write(inode, i_gid);
+-      inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+       inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
+       inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
+       inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
+-- 
+2.43.0
+
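The sequence of events in the log message above amounts to a sentinel collision: inode number 0 marks a cached metadata index slot as unused, so an inode that really carries the (invalid) number 0 matches an invalidated, never-filled slot on lookup. The sketch below is a simplified, hypothetical illustration of that pattern, not the actual squashfs structures or functions; it only shows why rejecting a zero inode number at inode creation closes the hole.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for a cached metadata index slot. */
struct meta_slot {
	unsigned int inode_number;	/* 0 doubles as the "unused" sentinel */
	int filled;			/* nonzero once the slot holds valid data */
};

static struct meta_slot cache[4];

/* Lookup purely by inode number, as described for locate_meta_index(). */
static struct meta_slot *locate(unsigned int ino)
{
	for (size_t i = 0; i < 4; i++)
		if (cache[i].inode_number == ino)
			return &cache[i];
	return NULL;
}

int main(void)
{
	/* A failed fill "invalidates" slot 0 by writing the sentinel value... */
	cache[0].inode_number = 0;
	cache[0].filled = 0;

	/*
	 * ...so a later lookup for an inode numbered 0 matches the unused,
	 * unfilled slot and the caller treats it as a filled index.
	 * Refusing inode number 0 up front, as the patch does, means the
	 * sentinel can never collide with a real inode.
	 */
	struct meta_slot *slot = locate(0);
	printf("lookup for ino 0 hit an unfilled slot: %s\n",
	       slot && !slot->filled ? "yes" : "no");
	return 0;
}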
diff --git a/queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch b/queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch
new file mode 100644 (file)
index 0000000..5afec4a
--- /dev/null
@@ -0,0 +1,40 @@
+From 87c95dbeff05ea29654e73ee7059cbf94bc5a227 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 Oct 2023 14:52:55 -0400
+Subject: squashfs: convert to new timestamp accessors
+
+From: Jeff Layton <jlayton@kernel.org>
+
+[ Upstream commit a1f13ed8c74893ed31d41c5bca156a623b0e9a86 ]
+
+Convert to using the new inode timestamp accessor functions.
+
+Signed-off-by: Jeff Layton <jlayton@kernel.org>
+Link: https://lore.kernel.org/r/20231004185347.80880-68-jlayton@kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 9253c54e01b6 ("Squashfs: check the inode number is not the invalid value of zero")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/squashfs/inode.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
+index c6e626b00546b..aa3411354e66d 100644
+--- a/fs/squashfs/inode.c
++++ b/fs/squashfs/inode.c
+@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+       i_uid_write(inode, i_uid);
+       i_gid_write(inode, i_gid);
+       inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+-      inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
+-      inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
+-      inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
++      inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
++      inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
++      inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
+       inode->i_mode = le16_to_cpu(sqsh_ino->mode);
+       inode->i_size = 0;
+-- 
+2.43.0
+
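For context on the one-line description above: the conversion replaces direct stores to inode->i_mtime / i_atime and the old inode_set_ctime() idiom with the inode_set_mtime(), inode_set_atime(), inode_set_ctime() helpers and the inode_get_*_sec() readers. A minimal before/after sketch for a hypothetical filesystem's inode setup; it assumes the same helpers the patch itself calls are available in this 6.6 queue, and "raw_mtime" stands in for an on-disk field.

#include <linux/fs.h>

/* Sketch only, not squashfs code. */
static void example_set_times(struct inode *inode, u32 raw_mtime)
{
	/* Old style (removed by the conversion):
	 *	inode->i_mtime.tv_sec = raw_mtime;
	 *	inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
	 *	inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
	 */

	/* New style, mirroring the hunk above. */
	inode_set_mtime(inode, raw_mtime, 0);
	inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
	inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
}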