From: Sasha Levin Date: Mon, 29 Apr 2024 01:53:52 +0000 (-0400) Subject: Fixes for 6.6 X-Git-Tag: v6.1.89~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f02dbe1bde98661d30c6653d1227d89f4150e1b0;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.6 Signed-off-by: Sasha Levin --- diff --git a/queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch b/queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch new file mode 100644 index 00000000000..692c36ce053 --- /dev/null +++ b/queue-6.6/drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch @@ -0,0 +1,59 @@ +From 20d98716f2ddccaf0c08ef3edeb921836b32d8d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 12 Feb 2024 16:04:24 -0500 +Subject: drm: add drm_gem_object_is_shared_for_memory_stats() helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Alex Deucher + +[ Upstream commit b31f5eba32ae8cc28e7cfa5a55ec8670d8c718e2 ] + +Add a helper so that drm drivers can consistently report +shared status via the fdinfo shared memory stats interface. + +In addition to handle count, show buffers as shared if they +are shared via dma-buf as well (e.g., shared with v4l or some +other subsystem). + +v2: switch to inline function + +Link: https://lore.kernel.org/all/20231207180225.439482-1-alexander.deucher@amd.com/ +Reviewed-by: Tvrtko Ursulin (v1) +Signed-off-by: Alex Deucher +Reviewed-by: Christian König +Signed-off-by: Christian König +Stable-dep-of: a6ff969fe9cb ("drm/amdgpu: fix visible VRAM handling during faults") +Signed-off-by: Sasha Levin +--- + include/drm/drm_gem.h | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h +index bc9f6aa2f3fec..7c2ec139c464a 100644 +--- a/include/drm/drm_gem.h ++++ b/include/drm/drm_gem.h +@@ -544,6 +544,19 @@ unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru, + + int drm_gem_evict(struct drm_gem_object *obj); + ++/** ++ * drm_gem_object_is_shared_for_memory_stats - helper for shared memory stats ++ * ++ * This helper should only be used for fdinfo shared memory stats to determine ++ * if a GEM object is shared. ++ * ++ * @obj: obj in question ++ */ ++static inline bool drm_gem_object_is_shared_for_memory_stats(struct drm_gem_object *obj) ++{ ++ return (obj->handle_count > 1) || obj->dma_buf; ++} ++ + #ifdef CONFIG_LOCKDEP + /** + * drm_gem_gpuva_set_lock() - Set the lock protecting accesses to the gpuva list. +-- +2.43.0 + diff --git a/queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch b/queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch new file mode 100644 index 00000000000..9eb645b6507 --- /dev/null +++ b/queue-6.6/drm-amdgpu-add-shared-fdinfo-stats.patch @@ -0,0 +1,112 @@ +From 414f410b10b6323ad6499db3bafb65b241dfa7c6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 12 Feb 2024 16:04:26 -0500 +Subject: drm/amdgpu: add shared fdinfo stats +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Alex Deucher + +[ Upstream commit ba1a58d5b907bdf1814f8f57434aebc86233430f ] + +Add shared stats. Useful for seeing shared memory. 
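As a rough illustration only (not part of the patch): the new keys can be read
straight from procfs. The key names below come from the diff; the helper and
path handling are a hypothetical sketch.

  #include <stdio.h>
  #include <string.h>

  /* Print the drm-shared-* lines from an fdinfo file, e.g.
   * /proc/<pid>/fdinfo/<fd> for an open DRM file descriptor. */
  static void print_shared_stats(const char *fdinfo_path)
  {
          char line[256];
          FILE *f = fopen(fdinfo_path, "r");

          if (!f)
                  return;
          while (fgets(line, sizeof(line), f))
                  if (!strncmp(line, "drm-shared-", 11))
                          fputs(line, stdout);    /* drm-shared-vram/gtt/cpu */
          fclose(f);
  }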
+ +v2: take dma-buf into account as well +v3: use the new gem helper + +Link: https://lore.kernel.org/all/20231207180225.439482-1-alexander.deucher@amd.com/ +Signed-off-by: Alex Deucher +Cc: Rob Clark +Reviewed-by: Christian König +Signed-off-by: Christian König +Stable-dep-of: a6ff969fe9cb ("drm/amdgpu: fix visible VRAM handling during faults") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c | 4 ++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 11 +++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 6 ++++++ + 3 files changed, 21 insertions(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c +index 6038b5021b27b..792c059ff7b35 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c +@@ -105,6 +105,10 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file) + stats.requested_visible_vram/1024UL); + drm_printf(p, "amd-requested-gtt:\t%llu KiB\n", + stats.requested_gtt/1024UL); ++ drm_printf(p, "drm-shared-vram:\t%llu KiB\n", stats.vram_shared/1024UL); ++ drm_printf(p, "drm-shared-gtt:\t%llu KiB\n", stats.gtt_shared/1024UL); ++ drm_printf(p, "drm-shared-cpu:\t%llu KiB\n", stats.cpu_shared/1024UL); ++ + for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) { + if (!usage[hw_ip]) + continue; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +index 173b43a5aa13b..394f475877e3b 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +@@ -1281,25 +1281,36 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, + struct amdgpu_mem_stats *stats) + { + uint64_t size = amdgpu_bo_size(bo); ++ struct drm_gem_object *obj; + unsigned int domain; ++ bool shared; + + /* Abort if the BO doesn't currently have a backing store */ + if (!bo->tbo.resource) + return; + ++ obj = &bo->tbo.base; ++ shared = drm_gem_object_is_shared_for_memory_stats(obj); ++ + domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); + switch (domain) { + case AMDGPU_GEM_DOMAIN_VRAM: + stats->vram += size; + if (amdgpu_bo_in_cpu_visible_vram(bo)) + stats->visible_vram += size; ++ if (shared) ++ stats->vram_shared += size; + break; + case AMDGPU_GEM_DOMAIN_GTT: + stats->gtt += size; ++ if (shared) ++ stats->gtt_shared += size; + break; + case AMDGPU_GEM_DOMAIN_CPU: + default: + stats->cpu += size; ++ if (shared) ++ stats->cpu_shared += size; + break; + } + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +index a3ea8a82db23a..be679c42b0b8c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +@@ -138,12 +138,18 @@ struct amdgpu_bo_vm { + struct amdgpu_mem_stats { + /* current VRAM usage, includes visible VRAM */ + uint64_t vram; ++ /* current shared VRAM usage, includes visible VRAM */ ++ uint64_t vram_shared; + /* current visible VRAM usage */ + uint64_t visible_vram; + /* current GTT usage */ + uint64_t gtt; ++ /* current shared GTT usage */ ++ uint64_t gtt_shared; + /* current system memory usage */ + uint64_t cpu; ++ /* current shared system memory usage */ ++ uint64_t cpu_shared; + /* sum of evicted buffers, includes visible VRAM */ + uint64_t evicted_vram; + /* sum of evicted buffers due to CPU access */ +-- +2.43.0 + diff --git a/queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch b/queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch 
new file mode 100644 index 00000000000..4f64bafe6cc --- /dev/null +++ b/queue-6.6/drm-amdgpu-fix-visible-vram-handling-during-faults.patch @@ -0,0 +1,283 @@ +From 18ea34ca16ae5fcde5f879e412836c1a81d28914 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 4 Apr 2024 16:25:40 +0200 +Subject: drm/amdgpu: fix visible VRAM handling during faults +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Christian König + +[ Upstream commit a6ff969fe9cbf369e3cd0ac54261fec1122682ec ] + +When we removed the hacky start code check we actually didn't took into +account that *all* VRAM pages needs to be CPU accessible. + +Clean up the code and unify the handling into a single helper which +checks if the whole resource is CPU accessible. + +The only place where a partial check would make sense is during +eviction, but that is neglitible. + +Signed-off-by: Christian König +Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") +Reviewed-by: Alex Deucher +Signed-off-by: Alex Deucher +CC: stable@vger.kernel.org +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 ++++---- + drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 -------- + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++++++++++++++-------- + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 ++ + 5 files changed, 53 insertions(+), 57 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +index c0a3afe81bb1a..4294f5e7bff9a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +@@ -819,7 +819,7 @@ static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo) + + p->bytes_moved += ctx.bytes_moved; + if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && +- amdgpu_bo_in_cpu_visible_vram(bo)) ++ amdgpu_res_cpu_visible(adev, bo->tbo.resource)) + p->bytes_moved_vis += ctx.bytes_moved; + + if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +index 394f475877e3b..361f2cc94e8e5 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +@@ -625,8 +625,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, + return r; + + if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && +- bo->tbo.resource->mem_type == TTM_PL_VRAM && +- amdgpu_bo_in_cpu_visible_vram(bo)) ++ amdgpu_res_cpu_visible(adev, bo->tbo.resource)) + amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved, + ctx.bytes_moved); + else +@@ -1280,23 +1279,25 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict) + void amdgpu_bo_get_memory(struct amdgpu_bo *bo, + struct amdgpu_mem_stats *stats) + { ++ struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); ++ struct ttm_resource *res = bo->tbo.resource; + uint64_t size = amdgpu_bo_size(bo); + struct drm_gem_object *obj; + unsigned int domain; + bool shared; + + /* Abort if the BO doesn't currently have a backing store */ +- if (!bo->tbo.resource) ++ if (!res) + return; + + obj = &bo->tbo.base; + shared = drm_gem_object_is_shared_for_memory_stats(obj); + +- domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); ++ domain = amdgpu_mem_type_to_domain(res->mem_type); + switch (domain) { + case AMDGPU_GEM_DOMAIN_VRAM: + stats->vram += size; +- if (amdgpu_bo_in_cpu_visible_vram(bo)) ++ if (amdgpu_res_cpu_visible(adev, 
bo->tbo.resource)) + stats->visible_vram += size; + if (shared) + stats->vram_shared += size; +@@ -1395,10 +1396,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) + /* Remember that this BO was accessed by the CPU */ + abo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + +- if (bo->resource->mem_type != TTM_PL_VRAM) +- return 0; +- +- if (amdgpu_bo_in_cpu_visible_vram(abo)) ++ if (amdgpu_res_cpu_visible(adev, bo->resource)) + return 0; + + /* Can't move a pinned BO to visible VRAM */ +@@ -1422,7 +1420,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) + + /* this should never happen */ + if (bo->resource->mem_type == TTM_PL_VRAM && +- !amdgpu_bo_in_cpu_visible_vram(abo)) ++ !amdgpu_res_cpu_visible(adev, bo->resource)) + return VM_FAULT_SIGBUS; + + ttm_bo_move_to_lru_tail_unlocked(bo); +@@ -1582,6 +1580,7 @@ uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, + */ + u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) + { ++ struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + struct dma_buf_attachment *attachment; + struct dma_buf *dma_buf; + const char *placement; +@@ -1590,10 +1589,11 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) + + if (dma_resv_trylock(bo->tbo.base.resv)) { + unsigned int domain; ++ + domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); + switch (domain) { + case AMDGPU_GEM_DOMAIN_VRAM: +- if (amdgpu_bo_in_cpu_visible_vram(bo)) ++ if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) + placement = "VRAM VISIBLE"; + else + placement = "VRAM"; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +index be679c42b0b8c..fa03d9e4874cc 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +@@ -250,28 +250,6 @@ static inline u64 amdgpu_bo_mmap_offset(struct amdgpu_bo *bo) + return drm_vma_node_offset_addr(&bo->tbo.base.vma_node); + } + +-/** +- * amdgpu_bo_in_cpu_visible_vram - check if BO is (partly) in visible VRAM +- */ +-static inline bool amdgpu_bo_in_cpu_visible_vram(struct amdgpu_bo *bo) +-{ +- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); +- struct amdgpu_res_cursor cursor; +- +- if (!bo->tbo.resource || bo->tbo.resource->mem_type != TTM_PL_VRAM) +- return false; +- +- amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); +- while (cursor.remaining) { +- if (cursor.start < adev->gmc.visible_vram_size) +- return true; +- +- amdgpu_res_next(&cursor, cursor.size); +- } +- +- return false; +-} +- + /** + * amdgpu_bo_explicit_sync - return whether the bo is explicitly synced + */ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +index 1124e2d4f8530..d1687b5725693 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +@@ -137,7 +137,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo, + amdgpu_bo_placement_from_domain(abo, AMDGPU_GEM_DOMAIN_CPU); + } else if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && + !(abo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) && +- amdgpu_bo_in_cpu_visible_vram(abo)) { ++ amdgpu_res_cpu_visible(adev, bo->resource)) { + + /* Try evicting to the CPU inaccessible part of VRAM + * first, but only set GTT as busy placement, so this +@@ -408,40 +408,55 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, + return r; + } + +-/* +- * amdgpu_mem_visible - Check that 
memory can be accessed by ttm_bo_move_memcpy ++/** ++ * amdgpu_res_cpu_visible - Check that resource can be accessed by CPU ++ * @adev: amdgpu device ++ * @res: the resource to check + * +- * Called by amdgpu_bo_move() ++ * Returns: true if the full resource is CPU visible, false otherwise. + */ +-static bool amdgpu_mem_visible(struct amdgpu_device *adev, +- struct ttm_resource *mem) ++bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, ++ struct ttm_resource *res) + { +- u64 mem_size = (u64)mem->size; + struct amdgpu_res_cursor cursor; +- u64 end; + +- if (mem->mem_type == TTM_PL_SYSTEM || +- mem->mem_type == TTM_PL_TT) ++ if (!res) ++ return false; ++ ++ if (res->mem_type == TTM_PL_SYSTEM || res->mem_type == TTM_PL_TT || ++ res->mem_type == AMDGPU_PL_PREEMPT) + return true; +- if (mem->mem_type != TTM_PL_VRAM) ++ ++ if (res->mem_type != TTM_PL_VRAM) + return false; + +- amdgpu_res_first(mem, 0, mem_size, &cursor); +- end = cursor.start + cursor.size; ++ amdgpu_res_first(res, 0, res->size, &cursor); + while (cursor.remaining) { ++ if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size) ++ return false; + amdgpu_res_next(&cursor, cursor.size); ++ } + +- if (!cursor.remaining) +- break; ++ return true; ++} + +- /* ttm_resource_ioremap only supports contiguous memory */ +- if (end != cursor.start) +- return false; ++/* ++ * amdgpu_res_copyable - Check that memory can be accessed by ttm_bo_move_memcpy ++ * ++ * Called by amdgpu_bo_move() ++ */ ++static bool amdgpu_res_copyable(struct amdgpu_device *adev, ++ struct ttm_resource *mem) ++{ ++ if (!amdgpu_res_cpu_visible(adev, mem)) ++ return false; + +- end = cursor.start + cursor.size; +- } ++ /* ttm_resource_ioremap only supports contiguous memory */ ++ if (mem->mem_type == TTM_PL_VRAM && ++ !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)) ++ return false; + +- return end <= adev->gmc.visible_vram_size; ++ return true; + } + + /* +@@ -534,8 +549,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, + + if (r) { + /* Check that all memory is CPU accessible */ +- if (!amdgpu_mem_visible(adev, old_mem) || +- !amdgpu_mem_visible(adev, new_mem)) { ++ if (!amdgpu_res_copyable(adev, old_mem) || ++ !amdgpu_res_copyable(adev, new_mem)) { + pr_err("Move buffer fallback to memcpy unavailable\n"); + return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +index 65ec82141a8e0..32cf6b6f6efd9 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +@@ -139,6 +139,9 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr, + int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr, + uint64_t start); + ++bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, ++ struct ttm_resource *res); ++ + int amdgpu_ttm_init(struct amdgpu_device *adev); + void amdgpu_ttm_fini(struct amdgpu_device *adev); + void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, +-- +2.43.0 + diff --git a/queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch b/queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch new file mode 100644 index 00000000000..4097b8e3cf2 --- /dev/null +++ b/queue-6.6/drm-ttm-stop-pooling-cached-numa-pages-v2.patch @@ -0,0 +1,112 @@ +From 1c9adb22902b02073346490edd1c27feb4ff0eb3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Apr 2024 15:48:21 +0200 +Subject: drm/ttm: stop pooling cached NUMA pages v2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit 
+ +From: Christian König + +[ Upstream commit b6976f323a8687cc0d55bc92c2086fd934324ed5 ] + +We only pool write combined and uncached allocations because they +require extra overhead on allocation and release. + +If we also pool cached NUMA it not only means some extra unnecessary +overhead, but also that under memory pressure it can happen that +pages from the wrong NUMA node enters the pool and are re-used +over and over again. + +This can lead to performance reduction after running into memory +pressure. + +v2: restructure and cleanup the code a bit from the internal hack to + test this. + +Signed-off-by: Christian König +Fixes: 4482d3c94d7f ("drm/ttm: add NUMA node id to the pool") +CC: stable@vger.kernel.org +Reviewed-by: Felix Kuehling +Link: https://patchwork.freedesktop.org/patch/msgid/20240415134821.1919-1-christian.koenig@amd.com +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/ttm/ttm_pool.c | 38 +++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 10 deletions(-) + +diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c +index c8ec6a2cac5d4..37c08fac7e7d0 100644 +--- a/drivers/gpu/drm/ttm/ttm_pool.c ++++ b/drivers/gpu/drm/ttm/ttm_pool.c +@@ -287,17 +287,23 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool, + enum ttm_caching caching, + unsigned int order) + { +- if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) ++ if (pool->use_dma_alloc) + return &pool->caching[caching].orders[order]; + + #ifdef CONFIG_X86 + switch (caching) { + case ttm_write_combined: ++ if (pool->nid != NUMA_NO_NODE) ++ return &pool->caching[caching].orders[order]; ++ + if (pool->use_dma32) + return &global_dma32_write_combined[order]; + + return &global_write_combined[order]; + case ttm_uncached: ++ if (pool->nid != NUMA_NO_NODE) ++ return &pool->caching[caching].orders[order]; ++ + if (pool->use_dma32) + return &global_dma32_uncached[order]; + +@@ -563,11 +569,17 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, + pool->use_dma_alloc = use_dma_alloc; + pool->use_dma32 = use_dma32; + +- if (use_dma_alloc || nid != NUMA_NO_NODE) { +- for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) +- for (j = 0; j < NR_PAGE_ORDERS; ++j) +- ttm_pool_type_init(&pool->caching[i].orders[j], +- pool, i, j); ++ for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { ++ for (j = 0; j < NR_PAGE_ORDERS; ++j) { ++ struct ttm_pool_type *pt; ++ ++ /* Initialize only pool types which are actually used */ ++ pt = ttm_pool_select_type(pool, i, j); ++ if (pt != &pool->caching[i].orders[j]) ++ continue; ++ ++ ttm_pool_type_init(pt, pool, i, j); ++ } + } + } + EXPORT_SYMBOL(ttm_pool_init); +@@ -584,10 +596,16 @@ void ttm_pool_fini(struct ttm_pool *pool) + { + unsigned int i, j; + +- if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { +- for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) +- for (j = 0; j < NR_PAGE_ORDERS; ++j) +- ttm_pool_type_fini(&pool->caching[i].orders[j]); ++ for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { ++ for (j = 0; j < NR_PAGE_ORDERS; ++j) { ++ struct ttm_pool_type *pt; ++ ++ pt = ttm_pool_select_type(pool, i, j); ++ if (pt != &pool->caching[i].orders[j]) ++ continue; ++ ++ ttm_pool_type_fini(pt); ++ } + } + + /* We removed the pool types from the LRU, but we need to also make sure +-- +2.43.0 + diff --git a/queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch b/queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch new file mode 100644 index 00000000000..1f7d5a2f613 --- /dev/null +++ 
b/queue-6.6/kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch @@ -0,0 +1,135 @@ +From e53f791282b6567256f6c19d43cf78cd053ff7bb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Mar 2024 17:36:40 -0800 +Subject: KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at + "RESET" + +From: Sean Christopherson + +[ Upstream commit de120e1d692d73c7eefa3278837b1eb68f90728a ] + +Set the enable bits for general purpose counters in IA32_PERF_GLOBAL_CTRL +when refreshing the PMU to emulate the MSR's architecturally defined +post-RESET behavior. Per Intel's SDM: + + IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. + +and + + Where "n" is the number of general-purpose counters available in the processor. + +AMD also documents this behavior for PerfMonV2 CPUs in one of AMD's many +PPRs. + +Do not set any PERF_GLOBAL_CTRL bits if there are no general purpose +counters, although a literal reading of the SDM would require the CPU to +set either bits 63:0 or 31:0. The intent of the behavior is to globally +enable all GP counters; honor the intent, if not the letter of the law. + +Leaving PERF_GLOBAL_CTRL '0' effectively breaks PMU usage in guests that +haven't been updated to work with PMUs that support PERF_GLOBAL_CTRL. +This bug was recently exposed when KVM added supported for AMD's +PerfMonV2, i.e. when KVM started exposing a vPMU with PERF_GLOBAL_CTRL to +guest software that only knew how to program v1 PMUs (that don't support +PERF_GLOBAL_CTRL). + +Failure to emulate the post-RESET behavior results in such guests +unknowingly leaving all general purpose counters globally disabled (the +entire reason the post-RESET value sets the GP counter enable bits is to +maintain backwards compatibility). + +The bug has likely gone unnoticed because PERF_GLOBAL_CTRL has been +supported on Intel CPUs for as long as KVM has existed, i.e. hardly anyone +is running guest software that isn't aware of PERF_GLOBAL_CTRL on Intel +PMUs. And because up until v6.0, KVM _did_ emulate the behavior for Intel +CPUs, although the old behavior was likely dumb luck. + +Because (a) that old code was also broken in its own way (the history of +this code is a comedy of errors), and (b) PERF_GLOBAL_CTRL was documented +as having a value of '0' post-RESET in all SDMs before March 2023. + +Initial vPMU support in commit f5132b01386b ("KVM: Expose a version 2 +architectural PMU to a guests") *almost* got it right (again likely by +dumb luck), but for some reason only set the bits if the guest PMU was +advertised as v1: + + if (pmu->version == 1) { + pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; + return; + } + +Commit f19a0c2c2e6a ("KVM: PMU emulation: GLOBAL_CTRL MSR should be +enabled on reset") then tried to remedy that goof, presumably because +guest PMUs were leaving PERF_GLOBAL_CTRL '0', i.e. weren't enabling +counters. + + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + +That was KVM's behavior up until commit c49467a45fe0 ("KVM: x86/pmu: +Don't overwrite the pmu->global_ctrl when refreshing") removed +*everything*. However, it did so based on the behavior defined by the +SDM , which at the time stated that "Global Perf Counter Controls" is +'0' at Power-Up and RESET. 
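For context, a simplified model of why a '0' value matters (illustrative only,
not kernel code): on PMU v2+ a general purpose counter only counts when both
its PERF_GLOBAL_CTRL enable bit and its PERFEVTSEL enable bit (bit 22) are set,
so a v1-only guest that programs just PERFEVTSEL never gets a running counter.

  #include <stdbool.h>
  #include <stdint.h>

  /* Simplified: GP counter i counts only if globally and locally enabled. */
  static bool gp_counter_counting(uint64_t global_ctrl, uint64_t eventsel,
                                  unsigned int i)
  {
          return (global_ctrl & (1ull << i)) && (eventsel & (1ull << 22));
  }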
+ +But then the March 2023 SDM (325462-079US), stealthily changed its +"IA-32 and Intel 64 Processor States Following Power-up, Reset, or INIT" +table to say: + + IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. + +Note, kvm_pmu_refresh() can be invoked multiple times, i.e. it's not a +"pure" RESET flow. But it can only be called prior to the first KVM_RUN, +i.e. the guest will only ever observe the final value. + +Note #2, KVM has always cleared global_ctrl during refresh (see commit +f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")), +i.e. there is no danger of breaking existing setups by clobbering a value +set by userspace. + +Reported-by: Babu Moger +Cc: Sandipan Das +Cc: Like Xu +Cc: Mingwei Zhang +Cc: Dapeng Mi +Cc: stable@vger.kernel.org +Reviewed-by: Dapeng Mi +Tested-by: Dapeng Mi +Link: https://lore.kernel.org/r/20240309013641.1413400-2-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/pmu.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c +index fa6f5cd70d4c8..da2d82e3a8735 100644 +--- a/arch/x86/kvm/pmu.c ++++ b/arch/x86/kvm/pmu.c +@@ -716,8 +716,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) + pmu->pebs_data_cfg_mask = ~0ull; + bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + +- if (vcpu->kvm->arch.enable_pmu) +- static_call(kvm_x86_pmu_refresh)(vcpu); ++ if (!vcpu->kvm->arch.enable_pmu) ++ return; ++ ++ static_call(kvm_x86_pmu_refresh)(vcpu); ++ ++ /* ++ * At RESET, both Intel and AMD CPUs set all enable bits for general ++ * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that ++ * was written for v1 PMUs don't unknowingly leave GP counters disabled ++ * in the global controls). Emulate that behavior when refreshing the ++ * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. ++ */ ++ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) ++ pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + } + + void kvm_pmu_init(struct kvm_vcpu *vcpu) +-- +2.43.0 + diff --git a/queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch b/queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch new file mode 100644 index 00000000000..4c4e17c8df1 --- /dev/null +++ b/queue-6.6/kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch @@ -0,0 +1,109 @@ +From ca1df8e3df9e6041412863834209ce5cdef70af9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Nov 2023 18:28:48 -0800 +Subject: KVM: x86/pmu: Zero out PMU metadata on AMD if PMU is disabled + +From: Sean Christopherson + +[ Upstream commit f933b88e20150f15787390e2a1754a7e412754ed ] + +Move the purging of common PMU metadata from intel_pmu_refresh() to +kvm_pmu_refresh(), and invoke the vendor refresh() hook if and only if +the VM is supposed to have a vPMU. + +KVM already denies access to the PMU based on kvm->arch.enable_pmu, as +get_gp_pmc_amd() returns NULL for all PMCs in that case, i.e. KVM already +violates AMD's architecture by not virtualizing a PMU (kernels have long +since learned to not panic when the PMU is unavailable). But configuring +the PMU as if it were enabled causes unwanted side effects, e.g. calls to +kvm_pmu_trigger_event() waste an absurd number of cycles due to the +all_valid_pmc_idx bitmap being non-zero. 
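The resulting flow is roughly the following standalone sketch (mock types and
names, not the real KVM structures): common metadata is always purged, and the
vendor hook only repopulates it when the VM actually has a vPMU.

  #include <stdbool.h>
  #include <string.h>

  struct mock_pmu {
          unsigned int version, nr_gp, nr_fixed;
          unsigned long long global_ctrl;
  };

  static void vendor_refresh(struct mock_pmu *pmu) { /* Intel/AMD setup */ }

  static void pmu_refresh(struct mock_pmu *pmu, bool vm_has_pmu)
  {
          memset(pmu, 0, sizeof(*pmu));   /* always purge common metadata */
          if (!vm_has_pmu)
                  return;                 /* vPMU disabled: leave it zeroed */
          vendor_refresh(pmu);            /* only now repopulate vendor state */
  }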
+ +Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization") +Reported-by: Konstantin Khorenko +Closes: https://lore.kernel.org/all/20231109180646.2963718-2-khorenko@virtuozzo.com +Link: https://lore.kernel.org/r/20231110022857.1273836-2-seanjc@google.com +Signed-off-by: Sean Christopherson +Stable-dep-of: de120e1d692d ("KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at "RESET"") +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/pmu.c | 20 ++++++++++++++++++-- + arch/x86/kvm/vmx/pmu_intel.c | 16 ++-------------- + 2 files changed, 20 insertions(+), 16 deletions(-) + +diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c +index dc8e8e907cfbf..fa6f5cd70d4c8 100644 +--- a/arch/x86/kvm/pmu.c ++++ b/arch/x86/kvm/pmu.c +@@ -691,6 +691,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) + */ + void kvm_pmu_refresh(struct kvm_vcpu *vcpu) + { ++ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); ++ + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + +@@ -700,8 +702,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) + */ + kvm_pmu_reset(vcpu); + +- bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); +- static_call(kvm_x86_pmu_refresh)(vcpu); ++ pmu->version = 0; ++ pmu->nr_arch_gp_counters = 0; ++ pmu->nr_arch_fixed_counters = 0; ++ pmu->counter_bitmask[KVM_PMC_GP] = 0; ++ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; ++ pmu->reserved_bits = 0xffffffff00200000ull; ++ pmu->raw_event_mask = X86_RAW_EVENT_MASK; ++ pmu->global_ctrl_mask = ~0ull; ++ pmu->global_status_mask = ~0ull; ++ pmu->fixed_ctr_ctrl_mask = ~0ull; ++ pmu->pebs_enable_mask = ~0ull; ++ pmu->pebs_data_cfg_mask = ~0ull; ++ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); ++ ++ if (vcpu->kvm->arch.enable_pmu) ++ static_call(kvm_x86_pmu_refresh)(vcpu); + } + + void kvm_pmu_init(struct kvm_vcpu *vcpu) +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 1549461fa42b7..48a2f77f62ef3 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -493,19 +493,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + u64 counter_mask; + int i; + +- pmu->nr_arch_gp_counters = 0; +- pmu->nr_arch_fixed_counters = 0; +- pmu->counter_bitmask[KVM_PMC_GP] = 0; +- pmu->counter_bitmask[KVM_PMC_FIXED] = 0; +- pmu->version = 0; +- pmu->reserved_bits = 0xffffffff00200000ull; +- pmu->raw_event_mask = X86_RAW_EVENT_MASK; +- pmu->global_ctrl_mask = ~0ull; +- pmu->global_status_mask = ~0ull; +- pmu->fixed_ctr_ctrl_mask = ~0ull; +- pmu->pebs_enable_mask = ~0ull; +- pmu->pebs_data_cfg_mask = ~0ull; +- + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); + + /* +@@ -517,8 +504,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + return; + + entry = kvm_find_cpuid_entry(vcpu, 0xa); +- if (!entry || !vcpu->kvm->arch.enable_pmu) ++ if (!entry) + return; ++ + eax.full = entry->eax; + edx.full = entry->edx; + +-- +2.43.0 + diff --git a/queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch b/queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch new file mode 100644 index 00000000000..112b4c4d9ae --- /dev/null +++ b/queue-6.6/mm-gup-explicitly-define-and-check-internal-gup-flag.patch @@ -0,0 +1,79 @@ +From d64d15556b61253f6109226a4c039ace736a218f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 Oct 2023 00:14:52 +0100 +Subject: mm/gup: explicitly define and check internal GUP flags, disallow + FOLL_TOUCH + +From: Lorenzo Stoakes + +[ Upstream commit 0f20bba1688bdf3b32df0162511a67d4eda15790 ] + +Rather than 
open-coding a list of internal GUP flags in +is_valid_gup_args(), define which ones are internal. + +In addition, explicitly check to see if the user passed in FOLL_TOUCH +somehow, as this appears to have been accidentally excluded. + +Link: https://lkml.kernel.org/r/971e013dfe20915612ea8b704e801d7aef9a66b6.1696288092.git.lstoakes@gmail.com +Signed-off-by: Lorenzo Stoakes +Reviewed-by: Arnd Bergmann +Reviewed-by: David Hildenbrand +Reviewed-by: Jason Gunthorpe +Cc: Adrian Hunter +Cc: Alexander Shishkin +Cc: Arnaldo Carvalho de Melo +Cc: Catalin Marinas +Cc: Ian Rogers +Cc: Ingo Molnar +Cc: Jiri Olsa +Cc: John Hubbard +Cc: Mark Rutland +Cc: Namhyung Kim +Cc: Oleg Nesterov +Cc: Peter Zijlstra +Cc: Richard Cochran +Cc: Will Deacon +Signed-off-by: Andrew Morton +Stable-dep-of: 631426ba1d45 ("mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly") +Signed-off-by: Sasha Levin +--- + mm/gup.c | 5 ++--- + mm/internal.h | 3 +++ + 2 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/mm/gup.c b/mm/gup.c +index 2f8a2d89fde19..b21b33d1787e1 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -2227,12 +2227,11 @@ static bool is_valid_gup_args(struct page **pages, int *locked, + /* + * These flags not allowed to be specified externally to the gup + * interfaces: +- * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only ++ * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only + * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL + */ +- if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | +- FOLL_REMOTE | FOLL_FAST_ONLY))) ++ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) + return false; + + gup_flags |= to_set; +diff --git a/mm/internal.h b/mm/internal.h +index 30cf724ddbce3..50cf76d30a88f 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -964,6 +964,9 @@ enum { + FOLL_UNLOCKABLE = 1 << 21, + }; + ++#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ ++ FOLL_FAST_ONLY | FOLL_UNLOCKABLE) ++ + /* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the +-- +2.43.0 + diff --git a/queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch b/queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch new file mode 100644 index 00000000000..2b15ba1f1d4 --- /dev/null +++ b/queue-6.6/mm-madvise-make-madv_populate_-read-write-handle-vm_.patch @@ -0,0 +1,244 @@ +From 7d3472c1c4d4a10f6720d5894d2efd29400d4ea4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Mar 2024 17:12:59 +0100 +Subject: mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY + properly + +From: David Hildenbrand + +[ Upstream commit 631426ba1d45a8672b177ee85ad4cabe760dd131 ] + +Darrick reports that in some cases where pread() would fail with -EIO and +mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ / +MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT. + +While the madvise() call can be interrupted by a signal, this is not the +desired behavior. MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave +like page faults in that case: fail and not retry forever. + +A reproducer can be found at [1]. 
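Not the reproducer referenced above, but a minimal sketch of the expected
user-visible behaviour after this fix: MADV_POPULATE_READ on a range whose
backing read fails should return -1 with errno set to EFAULT rather than
retrying forever.

  #include <errno.h>
  #include <stddef.h>
  #include <stdio.h>
  #include <sys/mman.h>

  #ifndef MADV_POPULATE_READ
  #define MADV_POPULATE_READ 22           /* fallback for older headers */
  #endif

  static int populate_readable(void *addr, size_t len)
  {
          if (madvise(addr, len, MADV_POPULATE_READ) == 0)
                  return 0;
          if (errno == EFAULT)            /* fault handler reported an error */
                  fprintf(stderr, "populate failed: unreadable mapping\n");
          return -1;
  }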
+ +The reason is that __get_user_pages(), as called by +faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way: +it will simply return 0 when VM_FAULT_RETRY happened, making +madvise_populate()->faultin_vma_page_range() retry again and again, never +setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages(). + +__get_user_pages_locked() does what we want, but duplicating that logic in +faultin_vma_page_range() feels wrong. + +So let's use __get_user_pages_locked() instead, that will detect +VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler +return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT +from faultin_page() to __get_user_pages(), all the way to +madvise_populate(). + +But, there is an issue: __get_user_pages_locked() will end up re-taking +the MM lock and then __get_user_pages() will do another VMA lookup. In +the meantime, the VMA layout could have changed and we'd fail with +different error codes than we'd want to. + +As __get_user_pages() will currently do a new VMA lookup either way, let +it do the VMA handling in a different way, controlled by a new +FOLL_MADV_POPULATE flag, effectively moving these checks from +madvise_populate() + faultin_page_range() in there. + +With this change, Darricks reproducer properly fails with -EFAULT, as +documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE. + +[1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/ + +Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com +Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com +Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") +Signed-off-by: David Hildenbrand +Reported-by: Darrick J. Wong +Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/ +Cc: Darrick J. Wong +Cc: Hugh Dickins +Cc: Jason Gunthorpe +Cc: John Hubbard +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/gup.c | 54 ++++++++++++++++++++++++++++++--------------------- + mm/internal.h | 10 ++++++---- + mm/madvise.c | 17 ++-------------- + 3 files changed, 40 insertions(+), 41 deletions(-) + +diff --git a/mm/gup.c b/mm/gup.c +index b21b33d1787e1..cfc0a66d951b9 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1204,6 +1204,22 @@ static long __get_user_pages(struct mm_struct *mm, + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { ++ /* ++ * MADV_POPULATE_(READ|WRITE) wants to handle VMA ++ * lookups+error reporting differently. ++ */ ++ if (gup_flags & FOLL_MADV_POPULATE) { ++ vma = vma_lookup(mm, start); ++ if (!vma) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ if (check_vma_flags(vma, gup_flags)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ goto retry; ++ } + vma = gup_vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, +@@ -1670,35 +1686,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, + } + + /* +- * faultin_vma_page_range() - populate (prefault) page tables inside the +- * given VMA range readable/writable ++ * faultin_page_range() - populate (prefault) page tables inside the ++ * given range readable/writable + * + * This takes care of mlocking the pages, too, if VM_LOCKED is set. 
+ * +- * @vma: target vma ++ * @mm: the mm to populate page tables in + * @start: start address + * @end: end address + * @write: whether to prefault readable or writable + * @locked: whether the mmap_lock is still held + * +- * Returns either number of processed pages in the vma, or a negative error +- * code on error (see __get_user_pages()). ++ * Returns either number of processed pages in the MM, or a negative error ++ * code on error (see __get_user_pages()). Note that this function reports ++ * errors related to VMAs, such as incompatible mappings, as expected by ++ * MADV_POPULATE_(READ|WRITE). + * +- * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and +- * covered by the VMA. If it's released, *@locked will be set to 0. ++ * The range must be page-aligned. ++ * ++ * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. + */ +-long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, +- unsigned long end, bool write, int *locked) ++long faultin_page_range(struct mm_struct *mm, unsigned long start, ++ unsigned long end, bool write, int *locked) + { +- struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + long ret; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); +- VM_BUG_ON_VMA(start < vma->vm_start, vma); +- VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + /* +@@ -1710,19 +1726,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, + * a poisoned page. + * !FOLL_FORCE: Require proper access permissions. + */ +- gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; ++ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | ++ FOLL_MADV_POPULATE; + if (write) + gup_flags |= FOLL_WRITE; + +- /* +- * We want to report -EINVAL instead of -EFAULT for any permission +- * problems or incompatible mappings. 
+- */ +- if (check_vma_flags(vma, gup_flags)) +- return -EINVAL; +- +- ret = __get_user_pages(mm, start, nr_pages, gup_flags, +- NULL, locked); ++ ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, ++ gup_flags); + lru_add_drain(); + return ret; + } +diff --git a/mm/internal.h b/mm/internal.h +index 50cf76d30a88f..abed947f784b7 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -581,9 +581,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); + void unmap_mapping_folio(struct folio *folio); + extern long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *locked); +-extern long faultin_vma_page_range(struct vm_area_struct *vma, +- unsigned long start, unsigned long end, +- bool write, int *locked); ++extern long faultin_page_range(struct mm_struct *mm, unsigned long start, ++ unsigned long end, bool write, int *locked); + extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, + unsigned long bytes); + /* +@@ -962,10 +961,13 @@ enum { + FOLL_FAST_ONLY = 1 << 20, + /* allow unlocking the mmap lock */ + FOLL_UNLOCKABLE = 1 << 21, ++ /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ ++ FOLL_MADV_POPULATE = 1 << 22, + }; + + #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ +- FOLL_FAST_ONLY | FOLL_UNLOCKABLE) ++ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ ++ FOLL_MADV_POPULATE) + + /* + * Indicates for which pages that are write-protected in the page table, +diff --git a/mm/madvise.c b/mm/madvise.c +index 4dded5d27e7ea..98fdb9288a68a 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -917,27 +917,14 @@ static long madvise_populate(struct vm_area_struct *vma, + { + const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = vma->vm_mm; +- unsigned long tmp_end; + int locked = 1; + long pages; + + *prev = vma; + + while (start < end) { +- /* +- * We might have temporarily dropped the lock. For example, +- * our VMA might have been split. +- */ +- if (!vma || start >= vma->vm_end) { +- vma = vma_lookup(mm, start); +- if (!vma) +- return -ENOMEM; +- } +- +- tmp_end = min_t(unsigned long, end, vma->vm_end); + /* Populate (prefault) page tables readable/writable. */ +- pages = faultin_vma_page_range(vma, start, tmp_end, write, +- &locked); ++ pages = faultin_page_range(mm, start, end, write, &locked); + if (!locked) { + mmap_read_lock(mm); + locked = 1; +@@ -958,7 +945,7 @@ static long madvise_populate(struct vm_area_struct *vma, + pr_warn_once("%s: unhandled return value: %ld\n", + __func__, pages); + fallthrough; +- case -ENOMEM: ++ case -ENOMEM: /* No VMA or out of memory. */ + return -ENOMEM; + } + } +-- +2.43.0 + diff --git a/queue-6.6/mm-treewide-introduce-nr_page_orders.patch b/queue-6.6/mm-treewide-introduce-nr_page_orders.patch new file mode 100644 index 00000000000..e6c47285312 --- /dev/null +++ b/queue-6.6/mm-treewide-introduce-nr_page_orders.patch @@ -0,0 +1,441 @@ +From c781c72f054614cc0c84bd00d0e44ed387a86b8c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Dec 2023 17:47:03 +0300 +Subject: mm, treewide: introduce NR_PAGE_ORDERS + +From: Kirill A. Shutemov + +[ Upstream commit fd37721803c6e73619108f76ad2e12a9aa5fafaf ] + +NR_PAGE_ORDERS defines the number of page orders supported by the page +allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total. + +NR_PAGE_ORDERS assists in defining arrays of page orders and allows for +more natural iteration over them. 
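A minimal standalone sketch of the idiom (example MAX_ORDER value, not a real
kernel configuration): NR_PAGE_ORDERS sizes per-order arrays and bounds the
loop, replacing the easier-to-get-wrong "order <= MAX_ORDER" form.

  #include <stddef.h>

  #define MAX_ORDER      10                 /* example value only */
  #define NR_PAGE_ORDERS (MAX_ORDER + 1)    /* orders 0..MAX_ORDER inclusive */

  static unsigned long sum_per_order(const unsigned long counts[NR_PAGE_ORDERS])
  {
          unsigned long total = 0;

          for (size_t order = 0; order < NR_PAGE_ORDERS; order++)
                  total += counts[order];    /* natural "<" bound */
          return total;
  }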
+ +[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning] + Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box +Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com +Signed-off-by: Kirill A. Shutemov +Reviewed-by: Zi Yan +Cc: Linus Torvalds +Signed-off-by: Andrew Morton +Stable-dep-of: b6976f323a86 ("drm/ttm: stop pooling cached NUMA pages v2") +Signed-off-by: Sasha Levin +--- + .../admin-guide/kdump/vmcoreinfo.rst | 6 +++--- + arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 +- + arch/sparc/kernel/traps_64.c | 2 +- + drivers/gpu/drm/ttm/tests/ttm_device_test.c | 2 +- + drivers/gpu/drm/ttm/ttm_pool.c | 20 +++++++++---------- + include/drm/ttm/ttm_pool.h | 2 +- + include/linux/mmzone.h | 6 ++++-- + kernel/crash_core.c | 2 +- + lib/test_meminit.c | 2 +- + mm/compaction.c | 2 +- + mm/kmsan/init.c | 2 +- + mm/page_alloc.c | 13 ++++++------ + mm/page_reporting.c | 2 +- + mm/show_mem.c | 8 ++++---- + mm/vmstat.c | 12 +++++------ + 15 files changed, 42 insertions(+), 41 deletions(-) + +diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst +index 599e8d3bcbc31..9235cf4fbabff 100644 +--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst ++++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst +@@ -172,7 +172,7 @@ variables. + Offset of the free_list's member. This value is used to compute the number + of free pages. + +-Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. ++Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS]. + The free_list represents a linked list of free page blocks. + + (list_head, next|prev) +@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific + information. Makedumpfile gets the start address of the vmalloc region + from this. + +-(zone.free_area, MAX_ORDER + 1) +-------------------------------- ++(zone.free_area, NR_PAGE_ORDERS) ++-------------------------------- + + Free areas descriptor. User-space tools use this value to iterate the + free_area ranges. MAX_ORDER is used by the zone buddy allocator. +diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h +index fe5472a184a37..97c527ef53c2a 100644 +--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h ++++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h +@@ -16,7 +16,7 @@ struct hyp_pool { + * API at EL2. + */ + hyp_spinlock_t lock; +- struct list_head free_area[MAX_ORDER + 1]; ++ struct list_head free_area[NR_PAGE_ORDERS]; + phys_addr_t range_start; + phys_addr_t range_end; + unsigned short max_order; +diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c +index 08ffd17d5ec34..523a6e5ee9251 100644 +--- a/arch/sparc/kernel/traps_64.c ++++ b/arch/sparc/kernel/traps_64.c +@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void) + + /* Now allocate error trap reporting scoreboard. 
*/ + sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + if ((PAGE_SIZE << order) >= sz) + break; + } +diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c +index b1b423b68cdf1..19eaff22e6ae0 100644 +--- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c ++++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c +@@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test) + + if (params->pools_init_expected) { + for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { +- for (int j = 0; j <= MAX_ORDER; ++j) { ++ for (int j = 0; j < NR_PAGE_ORDERS; ++j) { + pt = pool->caching[i].orders[j]; + KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool); + KUNIT_EXPECT_EQ(test, pt.caching, i); +diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c +index 9b60222511d65..c8ec6a2cac5d4 100644 +--- a/drivers/gpu/drm/ttm/ttm_pool.c ++++ b/drivers/gpu/drm/ttm/ttm_pool.c +@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644); + + static atomic_long_t allocated_pages; + +-static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; +-static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; ++static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS]; ++static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS]; + +-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; +-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; ++static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS]; ++static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS]; + + static spinlock_t shrinker_lock; + static struct list_head shrinker_list; +@@ -565,7 +565,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, + + if (use_dma_alloc || nid != NUMA_NO_NODE) { + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) +- for (j = 0; j <= MAX_ORDER; ++j) ++ for (j = 0; j < NR_PAGE_ORDERS; ++j) + ttm_pool_type_init(&pool->caching[i].orders[j], + pool, i, j); + } +@@ -586,7 +586,7 @@ void ttm_pool_fini(struct ttm_pool *pool) + + if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) +- for (j = 0; j <= MAX_ORDER; ++j) ++ for (j = 0; j < NR_PAGE_ORDERS; ++j) + ttm_pool_type_fini(&pool->caching[i].orders[j]); + } + +@@ -641,7 +641,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) + unsigned int i; + + seq_puts(m, "\t "); +- for (i = 0; i <= MAX_ORDER; ++i) ++ for (i = 0; i < NR_PAGE_ORDERS; ++i) + seq_printf(m, " ---%2u---", i); + seq_puts(m, "\n"); + } +@@ -652,7 +652,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, + { + unsigned int i; + +- for (i = 0; i <= MAX_ORDER; ++i) ++ for (i = 0; i < NR_PAGE_ORDERS; ++i) + seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); + seq_puts(m, "\n"); + } +@@ -761,7 +761,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) + spin_lock_init(&shrinker_lock); + INIT_LIST_HEAD(&shrinker_list); + +- for (i = 0; i <= MAX_ORDER; ++i) { ++ for (i = 0; i < NR_PAGE_ORDERS; ++i) { + ttm_pool_type_init(&global_write_combined[i], NULL, + ttm_write_combined, i); + ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); +@@ -794,7 +794,7 @@ void ttm_pool_mgr_fini(void) + { + unsigned int i; + +- for (i = 0; i <= MAX_ORDER; ++i) { ++ for (i = 0; i < NR_PAGE_ORDERS; ++i) { + ttm_pool_type_fini(&global_write_combined[i]); + ttm_pool_type_fini(&global_uncached[i]); + +diff --git 
a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h +index 30a347e5aa114..4490d43c63e33 100644 +--- a/include/drm/ttm/ttm_pool.h ++++ b/include/drm/ttm/ttm_pool.h +@@ -74,7 +74,7 @@ struct ttm_pool { + bool use_dma32; + + struct { +- struct ttm_pool_type orders[MAX_ORDER + 1]; ++ struct ttm_pool_type orders[NR_PAGE_ORDERS]; + } caching[TTM_NUM_CACHING_TYPES]; + }; + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 0f62786269d0c..1acbc6ce1fe43 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -34,6 +34,8 @@ + + #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) + ++#define NR_PAGE_ORDERS (MAX_ORDER + 1) ++ + /* + * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed + * costly to service. That is between allocation orders which should +@@ -95,7 +97,7 @@ static inline bool migratetype_is_mergeable(int mt) + } + + #define for_each_migratetype_order(order, type) \ +- for (order = 0; order <= MAX_ORDER; order++) \ ++ for (order = 0; order < NR_PAGE_ORDERS; order++) \ + for (type = 0; type < MIGRATE_TYPES; type++) + + extern int page_group_by_mobility_disabled; +@@ -929,7 +931,7 @@ struct zone { + CACHELINE_PADDING(_pad1_); + + /* free areas of different sizes */ +- struct free_area free_area[MAX_ORDER + 1]; ++ struct free_area free_area[NR_PAGE_ORDERS]; + + #ifdef CONFIG_UNACCEPTED_MEMORY + /* Pages to be accepted. All pages on the list are MAX_ORDER */ +diff --git a/kernel/crash_core.c b/kernel/crash_core.c +index 2f675ef045d40..b685e94605841 100644 +--- a/kernel/crash_core.c ++++ b/kernel/crash_core.c +@@ -660,7 +660,7 @@ static int __init crash_save_vmcoreinfo_init(void) + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); +- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); ++ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); +diff --git a/lib/test_meminit.c b/lib/test_meminit.c +index 0ae35223d7733..0dc173849a542 100644 +--- a/lib/test_meminit.c ++++ b/lib/test_meminit.c +@@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) + int failures = 0, num_tests = 0; + int i; + +- for (i = 0; i <= MAX_ORDER; i++) ++ for (i = 0; i < NR_PAGE_ORDERS; i++) + num_tests += do_alloc_pages_order(i, &failures); + + REPORT_FAILURES_IN_FN(); +diff --git a/mm/compaction.c b/mm/compaction.c +index 5a3c644c978e2..61c741f11e9bb 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -2225,7 +2225,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) + + /* Direct compactor: Is a suitable page free? */ + ret = COMPACT_NO_SUITABLE_PAGE; +- for (order = cc->order; order <= MAX_ORDER; order++) { ++ for (order = cc->order; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &cc->zone->free_area[order]; + bool can_steal; + +diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c +index ffedf4dbc49d7..103e2e88ea033 100644 +--- a/mm/kmsan/init.c ++++ b/mm/kmsan/init.c +@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) + struct metadata_page_pair { + struct page *shadow, *origin; + }; +-static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; ++static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata; + + /* + * Eager metadata allocation. 
When the memblock allocator is freeing pages to +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index ab71417350127..6b4c30fcae1c9 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1570,7 +1570,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + struct page *page; + + /* Find a page of the appropriate size in the preferred list */ +- for (current_order = order; current_order <= MAX_ORDER; ++current_order) { ++ for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { + area = &(zone->free_area[current_order]); + page = get_page_from_free_area(area, migratetype); + if (!page) +@@ -1940,7 +1940,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + continue; + + spin_lock_irqsave(&zone->lock, flags); +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); +@@ -2050,8 +2050,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + return false; + + find_smallest: +- for (current_order = order; current_order <= MAX_ORDER; +- current_order++) { ++ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); +@@ -2884,7 +2883,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + return true; + + /* For a high-order request, check at least one suitable page is free */ +- for (o = order; o <= MAX_ORDER; o++) { ++ for (o = order; o < NR_PAGE_ORDERS; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + +@@ -6442,7 +6441,7 @@ bool is_free_buddy_page(struct page *page) + unsigned long pfn = page_to_pfn(page); + unsigned int order; + +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && +@@ -6501,7 +6500,7 @@ bool take_page_off_buddy(struct page *page) + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + int page_order = buddy_order(page_head); + +diff --git a/mm/page_reporting.c b/mm/page_reporting.c +index b021f482a4cb3..66369cc5279bf 100644 +--- a/mm/page_reporting.c ++++ b/mm/page_reporting.c +@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, + return err; + + /* Process each free list starting from lowest order/mt */ +- for (order = page_reporting_order; order <= MAX_ORDER; order++) { ++ for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) { + for (mt = 0; mt < MIGRATE_TYPES; mt++) { + /* We do not pull pages from the isolate free list */ + if (is_migrate_isolate(mt)) +diff --git a/mm/show_mem.c b/mm/show_mem.c +index 4b888b18bddea..b896e54e3a26c 100644 +--- a/mm/show_mem.c ++++ b/mm/show_mem.c +@@ -355,8 +355,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z + + for_each_populated_zone(zone) { + unsigned int order; +- unsigned long nr[MAX_ORDER + 1], flags, total = 0; +- unsigned char types[MAX_ORDER + 1]; ++ unsigned long nr[NR_PAGE_ORDERS], flags, total = 0; ++ unsigned char types[NR_PAGE_ORDERS]; + + if (zone_idx(zone) > 
max_zone_idx) + continue; +@@ -366,7 +366,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z + printk(KERN_CONT "%s: ", zone->name); + + spin_lock_irqsave(&zone->lock, flags); +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &zone->free_area[order]; + int type; + +@@ -380,7 +380,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z + } + } + spin_unlock_irqrestore(&zone->lock, flags); +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); + if (nr[order]) +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 00e81e99c6ee2..e9616c4ca12db 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone, + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + +- for (order = 0; order <= MAX_ORDER; order++) { ++ for (order = 0; order < NR_PAGE_ORDERS; order++) { + unsigned long blocks; + + /* +@@ -1471,7 +1471,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, + int order; + + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); +- for (order = 0; order <= MAX_ORDER; ++order) ++ for (order = 0; order < NR_PAGE_ORDERS; ++order) + /* + * Access to nr_free is lockless as nr_free is used only for + * printing purposes. Use data_race to avoid KCSAN warning. +@@ -1500,7 +1500,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, + pgdat->node_id, + zone->name, + migratetype_names[mtype]); +- for (order = 0; order <= MAX_ORDER; ++order) { ++ for (order = 0; order < NR_PAGE_ORDERS; ++order) { + unsigned long freecount = 0; + struct free_area *area; + struct list_head *curr; +@@ -1540,7 +1540,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) + + /* Print header */ + seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); +- for (order = 0; order <= MAX_ORDER; ++order) ++ for (order = 0; order < NR_PAGE_ORDERS; ++order) + seq_printf(m, "%6d ", order); + seq_putc(m, '\n'); + +@@ -2176,7 +2176,7 @@ static void unusable_show_print(struct seq_file *m, + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); +- for (order = 0; order <= MAX_ORDER; ++order) { ++ for (order = 0; order < NR_PAGE_ORDERS; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); +@@ -2228,7 +2228,7 @@ static void extfrag_show_print(struct seq_file *m, + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); +- for (order = 0; order <= MAX_ORDER; ++order) { ++ for (order = 0; order < NR_PAGE_ORDERS; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); +-- +2.43.0 + diff --git a/queue-6.6/series b/queue-6.6/series index 7a1566dbe47..6788759bebb 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -93,3 +93,14 @@ ice-fix-lag-and-vf-lock-dependency-in-ice_reset_vf.patch net-ethernet-ti-am65-cpts-fix-ptpv1-message-type-on-.patch tls-fix-lockless-read-of-strp-msg_ready-in-poll.patch af_unix-suppress-false-positive-lockdep-splat-for-sp.patch +kvm-x86-pmu-zero-out-pmu-metadata-on-amd-if-pmu-is-d.patch +kvm-x86-pmu-set-enable-bits-for-gp-counters-in-perf_.patch 
+mm-gup-explicitly-define-and-check-internal-gup-flag.patch +mm-madvise-make-madv_populate_-read-write-handle-vm_.patch +drm-add-drm_gem_object_is_shared_for_memory_stats-he.patch +drm-amdgpu-add-shared-fdinfo-stats.patch +drm-amdgpu-fix-visible-vram-handling-during-faults.patch +mm-treewide-introduce-nr_page_orders.patch +drm-ttm-stop-pooling-cached-numa-pages-v2.patch +squashfs-convert-to-new-timestamp-accessors.patch +squashfs-check-the-inode-number-is-not-the-invalid-v.patch diff --git a/queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch b/queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch new file mode 100644 index 00000000000..0e277d3245b --- /dev/null +++ b/queue-6.6/squashfs-check-the-inode-number-is-not-the-invalid-v.patch @@ -0,0 +1,72 @@ +From 2c44ece9d52e9e48b8d9813cb44beb104cf8e60b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Apr 2024 23:02:06 +0100 +Subject: Squashfs: check the inode number is not the invalid value of zero + +From: Phillip Lougher + +[ Upstream commit 9253c54e01b6505d348afbc02abaa4d9f8a01395 ] + +Syskiller has produced an out of bounds access in fill_meta_index(). + +That out of bounds access is ultimately caused because the inode +has an inode number with the invalid value of zero, which was not checked. + +The reason this causes the out of bounds access is due to following +sequence of events: + +1. Fill_meta_index() is called to allocate (via empty_meta_index()) + and fill a metadata index. It however suffers a data read error + and aborts, invalidating the newly returned empty metadata index. + It does this by setting the inode number of the index to zero, + which means unused (zero is not a valid inode number). + +2. When fill_meta_index() is subsequently called again on another + read operation, locate_meta_index() returns the previous index + because it matches the inode number of 0. Because this index + has been returned it is expected to have been filled, and because + it hasn't been, an out of bounds access is performed. + +This patch adds a sanity check which checks that the inode number +is not zero when the inode is created and returns -EINVAL if it is. 
+ +[phillip@squashfs.org.uk: whitespace fix] + Link: https://lkml.kernel.org/r/20240409204723.446925-1-phillip@squashfs.org.uk +Link: https://lkml.kernel.org/r/20240408220206.435788-1-phillip@squashfs.org.uk +Signed-off-by: Phillip Lougher +Reported-by: "Ubisectech Sirius" +Closes: https://lore.kernel.org/lkml/87f5c007-b8a5-41ae-8b57-431e924c5915.bugreport@ubisectech.com/ +Cc: Christian Brauner +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + fs/squashfs/inode.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c +index aa3411354e66d..16bd693d0b3aa 100644 +--- a/fs/squashfs/inode.c ++++ b/fs/squashfs/inode.c +@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + gid_t i_gid; + int err; + ++ inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); ++ if (inode->i_ino == 0) ++ return -EINVAL; ++ + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); + if (err) + return err; +@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); +- inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); + inode_set_atime(inode, inode_get_mtime_sec(inode), 0); + inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); +-- +2.43.0 + diff --git a/queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch b/queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch new file mode 100644 index 00000000000..5afec4a2d73 --- /dev/null +++ b/queue-6.6/squashfs-convert-to-new-timestamp-accessors.patch @@ -0,0 +1,40 @@ +From 87c95dbeff05ea29654e73ee7059cbf94bc5a227 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Oct 2023 14:52:55 -0400 +Subject: squashfs: convert to new timestamp accessors + +From: Jeff Layton + +[ Upstream commit a1f13ed8c74893ed31d41c5bca156a623b0e9a86 ] + +Convert to using the new inode timestamp accessor functions. + +Signed-off-by: Jeff Layton +Link: https://lore.kernel.org/r/20231004185347.80880-68-jlayton@kernel.org +Signed-off-by: Christian Brauner +Stable-dep-of: 9253c54e01b6 ("Squashfs: check the inode number is not the invalid value of zero") +Signed-off-by: Sasha Levin +--- + fs/squashfs/inode.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c +index c6e626b00546b..aa3411354e66d 100644 +--- a/fs/squashfs/inode.c ++++ b/fs/squashfs/inode.c +@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); +- inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); +- inode->i_atime.tv_sec = inode->i_mtime.tv_sec; +- inode_set_ctime(inode, inode->i_mtime.tv_sec, 0); ++ inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); ++ inode_set_atime(inode, inode_get_mtime_sec(inode), 0); ++ inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + +-- +2.43.0 +
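Taken together, the two squashfs patches above change how squashfs_new_inode() fills in a fresh inode: the on-disk inode number is validated before it can ever be matched against a stale metadata index, and the timestamps go through the new inode_set_*()/inode_get_*() accessors instead of writing i_mtime and i_atime directly. The sketch below is an illustration only, not part of either queued patch; squashfs_demo_init_inode(), disk_ino and disk_mtime are hypothetical names standing in for the real squashfs_new_inode() arguments.

/*
 * Illustrative sketch, not a queued patch: combines the zero inode-number
 * sanity check with the timestamp accessors used in the patches above.
 */
#include <linux/fs.h>
#include <linux/errno.h>

static int squashfs_demo_init_inode(struct inode *inode, u32 disk_ino,
				    u32 disk_mtime)
{
	/* Zero is never a valid squashfs inode number; reject it before the
	 * inode can be confused with an unused metadata index entry. */
	if (disk_ino == 0)
		return -EINVAL;
	inode->i_ino = disk_ino;

	/* mtime comes from the on-disk value; atime and ctime mirror it. */
	inode_set_mtime(inode, disk_mtime, 0);
	inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
	inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);

	return 0;
}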