From: Greg Kroah-Hartman Date: Tue, 1 Aug 2023 06:28:11 +0000 (+0200) Subject: 6.4-stable patches X-Git-Tag: v5.15.124~31 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f1891554541737597283d002bc5343e9ba2d0146;p=thirdparty%2Fkernel%2Fstable-queue.git 6.4-stable patches added patches: ceph-never-send-metrics-if-disable_send_metrics-is-set.patch dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch drm-i915-dpt-use-shmem-for-dpt-objects.patch mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch mm-mempolicy-take-vma-lock-before-replacing-policy.patch rbd-harden-get_lock_owner_info-a-bit.patch rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch --- diff --git a/queue-6.4/ceph-never-send-metrics-if-disable_send_metrics-is-set.patch b/queue-6.4/ceph-never-send-metrics-if-disable_send_metrics-is-set.patch new file mode 100644 index 00000000000..fec9d1f5e78 --- /dev/null +++ b/queue-6.4/ceph-never-send-metrics-if-disable_send_metrics-is-set.patch @@ -0,0 +1,34 @@ +From 50164507f6b7b7ed85d8c3ac0266849fbd908db7 Mon Sep 17 00:00:00 2001 +From: Xiubo Li +Date: Thu, 20 Jul 2023 11:33:55 +0800 +Subject: ceph: never send metrics if disable_send_metrics is set + +From: Xiubo Li + +commit 50164507f6b7b7ed85d8c3ac0266849fbd908db7 upstream. + +Even the 'disable_send_metrics' is true so when the session is +being opened it will always trigger to send the metric for the +first time. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Xiubo Li +Reviewed-by: Venky Shankar +Reviewed-by: Jeff Layton +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/metric.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ceph/metric.c ++++ b/fs/ceph/metric.c +@@ -208,7 +208,7 @@ static void metric_delayed_work(struct w + struct ceph_mds_client *mdsc = + container_of(m, struct ceph_mds_client, metric); + +- if (mdsc->stopping) ++ if (mdsc->stopping || disable_send_metrics) + return; + + if (!m->session || !check_session_state(m->session)) { diff --git a/queue-6.4/dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch b/queue-6.4/dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch new file mode 100644 index 00000000000..a30e3dbcf06 --- /dev/null +++ b/queue-6.4/dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch @@ -0,0 +1,106 @@ +From 1e4ab7b4c881cf26c1c72b3f56519e03475486fb Mon Sep 17 00:00:00 2001 +From: Joe Thornber +Date: Tue, 25 Jul 2023 11:44:41 -0400 +Subject: dm cache policy smq: ensure IO doesn't prevent cleaner policy progress + +From: Joe Thornber + +commit 1e4ab7b4c881cf26c1c72b3f56519e03475486fb upstream. + +When using the cleaner policy to decommission the cache, there is +never any writeback started from the cache as it is constantly delayed +due to normal I/O keeping the device busy. Meaning @idle=false was +always being passed to clean_target_met() + +Fix this by adding a specific 'cleaner' flag that is set when the +cleaner policy is configured. This flag serves to always allow the +cleaner's writeback work to be queued until the cache is +decommissioned (even if the cache isn't idle). 
+ +Reported-by: David Jeffery +Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") +Cc: stable@vger.kernel.org +Signed-off-by: Joe Thornber +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-cache-policy-smq.c | 28 ++++++++++++++++++---------- + 1 file changed, 18 insertions(+), 10 deletions(-) + +--- a/drivers/md/dm-cache-policy-smq.c ++++ b/drivers/md/dm-cache-policy-smq.c +@@ -857,7 +857,13 @@ struct smq_policy { + + struct background_tracker *bg_work; + +- bool migrations_allowed; ++ bool migrations_allowed:1; ++ ++ /* ++ * If this is set the policy will try and clean the whole cache ++ * even if the device is not idle. ++ */ ++ bool cleaner:1; + }; + + /*----------------------------------------------------------------*/ +@@ -1138,7 +1144,7 @@ static bool clean_target_met(struct smq_ + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ +- if (idle) { ++ if (idle || mq->cleaner) { + /* + * We'd like to clean everything. 
+ */ +@@ -1722,11 +1728,9 @@ static void calc_hotspot_params(sector_t + *hotspot_block_size /= 2u; + } + +-static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, +- sector_t origin_size, +- sector_t cache_block_size, +- bool mimic_mq, +- bool migrations_allowed) ++static struct dm_cache_policy * ++__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, ++ bool mimic_mq, bool migrations_allowed, bool cleaner) + { + unsigned int i; + unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; +@@ -1813,6 +1817,7 @@ static struct dm_cache_policy *__smq_cre + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; ++ mq->cleaner = cleaner; + + return &mq->policy; + +@@ -1836,21 +1841,24 @@ static struct dm_cache_policy *smq_creat + sector_t origin_size, + sector_t cache_block_size) + { +- return __smq_create(cache_size, origin_size, cache_block_size, false, true); ++ return __smq_create(cache_size, origin_size, cache_block_size, ++ false, true, false); + } + + static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) + { +- return __smq_create(cache_size, origin_size, cache_block_size, true, true); ++ return __smq_create(cache_size, origin_size, cache_block_size, ++ true, true, false); + } + + static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) + { +- return __smq_create(cache_size, origin_size, cache_block_size, false, false); ++ return __smq_create(cache_size, origin_size, cache_block_size, ++ false, false, true); + } + + /*----------------------------------------------------------------*/ diff --git a/queue-6.4/drm-i915-dpt-use-shmem-for-dpt-objects.patch b/queue-6.4/drm-i915-dpt-use-shmem-for-dpt-objects.patch new file mode 100644 index 00000000000..43a76797dee --- /dev/null +++ b/queue-6.4/drm-i915-dpt-use-shmem-for-dpt-objects.patch @@ -0,0 +1,58 @@ +From 
3844ed5e78823eebb5f0f1edefc403310693d402 Mon Sep 17 00:00:00 2001 +From: Radhakrishna Sripada +Date: Tue, 18 Jul 2023 15:51:18 -0700 +Subject: drm/i915/dpt: Use shmem for dpt objects +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Radhakrishna Sripada + +commit 3844ed5e78823eebb5f0f1edefc403310693d402 upstream. + +Dpt objects that are created from internal get evicted when there is +memory pressure and do not get restored when pinned during scanout. The +pinned page table entries look corrupted and programming the display +engine with the incorrect pte's result in DE throwing pipe faults. + +Create DPT objects from shmem and mark the object as dirty when pinning so +that the object is restored when shrinker evicts an unpinned buffer object. + +v2: Unconditionally mark the dpt objects dirty during pinning(Chris). + +Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt") +Cc: # v6.0+ +Cc: Ville Syrjälä +Cc: Tvrtko Ursulin +Suggested-by: Chris Wilson +Signed-off-by: Fei Yang +Signed-off-by: Radhakrishna Sripada +Reviewed-by: Tvrtko Ursulin +Link: https://patchwork.freedesktop.org/patch/msgid/20230718225118.2562132-1-radhakrishna.sripada@intel.com +(cherry picked from commit e91a777a6e602ba0e3366e053e4e094a334a1244) +Signed-off-by: Tvrtko Ursulin +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/display/intel_dpt.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/i915/display/intel_dpt.c ++++ b/drivers/gpu/drm/i915/display/intel_dpt.c +@@ -166,6 +166,8 @@ struct i915_vma *intel_dpt_pin(struct i9 + i915_vma_get(vma); + } + ++ dpt->obj->mm.dirty = true; ++ + atomic_dec(&i915->gpu_error.pending_fb_pin); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + +@@ -261,7 +263,7 @@ intel_dpt_create(struct intel_framebuffe + dpt_obj = i915_gem_object_create_stolen(i915, size); + if (IS_ERR(dpt_obj) && !HAS_LMEM(i915)) { + drm_dbg_kms(&i915->drm, 
"Allocating dpt from smem\n"); +- dpt_obj = i915_gem_object_create_internal(i915, size); ++ dpt_obj = i915_gem_object_create_shmem(i915, size); + } + if (IS_ERR(dpt_obj)) + return ERR_CAST(dpt_obj); diff --git a/queue-6.4/mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch b/queue-6.4/mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch new file mode 100644 index 00000000000..58322abc57f --- /dev/null +++ b/queue-6.4/mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch @@ -0,0 +1,226 @@ +From b1f02b95758d05b799731d939e76a0bd6da312db Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Sat, 22 Jul 2023 00:51:07 +0200 +Subject: mm: fix memory ordering for mm_lock_seq and vm_lock_seq + +From: Jann Horn + +commit b1f02b95758d05b799731d939e76a0bd6da312db upstream. + +mm->mm_lock_seq effectively functions as a read/write lock; therefore it +must be used with acquire/release semantics. + +A specific example is the interaction between userfaultfd_register() and +lock_vma_under_rcu(). + +userfaultfd_register() does the following from the point where it changes +a VMA's flags to the point where concurrent readers are permitted again +(in a simple scenario where only a single private VMA is accessed and no +merging/splitting is involved): + +userfaultfd_register + userfaultfd_set_vm_flags + vm_flags_reset + vma_start_write + down_write(&vma->vm_lock->lock) + vma->vm_lock_seq = mm_lock_seq [marks VMA as busy] + up_write(&vma->vm_lock->lock) + vm_flags_init + [sets VM_UFFD_* in __vm_flags] + vma->vm_userfaultfd_ctx.ctx = ctx + mmap_write_unlock + vma_end_write_all + WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA] + +There are no memory barriers in between the __vm_flags update and the +mm->mm_lock_seq update that unlocks the VMA, so the unlock can be +reordered to above the `vm_flags_init()` call, which means from the +perspective of a concurrent reader, a VMA can be marked as a userfaultfd +VMA while it is not VMA-locked. 
That's bad, we definitely need a +store-release for the unlock operation. + +The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly +fine because all accesses to vma->vm_lock_seq that matter are always +protected by the VMA lock. There is a racy read in vma_start_read() +though that can tolerate false-positives, so we should be using +WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN). + +On the other side, lock_vma_under_rcu() works as follows in the relevant +region for locking and userfaultfd check: + +lock_vma_under_rcu + vma_start_read + vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout] + down_read_trylock(&vma->vm_lock->lock) + vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check] + userfaultfd_armed + checks vma->vm_flags & __VM_UFFD_FLAGS + +Here, the interesting aspect is how far down the mm->mm_lock_seq read can +be reordered - if this read is reordered down below the vma->vm_flags +access, this could cause lock_vma_under_rcu() to partly operate on +information that was read while the VMA was supposed to be locked. To +prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we +need to read it with a load-acquire. + +Some of the comment wording is based on suggestions by Suren. + +BACKPORT WARNING: One of the functions changed by this patch (which I've +written against Linus' tree) is vma_try_start_write(), but this function +no longer exists in mm/mm-everything. I don't know whether the merged +version of this patch will be ordered before or after the patch that +removes vma_try_start_write(). If you're backporting this patch to a tree +with vma_try_start_write(), make sure this patch changes that function. 
+ +Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com +Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") +Signed-off-by: Jann Horn +Reviewed-by: Suren Baghdasaryan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 29 +++++++++++++++++++++++------ + include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++ + include/linux/mmap_lock.h | 10 ++++++++-- + 3 files changed, 59 insertions(+), 8 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 2dd73e4f3d8e..406ab9ea818f 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -641,8 +641,14 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} + */ + static inline bool vma_start_read(struct vm_area_struct *vma) + { +- /* Check before locking. A race might cause false locked result. */ +- if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) ++ /* ++ * Check before locking. A race might cause false locked result. ++ * We can use READ_ONCE() for the mm_lock_seq here, and don't need ++ * ACQUIRE semantics, because this is just a lockless check whose result ++ * we don't rely on for anything - the mm_lock_seq read against which we ++ * need ordering is below. ++ */ ++ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + return false; + + if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) +@@ -653,8 +659,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma) + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * modification invalidates all existing locks. ++ * ++ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are ++ * racing with vma_end_write_all(), we only start reading from the VMA ++ * after it has been unlocked. ++ * This pairs with RELEASE semantics in vma_end_write_all(). 
+ */ +- if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { ++ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + up_read(&vma->vm_lock->lock); + return false; + } +@@ -676,7 +687,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. + */ +- *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); ++ *mm_lock_seq = vma->vm_mm->mm_lock_seq; + return (vma->vm_lock_seq == *mm_lock_seq); + } + +@@ -688,7 +699,13 @@ static inline void vma_start_write(struct vm_area_struct *vma) + return; + + down_write(&vma->vm_lock->lock); +- vma->vm_lock_seq = mm_lock_seq; ++ /* ++ * We should use WRITE_ONCE() here because we can have concurrent reads ++ * from the early lockless pessimistic check in vma_start_read(). ++ * We don't really care about the correctness of that early check, but ++ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
++ */ ++ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + up_write(&vma->vm_lock->lock); + } + +@@ -702,7 +719,7 @@ static inline bool vma_try_start_write(struct vm_area_struct *vma) + if (!down_write_trylock(&vma->vm_lock->lock)) + return false; + +- vma->vm_lock_seq = mm_lock_seq; ++ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + up_write(&vma->vm_lock->lock); + return true; + } +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index de10fc797c8e..5e74ce4a28cd 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -514,6 +514,20 @@ struct vm_area_struct { + }; + + #ifdef CONFIG_PER_VMA_LOCK ++ /* ++ * Can only be written (using WRITE_ONCE()) while holding both: ++ * - mmap_lock (in write mode) ++ * - vm_lock->lock (in write mode) ++ * Can be read reliably while holding one of: ++ * - mmap_lock (in read or write mode) ++ * - vm_lock->lock (in read or write mode) ++ * Can be read unreliably (using READ_ONCE()) for pessimistic bailout ++ * while holding nothing (except RCU to keep the VMA struct allocated). ++ * ++ * This sequence counter is explicitly allowed to overflow; sequence ++ * counter reuse can only lead to occasional unnecessary use of the ++ * slowpath. ++ */ + int vm_lock_seq; + struct vma_lock *vm_lock; + +@@ -679,6 +693,20 @@ struct mm_struct { + * by mmlist_lock + */ + #ifdef CONFIG_PER_VMA_LOCK ++ /* ++ * This field has lock-like semantics, meaning it is sometimes ++ * accessed with ACQUIRE/RELEASE semantics. ++ * Roughly speaking, incrementing the sequence number is ++ * equivalent to releasing locks on VMAs; reading the sequence ++ * number can be part of taking a read lock on a VMA. ++ * ++ * Can be modified under write mmap_lock using RELEASE ++ * semantics. ++ * Can be read with no other protection when holding write ++ * mmap_lock. ++ * Can be read with ACQUIRE semantics if not holding write ++ * mmap_lock. 
++ */ + int mm_lock_seq; + #endif + +diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h +index aab8f1b28d26..e05e167dbd16 100644 +--- a/include/linux/mmap_lock.h ++++ b/include/linux/mmap_lock.h +@@ -76,8 +76,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) + static inline void vma_end_write_all(struct mm_struct *mm) + { + mmap_assert_write_locked(mm); +- /* No races during update due to exclusive mmap_lock being held */ +- WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); ++ /* ++ * Nobody can concurrently modify mm->mm_lock_seq due to exclusive ++ * mmap_lock being held. ++ * We need RELEASE semantics here to ensure that preceding stores into ++ * the VMA take effect before we unlock it with this store. ++ * Pairs with ACQUIRE semantics in vma_start_read(). ++ */ ++ smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + } + #else + static inline void vma_end_write_all(struct mm_struct *mm) {} +-- +2.41.0 + diff --git a/queue-6.4/mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch b/queue-6.4/mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch new file mode 100644 index 00000000000..aa85a039f81 --- /dev/null +++ b/queue-6.4/mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch @@ -0,0 +1,59 @@ +From d8ab9f7b644a2c9b64de405c1953c905ff219dc9 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Fri, 21 Jul 2023 05:46:43 +0200 +Subject: mm: lock VMA in dup_anon_vma() before setting ->anon_vma + +From: Jann Horn + +commit d8ab9f7b644a2c9b64de405c1953c905ff219dc9 upstream. + +When VMAs are merged, dup_anon_vma() is called with `dst` pointing to the +VMA that is being expanded to cover the area previously occupied by +another VMA. This currently happens while `dst` is not write-locked. + +This means that, in the `src->anon_vma && !dst->anon_vma` case, as soon as +the assignment `dst->anon_vma = src->anon_vma` has happened, concurrent +page faults can happen on `dst` under the per-VMA lock. 
This is already +icky in itself, since such page faults can now install pages into `dst` +that are attached to an `anon_vma` that is not yet tied back to the +`anon_vma` with an `anon_vma_chain`. But if `anon_vma_clone()` fails due +to an out-of-memory error, things get much worse: `anon_vma_clone()` then +reverts `dst->anon_vma` back to NULL, and `dst` remains completely +unconnected to the `anon_vma`, even though we can have pages in the area +covered by `dst` that point to the `anon_vma`. + +This means the `anon_vma` of such pages can be freed while the pages are +still mapped into userspace, which leads to UAF when a helper like +folio_lock_anon_vma_read() tries to look up the anon_vma of such a page. + +This theoretically is a security bug, but I believe it is really hard to +actually trigger as an unprivileged user because it requires that you can +make an order-0 GFP_KERNEL allocation fail, and the page allocator tries +pretty hard to prevent that. + +I think doing the vma_start_write() call inside dup_anon_vma() is the most +straightforward fix for now. + +For a kernel-assisted reproducer, see the notes section of the patch mail. + +Link: https://lkml.kernel.org/r/20230721034643.616851-1-jannh@google.com +Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") +Signed-off-by: Jann Horn +Reviewed-by: Suren Baghdasaryan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/mmap.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -647,6 +647,7 @@ static inline int dup_anon_vma(struct vm + * anon pages imported. 
+ */ + if (src->anon_vma && !dst->anon_vma) { ++ vma_start_write(dst); + dst->anon_vma = src->anon_vma; + return anon_vma_clone(dst, src); + } diff --git a/queue-6.4/mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch b/queue-6.4/mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch new file mode 100644 index 00000000000..86f20d33614 --- /dev/null +++ b/queue-6.4/mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch @@ -0,0 +1,45 @@ +From 6c54312f9689fbe27c70db5d42eebd29d04b672e Mon Sep 17 00:00:00 2001 +From: Sidhartha Kumar +Date: Mon, 17 Jul 2023 11:18:12 -0700 +Subject: mm/memory-failure: fix hardware poison check in unpoison_memory() + +From: Sidhartha Kumar + +commit 6c54312f9689fbe27c70db5d42eebd29d04b672e upstream. + +It was pointed out[1] that using folio_test_hwpoison() is wrong as we need +to check the individual page that has poison. folio_test_hwpoison() only +checks the head page so go back to using PageHWPoison(). + +User-visible effects include existing hwpoison-inject tests possibly +failing as unpoisoning a single subpage could lead to unpoisoning an +entire folio. Memory unpoisoning could also not work as expected as +the function will break early due to only checking the head page and +not the actually poisoned subpage. 
+ +[1]: https://lore.kernel.org/lkml/ZLIbZygG7LqSI9xe@casper.infradead.org/ + +Link: https://lkml.kernel.org/r/20230717181812.167757-1-sidhartha.kumar@oracle.com +Fixes: a6fddef49eef ("mm/memory-failure: convert unpoison_memory() to folios") +Signed-off-by: Sidhartha Kumar +Reported-by: Matthew Wilcox (Oracle) +Acked-by: Naoya Horiguchi +Reviewed-by: Miaohe Lin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory-failure.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -2490,7 +2490,7 @@ int unpoison_memory(unsigned long pfn) + goto unlock_mutex; + } + +- if (!folio_test_hwpoison(folio)) { ++ if (!PageHWPoison(p)) { + unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", + pfn, &unpoison_rs); + goto unlock_mutex; diff --git a/queue-6.4/mm-mempolicy-take-vma-lock-before-replacing-policy.patch b/queue-6.4/mm-mempolicy-take-vma-lock-before-replacing-policy.patch new file mode 100644 index 00000000000..7c75a0c2e58 --- /dev/null +++ b/queue-6.4/mm-mempolicy-take-vma-lock-before-replacing-policy.patch @@ -0,0 +1,78 @@ +From 6c21e066f9256ea1df6f88768f6ae1080b7cf509 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Fri, 28 Jul 2023 06:13:21 +0200 +Subject: mm/mempolicy: Take VMA lock before replacing policy + +From: Jann Horn + +commit 6c21e066f9256ea1df6f88768f6ae1080b7cf509 upstream. + +mbind() calls down into vma_replace_policy() without taking the per-VMA +locks, replaces the VMA's vma->vm_policy pointer, and frees the old +policy. That's bad; a concurrent page fault might still be using the +old policy (in vma_alloc_folio()), resulting in use-after-free. + +Normally this will manifest as a use-after-free read first, but it can +result in memory corruption, including because vma_alloc_folio() can +call mpol_cond_put() on the freed policy, which conditionally changes +the policy's refcount member. 
+ +This bug is specific to CONFIG_NUMA, but it does also affect non-NUMA +systems as long as the kernel was built with CONFIG_NUMA. + +Signed-off-by: Jann Horn +Reviewed-by: Suren Baghdasaryan +Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") +Cc: stable@kernel.org +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/mempolicy.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -384,8 +384,10 @@ void mpol_rebind_mm(struct mm_struct *mm + VMA_ITERATOR(vmi, mm, 0); + + mmap_write_lock(mm); +- for_each_vma(vmi, vma) ++ for_each_vma(vmi, vma) { ++ vma_start_write(vma); + mpol_rebind_policy(vma->vm_policy, new); ++ } + mmap_write_unlock(mm); + } + +@@ -765,6 +767,8 @@ static int vma_replace_policy(struct vm_ + struct mempolicy *old; + struct mempolicy *new; + ++ vma_assert_write_locked(vma); ++ + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, +@@ -1313,6 +1317,14 @@ static long do_mbind(unsigned long start + if (err) + goto mpol_out; + ++ /* ++ * Lock the VMAs before scanning for pages to migrate, to ensure we don't ++ * miss a concurrently inserted page. 
++ */ ++ vma_iter_init(&vmi, mm, start); ++ for_each_vma_range(vmi, vma, end) ++ vma_start_write(vma); ++ + ret = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + +@@ -1538,6 +1550,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, + break; + } + ++ vma_start_write(vma); + new->home_node = home_node; + err = mbind_range(&vmi, vma, &prev, start, end, new); + mpol_put(new); diff --git a/queue-6.4/rbd-harden-get_lock_owner_info-a-bit.patch b/queue-6.4/rbd-harden-get_lock_owner_info-a-bit.patch new file mode 100644 index 00000000000..46ae86237a0 --- /dev/null +++ b/queue-6.4/rbd-harden-get_lock_owner_info-a-bit.patch @@ -0,0 +1,79 @@ +From 8ff2c64c9765446c3cef804fb99da04916603e27 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Sat, 8 Jul 2023 16:16:59 +0200 +Subject: rbd: harden get_lock_owner_info() a bit + +From: Ilya Dryomov + +commit 8ff2c64c9765446c3cef804fb99da04916603e27 upstream. + +- we want the exclusive lock type, so test for it directly +- use sscanf() to actually parse the lock cookie and avoid admitting + invalid handles +- bail if locker has a blank address + +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 21 +++++++++++++++------ + net/ceph/messenger.c | 1 + + 2 files changed, 16 insertions(+), 6 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -3862,10 +3862,9 @@ static struct ceph_locker *get_lock_owne + u32 num_lockers; + u8 lock_type; + char *lock_tag; ++ u64 handle; + int ret; + +- dout("%s rbd_dev %p\n", __func__, rbd_dev); +- + ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, + &lock_type, &lock_tag, &lockers, &num_lockers); +@@ -3886,18 +3885,28 @@ static struct ceph_locker *get_lock_owne + goto err_busy; + } + +- if (lock_type == CEPH_CLS_LOCK_SHARED) { +- rbd_warn(rbd_dev, "shared lock type detected"); ++ if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) { ++ rbd_warn(rbd_dev, 
"incompatible lock type detected"); + goto err_busy; + } + + WARN_ON(num_lockers != 1); +- if (strncmp(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, +- strlen(RBD_LOCK_COOKIE_PREFIX))) { ++ ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", ++ &handle); ++ if (ret != 1) { + rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", + lockers[0].id.cookie); + goto err_busy; + } ++ if (ceph_addr_is_blank(&lockers[0].info.addr)) { ++ rbd_warn(rbd_dev, "locker has a blank address"); ++ goto err_busy; ++ } ++ ++ dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n", ++ __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name), ++ &lockers[0].info.addr.in_addr, ++ le32_to_cpu(lockers[0].info.addr.nonce), handle); + + out: + kfree(lock_tag); +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -1123,6 +1123,7 @@ bool ceph_addr_is_blank(const struct cep + return true; + } + } ++EXPORT_SYMBOL(ceph_addr_is_blank); + + int ceph_addr_port(const struct ceph_entity_addr *addr) + { diff --git a/queue-6.4/rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch b/queue-6.4/rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch new file mode 100644 index 00000000000..b130fd94672 --- /dev/null +++ b/queue-6.4/rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch @@ -0,0 +1,176 @@ +From f38cb9d9c2045dad16eead4a2e1aedfddd94603b Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Fri, 30 Jun 2023 13:52:13 +0200 +Subject: rbd: make get_lock_owner_info() return a single locker or NULL + +From: Ilya Dryomov + +commit f38cb9d9c2045dad16eead4a2e1aedfddd94603b upstream. + +Make the "num_lockers can be only 0 or 1" assumption explicit and +simplify the API by getting rid of output parameters in preparation +for calling get_lock_owner_info() twice before blocklisting. 
+ +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 84 +++++++++++++++++++++++++++++++--------------------- + 1 file changed, 51 insertions(+), 33 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -3849,10 +3849,17 @@ static void wake_lock_waiters(struct rbd + list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); + } + +-static int get_lock_owner_info(struct rbd_device *rbd_dev, +- struct ceph_locker **lockers, u32 *num_lockers) ++static void free_locker(struct ceph_locker *locker) ++{ ++ if (locker) ++ ceph_free_lockers(locker, 1); ++} ++ ++static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev) + { + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; ++ struct ceph_locker *lockers; ++ u32 num_lockers; + u8 lock_type; + char *lock_tag; + int ret; +@@ -3861,39 +3868,45 @@ static int get_lock_owner_info(struct rb + + ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, +- &lock_type, &lock_tag, lockers, num_lockers); +- if (ret) +- return ret; ++ &lock_type, &lock_tag, &lockers, &num_lockers); ++ if (ret) { ++ rbd_warn(rbd_dev, "failed to retrieve lockers: %d", ret); ++ return ERR_PTR(ret); ++ } + +- if (*num_lockers == 0) { ++ if (num_lockers == 0) { + dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); ++ lockers = NULL; + goto out; + } + + if (strcmp(lock_tag, RBD_LOCK_TAG)) { + rbd_warn(rbd_dev, "locked by external mechanism, tag %s", + lock_tag); +- ret = -EBUSY; +- goto out; ++ goto err_busy; + } + + if (lock_type == CEPH_CLS_LOCK_SHARED) { + rbd_warn(rbd_dev, "shared lock type detected"); +- ret = -EBUSY; +- goto out; ++ goto err_busy; + } + +- if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, ++ WARN_ON(num_lockers != 1); ++ if (strncmp(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, + strlen(RBD_LOCK_COOKIE_PREFIX))) { + rbd_warn(rbd_dev, "locked by 
external mechanism, cookie %s", +- (*lockers)[0].id.cookie); +- ret = -EBUSY; +- goto out; ++ lockers[0].id.cookie); ++ goto err_busy; + } + + out: + kfree(lock_tag); +- return ret; ++ return lockers; ++ ++err_busy: ++ kfree(lock_tag); ++ ceph_free_lockers(lockers, num_lockers); ++ return ERR_PTR(-EBUSY); + } + + static int find_watcher(struct rbd_device *rbd_dev, +@@ -3947,51 +3960,56 @@ out: + static int rbd_try_lock(struct rbd_device *rbd_dev) + { + struct ceph_client *client = rbd_dev->rbd_client->client; +- struct ceph_locker *lockers; +- u32 num_lockers; ++ struct ceph_locker *locker; + int ret; + + for (;;) { ++ locker = NULL; ++ + ret = rbd_lock(rbd_dev); + if (ret != -EBUSY) +- return ret; ++ goto out; + + /* determine if the current lock holder is still alive */ +- ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); +- if (ret) +- return ret; +- +- if (num_lockers == 0) ++ locker = get_lock_owner_info(rbd_dev); ++ if (IS_ERR(locker)) { ++ ret = PTR_ERR(locker); ++ locker = NULL; ++ goto out; ++ } ++ if (!locker) + goto again; + +- ret = find_watcher(rbd_dev, lockers); ++ ret = find_watcher(rbd_dev, locker); + if (ret) + goto out; /* request lock or error */ + + rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", +- ENTITY_NAME(lockers[0].id.name)); ++ ENTITY_NAME(locker->id.name)); + + ret = ceph_monc_blocklist_add(&client->monc, +- &lockers[0].info.addr); ++ &locker->info.addr); + if (ret) { +- rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d", +- ENTITY_NAME(lockers[0].id.name), ret); ++ rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d", ++ ENTITY_NAME(locker->id.name), ret); + goto out; + } + + ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, +- lockers[0].id.cookie, +- &lockers[0].id.name); +- if (ret && ret != -ENOENT) ++ locker->id.cookie, &locker->id.name); ++ if (ret && ret != -ENOENT) { ++ rbd_warn(rbd_dev, "failed to break header lock: %d", ++ ret); + goto out; ++ } + + again: 
+- ceph_free_lockers(lockers, num_lockers); ++ free_locker(locker); + } + + out: +- ceph_free_lockers(lockers, num_lockers); ++ free_locker(locker); + return ret; + } + diff --git a/queue-6.4/rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch b/queue-6.4/rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch new file mode 100644 index 00000000000..f7338754c12 --- /dev/null +++ b/queue-6.4/rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch @@ -0,0 +1,99 @@ +From 588159009d5b7a09c3e5904cffddbe4a4e170301 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Sat, 22 Jul 2023 20:28:08 +0200 +Subject: rbd: retrieve and check lock owner twice before blocklisting + +From: Ilya Dryomov + +commit 588159009d5b7a09c3e5904cffddbe4a4e170301 upstream. + +An attempt to acquire exclusive lock can race with the current lock +owner closing the image: + +1. lock is held by client123, rbd_lock() returns -EBUSY +2. get_lock_owner_info() returns client123 instance details +3. client123 closes the image, lock is released +4. find_watcher() returns 0 as there is no matching watcher anymore +5. client123 instance gets erroneously blocklisted + +Particularly impacted is mirror snapshot scheduler in snapshot-based +mirroring since it happens to open and close images a lot (images are +opened only for as long as it takes to take the next mirror snapshot, +the same client instance is used for all images). + +To reduce the potential for erroneous blocklisting, retrieve the lock +owner again after find_watcher() returns 0. If it's still there, make +sure it matches the previously detected lock owner. 
+ +Cc: stable@vger.kernel.org # f38cb9d9c204: rbd: make get_lock_owner_info() return a single locker or NULL +Cc: stable@vger.kernel.org # 8ff2c64c9765: rbd: harden get_lock_owner_info() a bit +Cc: stable@vger.kernel.org +Signed-off-by: Ilya Dryomov +Reviewed-by: Dongsheng Yang +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 25 +++++++++++++++++++++++-- + 1 file changed, 23 insertions(+), 2 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -3849,6 +3849,15 @@ static void wake_lock_waiters(struct rbd + list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); + } + ++static bool locker_equal(const struct ceph_locker *lhs, ++ const struct ceph_locker *rhs) ++{ ++ return lhs->id.name.type == rhs->id.name.type && ++ lhs->id.name.num == rhs->id.name.num && ++ !strcmp(lhs->id.cookie, rhs->id.cookie) && ++ ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr); ++} ++ + static void free_locker(struct ceph_locker *locker) + { + if (locker) +@@ -3969,11 +3978,11 @@ out: + static int rbd_try_lock(struct rbd_device *rbd_dev) + { + struct ceph_client *client = rbd_dev->rbd_client->client; +- struct ceph_locker *locker; ++ struct ceph_locker *locker, *refreshed_locker; + int ret; + + for (;;) { +- locker = NULL; ++ locker = refreshed_locker = NULL; + + ret = rbd_lock(rbd_dev); + if (ret != -EBUSY) +@@ -3993,6 +4002,16 @@ static int rbd_try_lock(struct rbd_devic + if (ret) + goto out; /* request lock or error */ + ++ refreshed_locker = get_lock_owner_info(rbd_dev); ++ if (IS_ERR(refreshed_locker)) { ++ ret = PTR_ERR(refreshed_locker); ++ refreshed_locker = NULL; ++ goto out; ++ } ++ if (!refreshed_locker || ++ !locker_equal(locker, refreshed_locker)) ++ goto again; ++ + rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", + ENTITY_NAME(locker->id.name)); + +@@ -4014,10 +4033,12 @@ static int rbd_try_lock(struct rbd_devic + } + + again: ++ free_locker(refreshed_locker); + free_locker(locker); + } + + out: ++ 
free_locker(refreshed_locker); + free_locker(locker); + return ret; + } diff --git a/queue-6.4/series b/queue-6.4/series index 953ecc6c4e0..a935156a750 100644 --- a/queue-6.4/series +++ b/queue-6.4/series @@ -225,3 +225,13 @@ asoc-wm8904-fill-the-cache-for-wm8904_adc_test_0-register.patch arm64-sme-set-new-vector-length-before-reallocating.patch pm-sleep-wakeirq-fix-wake-irq-arming.patch thermal-of-fix-double-free-on-unregistration.patch +ceph-never-send-metrics-if-disable_send_metrics-is-set.patch +drm-i915-dpt-use-shmem-for-dpt-objects.patch +dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch +rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch +rbd-harden-get_lock_owner_info-a-bit.patch +rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch +mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch +mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch +mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch +mm-mempolicy-take-vma-lock-before-replacing-policy.patch