--- /dev/null
+From 50164507f6b7b7ed85d8c3ac0266849fbd908db7 Mon Sep 17 00:00:00 2001
+From: Xiubo Li <xiubli@redhat.com>
+Date: Thu, 20 Jul 2023 11:33:55 +0800
+Subject: ceph: never send metrics if disable_send_metrics is set
+
+From: Xiubo Li <xiubli@redhat.com>
+
+commit 50164507f6b7b7ed85d8c3ac0266849fbd908db7 upstream.
+
+Even when 'disable_send_metrics' is true, opening a session
+always triggers the metrics to be sent for the first time, so
+also check the flag in the delayed work before sending.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Xiubo Li <xiubli@redhat.com>
+Reviewed-by: Venky Shankar <vshankar@redhat.com>
+Reviewed-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/metric.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ceph/metric.c
++++ b/fs/ceph/metric.c
+@@ -208,7 +208,7 @@ static void metric_delayed_work(struct w
+ struct ceph_mds_client *mdsc =
+ container_of(m, struct ceph_mds_client, metric);
+
+- if (mdsc->stopping)
++ if (mdsc->stopping || disable_send_metrics)
+ return;
+
+ if (!m->session || !check_session_state(m->session)) {
--- /dev/null
+From 1e4ab7b4c881cf26c1c72b3f56519e03475486fb Mon Sep 17 00:00:00 2001
+From: Joe Thornber <ejt@redhat.com>
+Date: Tue, 25 Jul 2023 11:44:41 -0400
+Subject: dm cache policy smq: ensure IO doesn't prevent cleaner policy progress
+
+From: Joe Thornber <ejt@redhat.com>
+
+commit 1e4ab7b4c881cf26c1c72b3f56519e03475486fb upstream.
+
+When using the cleaner policy to decommission the cache, there is
+never any writeback started from the cache as it is constantly delayed
+due to normal I/O keeping the device busy, meaning @idle=false was
+always being passed to clean_target_met().
+
+Fix this by adding a specific 'cleaner' flag that is set when the
+cleaner policy is configured. This flag serves to always allow the
+cleaner's writeback work to be queued until the cache is
+decommissioned (even if the cache isn't idle).
+
+Reported-by: David Jeffery <djeffery@redhat.com>
+Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joe Thornber <ejt@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-cache-policy-smq.c | 28 ++++++++++++++++++----------
+ 1 file changed, 18 insertions(+), 10 deletions(-)
+
+--- a/drivers/md/dm-cache-policy-smq.c
++++ b/drivers/md/dm-cache-policy-smq.c
+@@ -857,7 +857,13 @@ struct smq_policy {
+
+ struct background_tracker *bg_work;
+
+- bool migrations_allowed;
++ bool migrations_allowed:1;
++
++ /*
++ * If this is set the policy will try and clean the whole cache
++ * even if the device is not idle.
++ */
++ bool cleaner:1;
+ };
+
+ /*----------------------------------------------------------------*/
+@@ -1138,7 +1144,7 @@ static bool clean_target_met(struct smq_
+ * Cache entries may not be populated. So we cannot rely on the
+ * size of the clean queue.
+ */
+- if (idle) {
++ if (idle || mq->cleaner) {
+ /*
+ * We'd like to clean everything.
+ */
+@@ -1722,11 +1728,9 @@ static void calc_hotspot_params(sector_t
+ *hotspot_block_size /= 2u;
+ }
+
+-static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
+- sector_t origin_size,
+- sector_t cache_block_size,
+- bool mimic_mq,
+- bool migrations_allowed)
++static struct dm_cache_policy *
++__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
++ bool mimic_mq, bool migrations_allowed, bool cleaner)
+ {
+ unsigned int i;
+ unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+@@ -1813,6 +1817,7 @@ static struct dm_cache_policy *__smq_cre
+ goto bad_btracker;
+
+ mq->migrations_allowed = migrations_allowed;
++ mq->cleaner = cleaner;
+
+ return &mq->policy;
+
+@@ -1836,21 +1841,24 @@ static struct dm_cache_policy *smq_creat
+ sector_t origin_size,
+ sector_t cache_block_size)
+ {
+- return __smq_create(cache_size, origin_size, cache_block_size, false, true);
++ return __smq_create(cache_size, origin_size, cache_block_size,
++ false, true, false);
+ }
+
+ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+ {
+- return __smq_create(cache_size, origin_size, cache_block_size, true, true);
++ return __smq_create(cache_size, origin_size, cache_block_size,
++ true, true, false);
+ }
+
+ static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+ {
+- return __smq_create(cache_size, origin_size, cache_block_size, false, false);
++ return __smq_create(cache_size, origin_size, cache_block_size,
++ false, false, true);
+ }
+
+ /*----------------------------------------------------------------*/
--- /dev/null
+From 3844ed5e78823eebb5f0f1edefc403310693d402 Mon Sep 17 00:00:00 2001
+From: Radhakrishna Sripada <radhakrishna.sripada@intel.com>
+Date: Tue, 18 Jul 2023 15:51:18 -0700
+Subject: drm/i915/dpt: Use shmem for dpt objects
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Radhakrishna Sripada <radhakrishna.sripada@intel.com>
+
+commit 3844ed5e78823eebb5f0f1edefc403310693d402 upstream.
+
+Dpt objects that are created as internal objects get evicted when there
+is memory pressure and do not get restored when pinned during scanout.
+The pinned page table entries look corrupted, and programming the display
+engine with the incorrect PTEs results in the DE throwing pipe faults.
+
+Create DPT objects from shmem and mark the object as dirty when pinning so
+that the object is restored when shrinker evicts an unpinned buffer object.
+
+v2: Unconditionally mark the dpt objects dirty during pinning (Chris).
+
+Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt")
+Cc: <stable@vger.kernel.org> # v6.0+
+Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
+Suggested-by: Chris Wilson <chris.p.wilson@intel.com>
+Signed-off-by: Fei Yang <fei.yang@intel.com>
+Signed-off-by: Radhakrishna Sripada <radhakrishna.sripada@intel.com>
+Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20230718225118.2562132-1-radhakrishna.sripada@intel.com
+(cherry picked from commit e91a777a6e602ba0e3366e053e4e094a334a1244)
+Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/display/intel_dpt.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/display/intel_dpt.c
++++ b/drivers/gpu/drm/i915/display/intel_dpt.c
+@@ -166,6 +166,8 @@ struct i915_vma *intel_dpt_pin(struct i9
+ i915_vma_get(vma);
+ }
+
++ dpt->obj->mm.dirty = true;
++
+ atomic_dec(&i915->gpu_error.pending_fb_pin);
+ intel_runtime_pm_put(&i915->runtime_pm, wakeref);
+
+@@ -261,7 +263,7 @@ intel_dpt_create(struct intel_framebuffe
+ dpt_obj = i915_gem_object_create_stolen(i915, size);
+ if (IS_ERR(dpt_obj) && !HAS_LMEM(i915)) {
+ drm_dbg_kms(&i915->drm, "Allocating dpt from smem\n");
+- dpt_obj = i915_gem_object_create_internal(i915, size);
++ dpt_obj = i915_gem_object_create_shmem(i915, size);
+ }
+ if (IS_ERR(dpt_obj))
+ return ERR_CAST(dpt_obj);
--- /dev/null
+From b1f02b95758d05b799731d939e76a0bd6da312db Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Sat, 22 Jul 2023 00:51:07 +0200
+Subject: mm: fix memory ordering for mm_lock_seq and vm_lock_seq
+
+From: Jann Horn <jannh@google.com>
+
+commit b1f02b95758d05b799731d939e76a0bd6da312db upstream.
+
+mm->mm_lock_seq effectively functions as a read/write lock; therefore it
+must be used with acquire/release semantics.
+
+A specific example is the interaction between userfaultfd_register() and
+lock_vma_under_rcu().
+
+userfaultfd_register() does the following from the point where it changes
+a VMA's flags to the point where concurrent readers are permitted again
+(in a simple scenario where only a single private VMA is accessed and no
+merging/splitting is involved):
+
+userfaultfd_register
+ userfaultfd_set_vm_flags
+ vm_flags_reset
+ vma_start_write
+ down_write(&vma->vm_lock->lock)
+ vma->vm_lock_seq = mm_lock_seq [marks VMA as busy]
+ up_write(&vma->vm_lock->lock)
+ vm_flags_init
+ [sets VM_UFFD_* in __vm_flags]
+ vma->vm_userfaultfd_ctx.ctx = ctx
+ mmap_write_unlock
+ vma_end_write_all
+ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA]
+
+There are no memory barriers in between the __vm_flags update and the
+mm->mm_lock_seq update that unlocks the VMA, so the unlock can be
+reordered to above the `vm_flags_init()` call, which means from the
+perspective of a concurrent reader, a VMA can be marked as a userfaultfd
+VMA while it is not VMA-locked. That's bad, we definitely need a
+store-release for the unlock operation.
+
+The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly
+fine because all accesses to vma->vm_lock_seq that matter are always
+protected by the VMA lock. There is a racy read in vma_start_read()
+though that can tolerate false-positives, so we should be using
+WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN).
+
+On the other side, lock_vma_under_rcu() works as follows in the relevant
+region for locking and userfaultfd check:
+
+lock_vma_under_rcu
+ vma_start_read
+ vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout]
+ down_read_trylock(&vma->vm_lock->lock)
+ vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check]
+ userfaultfd_armed
+ checks vma->vm_flags & __VM_UFFD_FLAGS
+
+Here, the interesting aspect is how far down the mm->mm_lock_seq read can
+be reordered - if this read is reordered down below the vma->vm_flags
+access, this could cause lock_vma_under_rcu() to partly operate on
+information that was read while the VMA was supposed to be locked. To
+prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we
+need to read it with a load-acquire.
+
+Some of the comment wording is based on suggestions by Suren.
+
+BACKPORT WARNING: One of the functions changed by this patch (which I've
+written against Linus' tree) is vma_try_start_write(), but this function
+no longer exists in mm/mm-everything. I don't know whether the merged
+version of this patch will be ordered before or after the patch that
+removes vma_try_start_write(). If you're backporting this patch to a tree
+with vma_try_start_write(), make sure this patch changes that function.
+
+Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com
+Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
+Signed-off-by: Jann Horn <jannh@google.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 29 +++++++++++++++++++++++------
+ include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++
+ include/linux/mmap_lock.h | 10 ++++++++--
+ 3 files changed, 59 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 2dd73e4f3d8e..406ab9ea818f 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -641,8 +641,14 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+ */
+ static inline bool vma_start_read(struct vm_area_struct *vma)
+ {
+- /* Check before locking. A race might cause false locked result. */
+- if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
++ /*
++ * Check before locking. A race might cause false locked result.
++ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
++ * ACQUIRE semantics, because this is just a lockless check whose result
++ * we don't rely on for anything - the mm_lock_seq read against which we
++ * need ordering is below.
++ */
++ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return false;
+
+ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+@@ -653,8 +659,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
++ *
++ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
++ * racing with vma_end_write_all(), we only start reading from the VMA
++ * after it has been unlocked.
++ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+- if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
++ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
+ up_read(&vma->vm_lock->lock);
+ return false;
+ }
+@@ -676,7 +687,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+- *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
++ *mm_lock_seq = vma->vm_mm->mm_lock_seq;
+ return (vma->vm_lock_seq == *mm_lock_seq);
+ }
+
+@@ -688,7 +699,13 @@ static inline void vma_start_write(struct vm_area_struct *vma)
+ return;
+
+ down_write(&vma->vm_lock->lock);
+- vma->vm_lock_seq = mm_lock_seq;
++ /*
++ * We should use WRITE_ONCE() here because we can have concurrent reads
++ * from the early lockless pessimistic check in vma_start_read().
++ * We don't really care about the correctness of that early check, but
++ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
++ */
++ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+ up_write(&vma->vm_lock->lock);
+ }
+
+@@ -702,7 +719,7 @@ static inline bool vma_try_start_write(struct vm_area_struct *vma)
+ if (!down_write_trylock(&vma->vm_lock->lock))
+ return false;
+
+- vma->vm_lock_seq = mm_lock_seq;
++ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+ up_write(&vma->vm_lock->lock);
+ return true;
+ }
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index de10fc797c8e..5e74ce4a28cd 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -514,6 +514,20 @@ struct vm_area_struct {
+ };
+
+ #ifdef CONFIG_PER_VMA_LOCK
++ /*
++ * Can only be written (using WRITE_ONCE()) while holding both:
++ * - mmap_lock (in write mode)
++ * - vm_lock->lock (in write mode)
++ * Can be read reliably while holding one of:
++ * - mmap_lock (in read or write mode)
++ * - vm_lock->lock (in read or write mode)
++ * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
++ * while holding nothing (except RCU to keep the VMA struct allocated).
++ *
++ * This sequence counter is explicitly allowed to overflow; sequence
++ * counter reuse can only lead to occasional unnecessary use of the
++ * slowpath.
++ */
+ int vm_lock_seq;
+ struct vma_lock *vm_lock;
+
+@@ -679,6 +693,20 @@ struct mm_struct {
+ * by mmlist_lock
+ */
+ #ifdef CONFIG_PER_VMA_LOCK
++ /*
++ * This field has lock-like semantics, meaning it is sometimes
++ * accessed with ACQUIRE/RELEASE semantics.
++ * Roughly speaking, incrementing the sequence number is
++ * equivalent to releasing locks on VMAs; reading the sequence
++ * number can be part of taking a read lock on a VMA.
++ *
++ * Can be modified under write mmap_lock using RELEASE
++ * semantics.
++ * Can be read with no other protection when holding write
++ * mmap_lock.
++ * Can be read with ACQUIRE semantics if not holding write
++ * mmap_lock.
++ */
+ int mm_lock_seq;
+ #endif
+
+diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
+index aab8f1b28d26..e05e167dbd16 100644
+--- a/include/linux/mmap_lock.h
++++ b/include/linux/mmap_lock.h
+@@ -76,8 +76,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
+ static inline void vma_end_write_all(struct mm_struct *mm)
+ {
+ mmap_assert_write_locked(mm);
+- /* No races during update due to exclusive mmap_lock being held */
+- WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
++ /*
++ * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
++ * mmap_lock being held.
++ * We need RELEASE semantics here to ensure that preceding stores into
++ * the VMA take effect before we unlock it with this store.
++ * Pairs with ACQUIRE semantics in vma_start_read().
++ */
++ smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
+ }
+ #else
+ static inline void vma_end_write_all(struct mm_struct *mm) {}
+--
+2.41.0
+
--- /dev/null
+From d8ab9f7b644a2c9b64de405c1953c905ff219dc9 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Fri, 21 Jul 2023 05:46:43 +0200
+Subject: mm: lock VMA in dup_anon_vma() before setting ->anon_vma
+
+From: Jann Horn <jannh@google.com>
+
+commit d8ab9f7b644a2c9b64de405c1953c905ff219dc9 upstream.
+
+When VMAs are merged, dup_anon_vma() is called with `dst` pointing to the
+VMA that is being expanded to cover the area previously occupied by
+another VMA. This currently happens while `dst` is not write-locked.
+
+This means that, in the `src->anon_vma && !dst->anon_vma` case, as soon as
+the assignment `dst->anon_vma = src->anon_vma` has happened, concurrent
+page faults can happen on `dst` under the per-VMA lock. This is already
+icky in itself, since such page faults can now install pages into `dst`
+that are attached to an `anon_vma` that is not yet tied back to the
+`anon_vma` with an `anon_vma_chain`. But if `anon_vma_clone()` fails due
+to an out-of-memory error, things get much worse: `anon_vma_clone()` then
+reverts `dst->anon_vma` back to NULL, and `dst` remains completely
+unconnected to the `anon_vma`, even though we can have pages in the area
+covered by `dst` that point to the `anon_vma`.
+
+This means the `anon_vma` of such pages can be freed while the pages are
+still mapped into userspace, which leads to UAF when a helper like
+folio_lock_anon_vma_read() tries to look up the anon_vma of such a page.
+
+This theoretically is a security bug, but I believe it is really hard to
+actually trigger as an unprivileged user because it requires that you can
+make an order-0 GFP_KERNEL allocation fail, and the page allocator tries
+pretty hard to prevent that.
+
+I think doing the vma_start_write() call inside dup_anon_vma() is the most
+straightforward fix for now.
+
+For a kernel-assisted reproducer, see the notes section of the patch mail.
+
+Link: https://lkml.kernel.org/r/20230721034643.616851-1-jannh@google.com
+Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
+Signed-off-by: Jann Horn <jannh@google.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mmap.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -647,6 +647,7 @@ static inline int dup_anon_vma(struct vm
+ * anon pages imported.
+ */
+ if (src->anon_vma && !dst->anon_vma) {
++ vma_start_write(dst);
+ dst->anon_vma = src->anon_vma;
+ return anon_vma_clone(dst, src);
+ }
--- /dev/null
+From 6c54312f9689fbe27c70db5d42eebd29d04b672e Mon Sep 17 00:00:00 2001
+From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Date: Mon, 17 Jul 2023 11:18:12 -0700
+Subject: mm/memory-failure: fix hardware poison check in unpoison_memory()
+
+From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+
+commit 6c54312f9689fbe27c70db5d42eebd29d04b672e upstream.
+
+It was pointed out[1] that using folio_test_hwpoison() is wrong as we need
+to check the individual page that has poison. folio_test_hwpoison() only
+checks the head page so go back to using PageHWPoison().
+
+User-visible effects include existing hwpoison-inject tests possibly
+failing as unpoisoning a single subpage could lead to unpoisoning an
+entire folio. Memory unpoisoning could also not work as expected as
+the function will break early due to only checking the head page and
+not the actually poisoned subpage.
+
+[1]: https://lore.kernel.org/lkml/ZLIbZygG7LqSI9xe@casper.infradead.org/
+
+Link: https://lkml.kernel.org/r/20230717181812.167757-1-sidhartha.kumar@oracle.com
+Fixes: a6fddef49eef ("mm/memory-failure: convert unpoison_memory() to folios")
+Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Reported-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory-failure.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -2490,7 +2490,7 @@ int unpoison_memory(unsigned long pfn)
+ goto unlock_mutex;
+ }
+
+- if (!folio_test_hwpoison(folio)) {
++ if (!PageHWPoison(p)) {
+ unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
+ pfn, &unpoison_rs);
+ goto unlock_mutex;
--- /dev/null
+From 6c21e066f9256ea1df6f88768f6ae1080b7cf509 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Fri, 28 Jul 2023 06:13:21 +0200
+Subject: mm/mempolicy: Take VMA lock before replacing policy
+
+From: Jann Horn <jannh@google.com>
+
+commit 6c21e066f9256ea1df6f88768f6ae1080b7cf509 upstream.
+
+mbind() calls down into vma_replace_policy() without taking the per-VMA
+locks, replaces the VMA's vma->vm_policy pointer, and frees the old
+policy. That's bad; a concurrent page fault might still be using the
+old policy (in vma_alloc_folio()), resulting in use-after-free.
+
+Normally this will manifest as a use-after-free read first, but it can
+result in memory corruption, including because vma_alloc_folio() can
+call mpol_cond_put() on the freed policy, which conditionally changes
+the policy's refcount member.
+
+This bug is specific to CONFIG_NUMA, but it does also affect non-NUMA
+systems as long as the kernel was built with CONFIG_NUMA.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
+Cc: stable@kernel.org
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mempolicy.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -384,8 +384,10 @@ void mpol_rebind_mm(struct mm_struct *mm
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_write_lock(mm);
+- for_each_vma(vmi, vma)
++ for_each_vma(vmi, vma) {
++ vma_start_write(vma);
+ mpol_rebind_policy(vma->vm_policy, new);
++ }
+ mmap_write_unlock(mm);
+ }
+
+@@ -765,6 +767,8 @@ static int vma_replace_policy(struct vm_
+ struct mempolicy *old;
+ struct mempolicy *new;
+
++ vma_assert_write_locked(vma);
++
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+@@ -1313,6 +1317,14 @@ static long do_mbind(unsigned long start
+ if (err)
+ goto mpol_out;
+
++ /*
++ * Lock the VMAs before scanning for pages to migrate, to ensure we don't
++ * miss a concurrently inserted page.
++ */
++ vma_iter_init(&vmi, mm, start);
++ for_each_vma_range(vmi, vma, end)
++ vma_start_write(vma);
++
+ ret = queue_pages_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+@@ -1538,6 +1550,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node,
+ break;
+ }
+
++ vma_start_write(vma);
+ new->home_node = home_node;
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ mpol_put(new);
--- /dev/null
+From 8ff2c64c9765446c3cef804fb99da04916603e27 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Sat, 8 Jul 2023 16:16:59 +0200
+Subject: rbd: harden get_lock_owner_info() a bit
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 8ff2c64c9765446c3cef804fb99da04916603e27 upstream.
+
+- we want the exclusive lock type, so test for it directly
+- use sscanf() to actually parse the lock cookie and avoid admitting
+ invalid handles
+- bail if locker has a blank address
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 21 +++++++++++++++------
+ net/ceph/messenger.c | 1 +
+ 2 files changed, 16 insertions(+), 6 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -3862,10 +3862,9 @@ static struct ceph_locker *get_lock_owne
+ u32 num_lockers;
+ u8 lock_type;
+ char *lock_tag;
++ u64 handle;
+ int ret;
+
+- dout("%s rbd_dev %p\n", __func__, rbd_dev);
+-
+ ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, RBD_LOCK_NAME,
+ &lock_type, &lock_tag, &lockers, &num_lockers);
+@@ -3886,18 +3885,28 @@ static struct ceph_locker *get_lock_owne
+ goto err_busy;
+ }
+
+- if (lock_type == CEPH_CLS_LOCK_SHARED) {
+- rbd_warn(rbd_dev, "shared lock type detected");
++ if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) {
++ rbd_warn(rbd_dev, "incompatible lock type detected");
+ goto err_busy;
+ }
+
+ WARN_ON(num_lockers != 1);
+- if (strncmp(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
+- strlen(RBD_LOCK_COOKIE_PREFIX))) {
++ ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu",
++ &handle);
++ if (ret != 1) {
+ rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
+ lockers[0].id.cookie);
+ goto err_busy;
+ }
++ if (ceph_addr_is_blank(&lockers[0].info.addr)) {
++ rbd_warn(rbd_dev, "locker has a blank address");
++ goto err_busy;
++ }
++
++ dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n",
++ __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name),
++ &lockers[0].info.addr.in_addr,
++ le32_to_cpu(lockers[0].info.addr.nonce), handle);
+
+ out:
+ kfree(lock_tag);
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -1123,6 +1123,7 @@ bool ceph_addr_is_blank(const struct cep
+ return true;
+ }
+ }
++EXPORT_SYMBOL(ceph_addr_is_blank);
+
+ int ceph_addr_port(const struct ceph_entity_addr *addr)
+ {
--- /dev/null
+From f38cb9d9c2045dad16eead4a2e1aedfddd94603b Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Fri, 30 Jun 2023 13:52:13 +0200
+Subject: rbd: make get_lock_owner_info() return a single locker or NULL
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit f38cb9d9c2045dad16eead4a2e1aedfddd94603b upstream.
+
+Make the "num_lockers can be only 0 or 1" assumption explicit and
+simplify the API by getting rid of output parameters in preparation
+for calling get_lock_owner_info() twice before blocklisting.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 84 +++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 51 insertions(+), 33 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -3849,10 +3849,17 @@ static void wake_lock_waiters(struct rbd
+ list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
+ }
+
+-static int get_lock_owner_info(struct rbd_device *rbd_dev,
+- struct ceph_locker **lockers, u32 *num_lockers)
++static void free_locker(struct ceph_locker *locker)
++{
++ if (locker)
++ ceph_free_lockers(locker, 1);
++}
++
++static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev)
+ {
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
++ struct ceph_locker *lockers;
++ u32 num_lockers;
+ u8 lock_type;
+ char *lock_tag;
+ int ret;
+@@ -3861,39 +3868,45 @@ static int get_lock_owner_info(struct rb
+
+ ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, RBD_LOCK_NAME,
+- &lock_type, &lock_tag, lockers, num_lockers);
+- if (ret)
+- return ret;
++ &lock_type, &lock_tag, &lockers, &num_lockers);
++ if (ret) {
++ rbd_warn(rbd_dev, "failed to retrieve lockers: %d", ret);
++ return ERR_PTR(ret);
++ }
+
+- if (*num_lockers == 0) {
++ if (num_lockers == 0) {
+ dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
++ lockers = NULL;
+ goto out;
+ }
+
+ if (strcmp(lock_tag, RBD_LOCK_TAG)) {
+ rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
+ lock_tag);
+- ret = -EBUSY;
+- goto out;
++ goto err_busy;
+ }
+
+ if (lock_type == CEPH_CLS_LOCK_SHARED) {
+ rbd_warn(rbd_dev, "shared lock type detected");
+- ret = -EBUSY;
+- goto out;
++ goto err_busy;
+ }
+
+- if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
++ WARN_ON(num_lockers != 1);
++ if (strncmp(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
+ strlen(RBD_LOCK_COOKIE_PREFIX))) {
+ rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
+- (*lockers)[0].id.cookie);
+- ret = -EBUSY;
+- goto out;
++ lockers[0].id.cookie);
++ goto err_busy;
+ }
+
+ out:
+ kfree(lock_tag);
+- return ret;
++ return lockers;
++
++err_busy:
++ kfree(lock_tag);
++ ceph_free_lockers(lockers, num_lockers);
++ return ERR_PTR(-EBUSY);
+ }
+
+ static int find_watcher(struct rbd_device *rbd_dev,
+@@ -3947,51 +3960,56 @@ out:
+ static int rbd_try_lock(struct rbd_device *rbd_dev)
+ {
+ struct ceph_client *client = rbd_dev->rbd_client->client;
+- struct ceph_locker *lockers;
+- u32 num_lockers;
++ struct ceph_locker *locker;
+ int ret;
+
+ for (;;) {
++ locker = NULL;
++
+ ret = rbd_lock(rbd_dev);
+ if (ret != -EBUSY)
+- return ret;
++ goto out;
+
+ /* determine if the current lock holder is still alive */
+- ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
+- if (ret)
+- return ret;
+-
+- if (num_lockers == 0)
++ locker = get_lock_owner_info(rbd_dev);
++ if (IS_ERR(locker)) {
++ ret = PTR_ERR(locker);
++ locker = NULL;
++ goto out;
++ }
++ if (!locker)
+ goto again;
+
+- ret = find_watcher(rbd_dev, lockers);
++ ret = find_watcher(rbd_dev, locker);
+ if (ret)
+ goto out; /* request lock or error */
+
+ rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
+- ENTITY_NAME(lockers[0].id.name));
++ ENTITY_NAME(locker->id.name));
+
+ ret = ceph_monc_blocklist_add(&client->monc,
+- &lockers[0].info.addr);
++ &locker->info.addr);
+ if (ret) {
+- rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
+- ENTITY_NAME(lockers[0].id.name), ret);
++ rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d",
++ ENTITY_NAME(locker->id.name), ret);
+ goto out;
+ }
+
+ ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, RBD_LOCK_NAME,
+- lockers[0].id.cookie,
+- &lockers[0].id.name);
+- if (ret && ret != -ENOENT)
++ locker->id.cookie, &locker->id.name);
++ if (ret && ret != -ENOENT) {
++ rbd_warn(rbd_dev, "failed to break header lock: %d",
++ ret);
+ goto out;
++ }
+
+ again:
+- ceph_free_lockers(lockers, num_lockers);
++ free_locker(locker);
+ }
+
+ out:
+- ceph_free_lockers(lockers, num_lockers);
++ free_locker(locker);
+ return ret;
+ }
+
--- /dev/null
+From 588159009d5b7a09c3e5904cffddbe4a4e170301 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Sat, 22 Jul 2023 20:28:08 +0200
+Subject: rbd: retrieve and check lock owner twice before blocklisting
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 588159009d5b7a09c3e5904cffddbe4a4e170301 upstream.
+
+An attempt to acquire exclusive lock can race with the current lock
+owner closing the image:
+
+1. lock is held by client123, rbd_lock() returns -EBUSY
+2. get_lock_owner_info() returns client123 instance details
+3. client123 closes the image, lock is released
+4. find_watcher() returns 0 as there is no matching watcher anymore
+5. client123 instance gets erroneously blocklisted
+
+Particularly impacted is mirror snapshot scheduler in snapshot-based
+mirroring since it happens to open and close images a lot (images are
+opened only for as long as it takes to take the next mirror snapshot,
+the same client instance is used for all images).
+
+To reduce the potential for erroneous blocklisting, retrieve the lock
+owner again after find_watcher() returns 0. If it's still there, make
+sure it matches the previously detected lock owner.
+
+Cc: stable@vger.kernel.org # f38cb9d9c204: rbd: make get_lock_owner_info() return a single locker or NULL
+Cc: stable@vger.kernel.org # 8ff2c64c9765: rbd: harden get_lock_owner_info() a bit
+Cc: stable@vger.kernel.org
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 25 +++++++++++++++++++++++--
+ 1 file changed, 23 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -3849,6 +3849,15 @@ static void wake_lock_waiters(struct rbd
+ list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
+ }
+
++static bool locker_equal(const struct ceph_locker *lhs,
++ const struct ceph_locker *rhs)
++{
++ return lhs->id.name.type == rhs->id.name.type &&
++ lhs->id.name.num == rhs->id.name.num &&
++ !strcmp(lhs->id.cookie, rhs->id.cookie) &&
++ ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr);
++}
++
+ static void free_locker(struct ceph_locker *locker)
+ {
+ if (locker)
+@@ -3969,11 +3978,11 @@ out:
+ static int rbd_try_lock(struct rbd_device *rbd_dev)
+ {
+ struct ceph_client *client = rbd_dev->rbd_client->client;
+- struct ceph_locker *locker;
++ struct ceph_locker *locker, *refreshed_locker;
+ int ret;
+
+ for (;;) {
+- locker = NULL;
++ locker = refreshed_locker = NULL;
+
+ ret = rbd_lock(rbd_dev);
+ if (ret != -EBUSY)
+@@ -3993,6 +4002,16 @@ static int rbd_try_lock(struct rbd_devic
+ if (ret)
+ goto out; /* request lock or error */
+
++ refreshed_locker = get_lock_owner_info(rbd_dev);
++ if (IS_ERR(refreshed_locker)) {
++ ret = PTR_ERR(refreshed_locker);
++ refreshed_locker = NULL;
++ goto out;
++ }
++ if (!refreshed_locker ||
++ !locker_equal(locker, refreshed_locker))
++ goto again;
++
+ rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
+ ENTITY_NAME(locker->id.name));
+
+@@ -4014,10 +4033,12 @@ static int rbd_try_lock(struct rbd_devic
+ }
+
+ again:
++ free_locker(refreshed_locker);
+ free_locker(locker);
+ }
+
+ out:
++ free_locker(refreshed_locker);
+ free_locker(locker);
+ return ret;
+ }
arm64-sme-set-new-vector-length-before-reallocating.patch
pm-sleep-wakeirq-fix-wake-irq-arming.patch
thermal-of-fix-double-free-on-unregistration.patch
+ceph-never-send-metrics-if-disable_send_metrics-is-set.patch
+drm-i915-dpt-use-shmem-for-dpt-objects.patch
+dm-cache-policy-smq-ensure-io-doesn-t-prevent-cleaner-policy-progress.patch
+rbd-make-get_lock_owner_info-return-a-single-locker-or-null.patch
+rbd-harden-get_lock_owner_info-a-bit.patch
+rbd-retrieve-and-check-lock-owner-twice-before-blocklisting.patch
+mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch
+mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch
+mm-memory-failure-fix-hardware-poison-check-in-unpoison_memory.patch
+mm-mempolicy-take-vma-lock-before-replacing-policy.patch