6.6-stable patches
Author:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
AuthorDate: Mon, 27 Oct 2025 08:04:27 +0000 (09:04 +0100)
Commit:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CommitDate: Mon, 27 Oct 2025 08:04:27 +0000 (09:04 +0100)
added patches:
devcoredump-fix-circular-locking-dependency-with-devcd-mutex.patch
fs-notify-call-exportfs_encode_fid-with-s_umount.patch
fuse-allocate-ff-release_args-only-if-release-is-needed.patch
fuse-fix-livelock-in-synchronous-file-put-from-fuseblk-workers.patch
s390-cio-update-purge-function-to-unregister-the-unused-subchannels.patch
x86-resctrl-fix-miscount-of-bandwidth-event-when-reactivating-previously-unavailable-rmid.patch
xfs-always-warn-about-deprecated-mount-options.patch

queue-6.6/devcoredump-fix-circular-locking-dependency-with-devcd-mutex.patch [new file with mode: 0644]
queue-6.6/fs-notify-call-exportfs_encode_fid-with-s_umount.patch [new file with mode: 0644]
queue-6.6/fuse-allocate-ff-release_args-only-if-release-is-needed.patch [new file with mode: 0644]
queue-6.6/fuse-fix-livelock-in-synchronous-file-put-from-fuseblk-workers.patch [new file with mode: 0644]
queue-6.6/s390-cio-update-purge-function-to-unregister-the-unused-subchannels.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/x86-resctrl-fix-miscount-of-bandwidth-event-when-reactivating-previously-unavailable-rmid.patch [new file with mode: 0644]
queue-6.6/xfs-always-warn-about-deprecated-mount-options.patch [new file with mode: 0644]

diff --git a/queue-6.6/devcoredump-fix-circular-locking-dependency-with-devcd-mutex.patch b/queue-6.6/devcoredump-fix-circular-locking-dependency-with-devcd-mutex.patch
new file mode 100644 (file)
index 0000000..070e8f2
--- /dev/null
@@ -0,0 +1,389 @@
+From sashal@kernel.org Mon Oct 27 00:49:57 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Oct 2025 19:49:50 -0400
+Subject: devcoredump: Fix circular locking dependency with devcd->mutex.
+To: stable@vger.kernel.org
+Cc: Maarten Lankhorst <dev@lankhorst.se>, Mukesh Ojha <quic_mojha@quicinc.com>, Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Johannes Berg <johannes@sipsolutions.net>, "Rafael J. Wysocki" <rafael@kernel.org>, Danilo Krummrich <dakr@kernel.org>, linux-kernel@vger.kernel.org, Matthew Brost <matthew.brost@intel.com>, Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251026234950.288779-1-sashal@kernel.org>
+
+From: Maarten Lankhorst <dev@lankhorst.se>
+
+[ Upstream commit a91c8096590bd7801a26454789f2992094fe36da ]
+
+The original code causes a circular locking dependency found by lockdep.
+
+======================================================
+WARNING: possible circular locking dependency detected
+6.16.0-rc6-lgci-xe-xe-pw-151626v3+ #1 Tainted: G S   U
+------------------------------------------------------
+xe_fault_inject/5091 is trying to acquire lock:
+ffff888156815688 ((work_completion)(&(&devcd->del_wk)->work)){+.+.}-{0:0}, at: __flush_work+0x25d/0x660
+
+but task is already holding lock:
+
+ffff888156815620 (&devcd->mutex){+.+.}-{3:3}, at: dev_coredump_put+0x3f/0xa0
+which lock already depends on the new lock.
+the existing dependency chain (in reverse order) is:
+-> #2 (&devcd->mutex){+.+.}-{3:3}:
+       mutex_lock_nested+0x4e/0xc0
+       devcd_data_write+0x27/0x90
+       sysfs_kf_bin_write+0x80/0xf0
+       kernfs_fop_write_iter+0x169/0x220
+       vfs_write+0x293/0x560
+       ksys_write+0x72/0xf0
+       __x64_sys_write+0x19/0x30
+       x64_sys_call+0x2bf/0x2660
+       do_syscall_64+0x93/0xb60
+       entry_SYSCALL_64_after_hwframe+0x76/0x7e
+-> #1 (kn->active#236){++++}-{0:0}:
+       kernfs_drain+0x1e2/0x200
+       __kernfs_remove+0xae/0x400
+       kernfs_remove_by_name_ns+0x5d/0xc0
+       remove_files+0x54/0x70
+       sysfs_remove_group+0x3d/0xa0
+       sysfs_remove_groups+0x2e/0x60
+       device_remove_attrs+0xc7/0x100
+       device_del+0x15d/0x3b0
+       devcd_del+0x19/0x30
+       process_one_work+0x22b/0x6f0
+       worker_thread+0x1e8/0x3d0
+       kthread+0x11c/0x250
+       ret_from_fork+0x26c/0x2e0
+       ret_from_fork_asm+0x1a/0x30
+-> #0 ((work_completion)(&(&devcd->del_wk)->work)){+.+.}-{0:0}:
+       __lock_acquire+0x1661/0x2860
+       lock_acquire+0xc4/0x2f0
+       __flush_work+0x27a/0x660
+       flush_delayed_work+0x5d/0xa0
+       dev_coredump_put+0x63/0xa0
+       xe_driver_devcoredump_fini+0x12/0x20 [xe]
+       devm_action_release+0x12/0x30
+       release_nodes+0x3a/0x120
+       devres_release_all+0x8a/0xd0
+       device_unbind_cleanup+0x12/0x80
+       device_release_driver_internal+0x23a/0x280
+       device_driver_detach+0x14/0x20
+       unbind_store+0xaf/0xc0
+       drv_attr_store+0x21/0x50
+       sysfs_kf_write+0x4a/0x80
+       kernfs_fop_write_iter+0x169/0x220
+       vfs_write+0x293/0x560
+       ksys_write+0x72/0xf0
+       __x64_sys_write+0x19/0x30
+       x64_sys_call+0x2bf/0x2660
+       do_syscall_64+0x93/0xb60
+       entry_SYSCALL_64_after_hwframe+0x76/0x7e
+other info that might help us debug this:
+Chain exists of: (work_completion)(&(&devcd->del_wk)->work) --> kn->active#236 --> &devcd->mutex
+ Possible unsafe locking scenario:
+       CPU0                    CPU1
+       ----                    ----
+  lock(&devcd->mutex);
+                               lock(kn->active#236);
+                               lock(&devcd->mutex);
+  lock((work_completion)(&(&devcd->del_wk)->work));
+ *** DEADLOCK ***
+5 locks held by xe_fault_inject/5091:
+ #0: ffff8881129f9488 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x72/0xf0
+ #1: ffff88810c755078 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x123/0x220
+ #2: ffff8881054811a0 (&dev->mutex){....}-{3:3}, at: device_release_driver_internal+0x55/0x280
+ #3: ffff888156815620 (&devcd->mutex){+.+.}-{3:3}, at: dev_coredump_put+0x3f/0xa0
+ #4: ffffffff8359e020 (rcu_read_lock){....}-{1:2}, at: __flush_work+0x72/0x660
+stack backtrace:
+CPU: 14 UID: 0 PID: 5091 Comm: xe_fault_inject Tainted: G S   U              6.16.0-rc6-lgci-xe-xe-pw-151626v3+ #1 PREEMPT_{RT,(lazy)}
+Tainted: [S]=CPU_OUT_OF_SPEC, [U]=USER
+Hardware name: Micro-Star International Co., Ltd. MS-7D25/PRO Z690-A DDR4(MS-7D25), BIOS 1.10 12/13/2021
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x91/0xf0
+ dump_stack+0x10/0x20
+ print_circular_bug+0x285/0x360
+ check_noncircular+0x135/0x150
+ ? register_lock_class+0x48/0x4a0
+ __lock_acquire+0x1661/0x2860
+ lock_acquire+0xc4/0x2f0
+ ? __flush_work+0x25d/0x660
+ ? mark_held_locks+0x46/0x90
+ ? __flush_work+0x25d/0x660
+ __flush_work+0x27a/0x660
+ ? __flush_work+0x25d/0x660
+ ? trace_hardirqs_on+0x1e/0xd0
+ ? __pfx_wq_barrier_func+0x10/0x10
+ flush_delayed_work+0x5d/0xa0
+ dev_coredump_put+0x63/0xa0
+ xe_driver_devcoredump_fini+0x12/0x20 [xe]
+ devm_action_release+0x12/0x30
+ release_nodes+0x3a/0x120
+ devres_release_all+0x8a/0xd0
+ device_unbind_cleanup+0x12/0x80
+ device_release_driver_internal+0x23a/0x280
+ ? bus_find_device+0xa8/0xe0
+ device_driver_detach+0x14/0x20
+ unbind_store+0xaf/0xc0
+ drv_attr_store+0x21/0x50
+ sysfs_kf_write+0x4a/0x80
+ kernfs_fop_write_iter+0x169/0x220
+ vfs_write+0x293/0x560
+ ksys_write+0x72/0xf0
+ __x64_sys_write+0x19/0x30
+ x64_sys_call+0x2bf/0x2660
+ do_syscall_64+0x93/0xb60
+ ? __f_unlock_pos+0x15/0x20
+ ? __x64_sys_getdents64+0x9b/0x130
+ ? __pfx_filldir64+0x10/0x10
+ ? do_syscall_64+0x1a2/0xb60
+ ? clear_bhb_loop+0x30/0x80
+ ? clear_bhb_loop+0x30/0x80
+ entry_SYSCALL_64_after_hwframe+0x76/0x7e
+RIP: 0033:0x76e292edd574
+Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d d5 ea 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89
+RSP: 002b:00007fffe247a828 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000076e292edd574
+RDX: 000000000000000c RSI: 00006267f6306063 RDI: 000000000000000b
+RBP: 000000000000000c R08: 000076e292fc4b20 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000202 R12: 00006267f6306063
+R13: 000000000000000b R14: 00006267e6859c00 R15: 000076e29322a000
+ </TASK>
+xe 0000:03:00.0: [drm] Xe device coredump has been deleted.
+
+Fixes: 01daccf74832 ("devcoredump : Serialize devcd_del work")
+Cc: Mukesh Ojha <quic_mojha@quicinc.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Johannes Berg <johannes@sipsolutions.net>
+Cc: Rafael J. Wysocki <rafael@kernel.org>
+Cc: Danilo Krummrich <dakr@kernel.org>
+Cc: linux-kernel@vger.kernel.org
+Cc: stable@vger.kernel.org # v6.1+
+Signed-off-by: Maarten Lankhorst <dev@lankhorst.se>
+Cc: Matthew Brost <matthew.brost@intel.com>
+Acked-by: Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>
+Link: https://lore.kernel.org/r/20250723142416.1020423-1-dev@lankhorst.se
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[ replaced disable_delayed_work_sync() with cancel_delayed_work_sync() ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/devcoredump.c |  138 +++++++++++++++++++++++++++------------------
+ 1 file changed, 84 insertions(+), 54 deletions(-)
+
+--- a/drivers/base/devcoredump.c
++++ b/drivers/base/devcoredump.c
+@@ -26,50 +26,46 @@ struct devcd_entry {
+       void *data;
+       size_t datalen;
+       /*
+-       * Here, mutex is required to serialize the calls to del_wk work between
+-       * user/kernel space which happens when devcd is added with device_add()
+-       * and that sends uevent to user space. User space reads the uevents,
+-       * and calls to devcd_data_write() which try to modify the work which is
+-       * not even initialized/queued from devcoredump.
++       * There are 2 races for which mutex is required.
+        *
++       * The first race is between device creation and userspace writing to
++       * schedule immediately destruction.
+        *
++       * This race is handled by arming the timer before device creation, but
++       * when device creation fails the timer still exists.
+        *
+-       *        cpu0(X)                                 cpu1(Y)
++       * To solve this, hold the mutex during device_add(), and set
++       * init_completed on success before releasing the mutex.
+        *
+-       *        dev_coredump() uevent sent to user space
+-       *        device_add()  ======================> user space process Y reads the
+-       *                                              uevents writes to devcd fd
+-       *                                              which results into writes to
++       * That way the timer will never fire until device_add() is called,
++       * it will do nothing if init_completed is not set. The timer is also
++       * cancelled in that case.
+        *
+-       *                                             devcd_data_write()
+-       *                                               mod_delayed_work()
+-       *                                                 try_to_grab_pending()
+-       *                                                   del_timer()
+-       *                                                     debug_assert_init()
+-       *       INIT_DELAYED_WORK()
+-       *       schedule_delayed_work()
+-       *
+-       *
+-       * Also, mutex alone would not be enough to avoid scheduling of
+-       * del_wk work after it get flush from a call to devcd_free()
+-       * mentioned as below.
+-       *
+-       *      disabled_store()
+-       *        devcd_free()
+-       *          mutex_lock()             devcd_data_write()
+-       *          flush_delayed_work()
+-       *          mutex_unlock()
+-       *                                   mutex_lock()
+-       *                                   mod_delayed_work()
+-       *                                   mutex_unlock()
+-       * So, delete_work flag is required.
++       * The second race involves multiple parallel invocations of devcd_free(),
++       * add a deleted flag so only 1 can call the destructor.
+        */
+       struct mutex mutex;
+-      bool delete_work;
++      bool init_completed, deleted;
+       struct module *owner;
+       ssize_t (*read)(char *buffer, loff_t offset, size_t count,
+                       void *data, size_t datalen);
+       void (*free)(void *data);
++      /*
++       * If nothing interferes and device_add() returns success,
++       * del_wk will destroy the device after the timer fires.
++       *
++       * Multiple userspace processes can interfere in the working of the timer:
++       * - Writing to the coredump will reschedule the timer to run immediately,
++       *   if still armed.
++       *
++       *   This is handled by using "if (cancel_delayed_work()) {
++       *   schedule_delayed_work() }", to prevent re-arming after having
++       *   been previously fired.
++       * - Writing to /sys/class/devcoredump/disabled will destroy the
++       *   coredump synchronously.
++       *   This is handled by using disable_delayed_work_sync(), and then
++       *   checking if deleted flag is set with &devcd->mutex held.
++       */
+       struct delayed_work del_wk;
+       struct device *failing_dev;
+ };
+@@ -98,14 +94,27 @@ static void devcd_dev_release(struct dev
+       kfree(devcd);
+ }
++static void __devcd_del(struct devcd_entry *devcd)
++{
++      devcd->deleted = true;
++      device_del(&devcd->devcd_dev);
++      put_device(&devcd->devcd_dev);
++}
++
+ static void devcd_del(struct work_struct *wk)
+ {
+       struct devcd_entry *devcd;
++      bool init_completed;
+       devcd = container_of(wk, struct devcd_entry, del_wk.work);
+-      device_del(&devcd->devcd_dev);
+-      put_device(&devcd->devcd_dev);
++      /* devcd->mutex serializes against dev_coredumpm_timeout */
++      mutex_lock(&devcd->mutex);
++      init_completed = devcd->init_completed;
++      mutex_unlock(&devcd->mutex);
++
++      if (init_completed)
++              __devcd_del(devcd);
+ }
+ static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
+@@ -125,12 +134,12 @@ static ssize_t devcd_data_write(struct f
+       struct device *dev = kobj_to_dev(kobj);
+       struct devcd_entry *devcd = dev_to_devcd(dev);
+-      mutex_lock(&devcd->mutex);
+-      if (!devcd->delete_work) {
+-              devcd->delete_work = true;
+-              mod_delayed_work(system_wq, &devcd->del_wk, 0);
+-      }
+-      mutex_unlock(&devcd->mutex);
++      /*
++       * Although it's tempting to use mod_delayed_work() here,
++       * that will cause a reschedule if the timer already fired.
++       */
++      if (cancel_delayed_work(&devcd->del_wk))
++              schedule_delayed_work(&devcd->del_wk, 0);
+       return count;
+ }
+@@ -158,11 +167,21 @@ static int devcd_free(struct device *dev
+ {
+       struct devcd_entry *devcd = dev_to_devcd(dev);
++      /*
++       * To prevent a race with devcd_data_write(), cancel work and
++       * complete manually instead.
++       *
++       * We cannot rely on the return value of
++       * cancel_delayed_work_sync() here, because it might be in the
++       * middle of a cancel_delayed_work + schedule_delayed_work pair.
++       *
++       * devcd->mutex here guards against multiple parallel invocations
++       * of devcd_free().
++       */
++      cancel_delayed_work_sync(&devcd->del_wk);
+       mutex_lock(&devcd->mutex);
+-      if (!devcd->delete_work)
+-              devcd->delete_work = true;
+-
+-      flush_delayed_work(&devcd->del_wk);
++      if (!devcd->deleted)
++              __devcd_del(devcd);
+       mutex_unlock(&devcd->mutex);
+       return 0;
+ }
+@@ -186,12 +205,10 @@ static ssize_t disabled_show(const struc
+  *                                                                 put_device() <- last reference
+  *             error = fn(dev, data)                           devcd_dev_release()
+  *             devcd_free(dev, data)                           kfree(devcd)
+- *             mutex_lock(&devcd->mutex);
+  *
+  *
+- * In the above diagram, It looks like disabled_store() would be racing with parallely
+- * running devcd_del() and result in memory abort while acquiring devcd->mutex which
+- * is called after kfree of devcd memory  after dropping its last reference with
++ * In the above diagram, it looks like disabled_store() would be racing with parallelly
++ * running devcd_del() and result in memory abort after dropping its last reference with
+  * put_device(). However, this will not happens as fn(dev, data) runs
+  * with its own reference to device via klist_node so it is not its last reference.
+  * so, above situation would not occur.
+@@ -352,7 +369,7 @@ void dev_coredumpm(struct device *dev, s
+       devcd->read = read;
+       devcd->free = free;
+       devcd->failing_dev = get_device(dev);
+-      devcd->delete_work = false;
++      devcd->deleted = false;
+       mutex_init(&devcd->mutex);
+       device_initialize(&devcd->devcd_dev);
+@@ -361,8 +378,14 @@ void dev_coredumpm(struct device *dev, s
+                    atomic_inc_return(&devcd_count));
+       devcd->devcd_dev.class = &devcd_class;
+-      mutex_lock(&devcd->mutex);
+       dev_set_uevent_suppress(&devcd->devcd_dev, true);
++
++      /* devcd->mutex prevents devcd_del() completing until init finishes */
++      mutex_lock(&devcd->mutex);
++      devcd->init_completed = false;
++      INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
++      schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
++
+       if (device_add(&devcd->devcd_dev))
+               goto put_device;
+@@ -379,13 +402,20 @@ void dev_coredumpm(struct device *dev, s
+       dev_set_uevent_suppress(&devcd->devcd_dev, false);
+       kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
+-      INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
+-      schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
++
++      /*
++       * Safe to run devcd_del() now that we are done with devcd_dev.
++       * Alternatively we could have taken a ref on devcd_dev before
++       * dropping the lock.
++       */
++      devcd->init_completed = true;
+       mutex_unlock(&devcd->mutex);
+       return;
+  put_device:
+-      put_device(&devcd->devcd_dev);
+       mutex_unlock(&devcd->mutex);
++      cancel_delayed_work_sync(&devcd->del_wk);
++      put_device(&devcd->devcd_dev);
++
+  put_module:
+       module_put(owner);
+  free:
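
A user-space sketch may make the new ordering easier to follow: the delayed work is armed before device creation, and a mutex-protected init_completed flag keeps it from destroying anything until device_add() has succeeded. This is a minimal analogue assuming only POSIX threads; devcd_like, del_worker and the messages are hypothetical stand-ins, not the devcoredump API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct devcd_like {
        pthread_mutex_t mutex;
        bool init_completed;    /* set only after device_add() succeeds */
};

/* analogue of devcd_del(): the delayed work may fire at any time,
 * but it destroys nothing until initialization has completed */
static void *del_worker(void *arg)
{
        struct devcd_like *devcd = arg;
        bool init_completed;

        sleep(1);               /* stand-in for the DEVCD_TIMEOUT delay */
        pthread_mutex_lock(&devcd->mutex);
        init_completed = devcd->init_completed;
        pthread_mutex_unlock(&devcd->mutex);

        if (init_completed)
                printf("device destroyed (__devcd_del analogue)\n");
        else
                printf("init never completed, worker does nothing\n");
        return NULL;
}

int main(void)
{
        struct devcd_like devcd = { PTHREAD_MUTEX_INITIALIZER, false };
        pthread_t timer;

        /* arm the "timer" before creating the device, as the patch does */
        pthread_create(&timer, NULL, del_worker, &devcd);

        pthread_mutex_lock(&devcd.mutex);
        /* device_add() stand-in; mark init complete only on success */
        devcd.init_completed = true;
        pthread_mutex_unlock(&devcd.mutex);

        pthread_join(timer, NULL);
        return 0;
}

Because the worker takes the same mutex before reading the flag, it can never observe a half-initialized entry, which is what lets the fix drop the flush_delayed_work() call that created the lockdep cycle.
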
diff --git a/queue-6.6/fs-notify-call-exportfs_encode_fid-with-s_umount.patch b/queue-6.6/fs-notify-call-exportfs_encode_fid-with-s_umount.patch
new file mode 100644 (file)
index 0000000..cbfc893
--- /dev/null
@@ -0,0 +1,111 @@
+From stable+bounces-189868-greg=kroah.com@vger.kernel.org Sun Oct 26 17:05:06 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Oct 2025 12:04:56 -0400
+Subject: fs/notify: call exportfs_encode_fid with s_umount
+To: stable@vger.kernel.org
+Cc: Jakub Acs <acsjakub@amazon.de>, Jan Kara <jack@suse.cz>, Amir Goldstein <amir73il@gmail.com>, Miklos Szeredi <miklos@szeredi.hu>, Christian Brauner <brauner@kernel.org>, linux-unionfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251026160456.99836-1-sashal@kernel.org>
+
+From: Jakub Acs <acsjakub@amazon.de>
+
+[ Upstream commit a7c4bb43bfdc2b9f06ee9d036028ed13a83df42a ]
+
+Calling inotify_show_fdinfo() on an fd watching an overlayfs inode while
+the overlayfs is being unmounted can lead to a NULL pointer dereference.
+
+This issue was found by syzkaller.
+
+Race Condition Diagram:
+
+Thread 1                           Thread 2
+--------                           --------
+
+generic_shutdown_super()
+ shrink_dcache_for_umount
+  sb->s_root = NULL
+
+                    |
+                    |             vfs_read()
+                    |              inotify_fdinfo()
+                    |               * inode get from mark *
+                    |               show_mark_fhandle(m, inode)
+                    |                exportfs_encode_fid(inode, ..)
+                    |                 ovl_encode_fh(inode, ..)
+                    |                  ovl_check_encode_origin(inode)
+                    |                   * deref i_sb->s_root *
+                    |
+                    |
+                    v
+ fsnotify_sb_delete(sb)
+
+Which then leads to:
+
+[   32.133461] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI
+[   32.134438] KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037]
+[   32.135032] CPU: 1 UID: 0 PID: 4468 Comm: systemd-coredum Not tainted 6.17.0-rc6 #22 PREEMPT(none)
+
+<snip registers, unreliable trace>
+
+[   32.143353] Call Trace:
+[   32.143732]  ovl_encode_fh+0xd5/0x170
+[   32.144031]  exportfs_encode_inode_fh+0x12f/0x300
+[   32.144425]  show_mark_fhandle+0xbe/0x1f0
+[   32.145805]  inotify_fdinfo+0x226/0x2d0
+[   32.146442]  inotify_show_fdinfo+0x1c5/0x350
+[   32.147168]  seq_show+0x530/0x6f0
+[   32.147449]  seq_read_iter+0x503/0x12a0
+[   32.148419]  seq_read+0x31f/0x410
+[   32.150714]  vfs_read+0x1f0/0x9e0
+[   32.152297]  ksys_read+0x125/0x240
+
+IOW ovl_check_encode_origin derefs inode->i_sb->s_root, after it was set
+to NULL in the unmount path.
+
+Fix it by protecting the call to exportfs_encode_fid() from
+show_mark_fhandle() with the s_umount lock.
+
+This form of fix was suggested by Amir in [1].
+
+[1]: https://lore.kernel.org/all/CAOQ4uxhbDwhb+2Brs1UdkoF0a3NSdBAOQPNfEHjahrgoKJpLEw@mail.gmail.com/
+
+Fixes: c45beebfde34 ("ovl: support encoding fid from inode with no alias")
+Signed-off-by: Jakub Acs <acsjakub@amazon.de>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Amir Goldstein <amir73il@gmail.com>
+Cc: Miklos Szeredi <miklos@szeredi.hu>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: linux-unionfs@vger.kernel.org
+Cc: linux-fsdevel@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Cc: stable@vger.kernel.org
+Signed-off-by: Jan Kara <jack@suse.cz>
+[ Adjust context ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/notify/fdinfo.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/notify/fdinfo.c
++++ b/fs/notify/fdinfo.c
+@@ -17,6 +17,7 @@
+ #include "fanotify/fanotify.h"
+ #include "fdinfo.h"
+ #include "fsnotify.h"
++#include "../internal.h"
+ #if defined(CONFIG_PROC_FS)
+@@ -50,7 +51,12 @@ static void show_mark_fhandle(struct seq
+       f.handle.handle_bytes = sizeof(f.pad);
+       size = f.handle.handle_bytes >> 2;
++      if (!super_trylock_shared(inode->i_sb))
++              return;
++
+       ret = exportfs_encode_fid(inode, (struct fid *)f.handle.f_handle, &size);
++      up_read(&inode->i_sb->s_umount);
++
+       if ((ret == FILEID_INVALID) || (ret < 0))
+               return;
diff --git a/queue-6.6/fuse-allocate-ff-release_args-only-if-release-is-needed.patch b/queue-6.6/fuse-allocate-ff-release_args-only-if-release-is-needed.patch
new file mode 100644 (file)
index 0000000..adfa40d
--- /dev/null
@@ -0,0 +1,241 @@
+From stable+bounces-188825-greg=kroah.com@vger.kernel.org Tue Oct 21 22:16:29 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 21 Oct 2025 16:16:18 -0400
+Subject: fuse: allocate ff->release_args only if release is needed
+To: stable@vger.kernel.org
+Cc: Amir Goldstein <amir73il@gmail.com>, Miklos Szeredi <mszeredi@redhat.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251021201619.2922630-1-sashal@kernel.org>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit e26ee4efbc79610b20e7abe9d96c87f33dacc1ff ]
+
+This removes the need to pass the isdir argument to fuse_file_put().
+
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Stable-dep-of: 26e5c67deb2e ("fuse: fix livelock in synchronous file put from fuseblk workers")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fuse/dir.c    |    2 -
+ fs/fuse/file.c   |   69 +++++++++++++++++++++++++++++++------------------------
+ fs/fuse/fuse_i.h |    2 -
+ 3 files changed, 41 insertions(+), 32 deletions(-)
+
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -634,7 +634,7 @@ static int fuse_create_open(struct inode
+               goto out_err;
+       err = -ENOMEM;
+-      ff = fuse_file_alloc(fm);
++      ff = fuse_file_alloc(fm, true);
+       if (!ff)
+               goto out_put_forget_req;
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -55,7 +55,7 @@ struct fuse_release_args {
+       struct inode *inode;
+ };
+-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
++struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
+ {
+       struct fuse_file *ff;
+@@ -64,11 +64,13 @@ struct fuse_file *fuse_file_alloc(struct
+               return NULL;
+       ff->fm = fm;
+-      ff->release_args = kzalloc(sizeof(*ff->release_args),
+-                                 GFP_KERNEL_ACCOUNT);
+-      if (!ff->release_args) {
+-              kfree(ff);
+-              return NULL;
++      if (release) {
++              ff->release_args = kzalloc(sizeof(*ff->release_args),
++                                         GFP_KERNEL_ACCOUNT);
++              if (!ff->release_args) {
++                      kfree(ff);
++                      return NULL;
++              }
+       }
+       INIT_LIST_HEAD(&ff->write_entry);
+@@ -104,14 +106,14 @@ static void fuse_release_end(struct fuse
+       kfree(ra);
+ }
+-static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
++static void fuse_file_put(struct fuse_file *ff, bool sync)
+ {
+       if (refcount_dec_and_test(&ff->count)) {
+-              struct fuse_args *args = &ff->release_args->args;
++              struct fuse_release_args *ra = ff->release_args;
++              struct fuse_args *args = (ra ? &ra->args : NULL);
+-              if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
+-                      /* Do nothing when client does not implement 'open' */
+-                      fuse_release_end(ff->fm, args, 0);
++              if (!args) {
++                      /* Do nothing when server does not implement 'open' */
+               } else if (sync) {
+                       fuse_simple_request(ff->fm, args);
+                       fuse_release_end(ff->fm, args, 0);
+@@ -131,15 +133,16 @@ struct fuse_file *fuse_file_open(struct
+       struct fuse_conn *fc = fm->fc;
+       struct fuse_file *ff;
+       int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
++      bool open = isdir ? !fc->no_opendir : !fc->no_open;
+-      ff = fuse_file_alloc(fm);
++      ff = fuse_file_alloc(fm, open);
+       if (!ff)
+               return ERR_PTR(-ENOMEM);
+       ff->fh = 0;
+       /* Default for no-open */
+       ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+-      if (isdir ? !fc->no_opendir : !fc->no_open) {
++      if (open) {
+               struct fuse_open_out outarg;
+               int err;
+@@ -147,11 +150,13 @@ struct fuse_file *fuse_file_open(struct
+               if (!err) {
+                       ff->fh = outarg.fh;
+                       ff->open_flags = outarg.open_flags;
+-
+               } else if (err != -ENOSYS) {
+                       fuse_file_free(ff);
+                       return ERR_PTR(err);
+               } else {
++                      /* No release needed */
++                      kfree(ff->release_args);
++                      ff->release_args = NULL;
+                       if (isdir)
+                               fc->no_opendir = 1;
+                       else
+@@ -273,7 +278,7 @@ out_inode_unlock:
+ }
+ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+-                               unsigned int flags, int opcode)
++                               unsigned int flags, int opcode, bool sync)
+ {
+       struct fuse_conn *fc = ff->fm->fc;
+       struct fuse_release_args *ra = ff->release_args;
+@@ -291,6 +296,9 @@ static void fuse_prepare_release(struct
+       wake_up_interruptible_all(&ff->poll_wait);
++      if (!ra)
++              return;
++
+       ra->inarg.fh = ff->fh;
+       ra->inarg.flags = flags;
+       ra->args.in_numargs = 1;
+@@ -300,6 +308,13 @@ static void fuse_prepare_release(struct
+       ra->args.nodeid = ff->nodeid;
+       ra->args.force = true;
+       ra->args.nocreds = true;
++
++      /*
++       * Hold inode until release is finished.
++       * From fuse_sync_release() the refcount is 1 and everything's
++       * synchronous, so we are fine with not doing igrab() here.
++       */
++      ra->inode = sync ? NULL : igrab(&fi->inode);
+ }
+ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+@@ -309,14 +324,12 @@ void fuse_file_release(struct inode *ino
+       struct fuse_release_args *ra = ff->release_args;
+       int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+-      fuse_prepare_release(fi, ff, open_flags, opcode);
++      fuse_prepare_release(fi, ff, open_flags, opcode, false);
+-      if (ff->flock) {
++      if (ra && ff->flock) {
+               ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+               ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
+       }
+-      /* Hold inode until release is finished */
+-      ra->inode = igrab(inode);
+       /*
+        * Normally this will send the RELEASE request, however if
+@@ -327,7 +340,7 @@ void fuse_file_release(struct inode *ino
+        * synchronous RELEASE is allowed (and desirable) in this case
+        * because the server can be trusted not to screw up.
+        */
+-      fuse_file_put(ff, ff->fm->fc->destroy, isdir);
++      fuse_file_put(ff, ff->fm->fc->destroy);
+ }
+ void fuse_release_common(struct file *file, bool isdir)
+@@ -362,12 +375,8 @@ void fuse_sync_release(struct fuse_inode
+                      unsigned int flags)
+ {
+       WARN_ON(refcount_read(&ff->count) > 1);
+-      fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
+-      /*
+-       * iput(NULL) is a no-op and since the refcount is 1 and everything's
+-       * synchronous, we are fine with not doing igrab() here"
+-       */
+-      fuse_file_put(ff, true, false);
++      fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
++      fuse_file_put(ff, true);
+ }
+ EXPORT_SYMBOL_GPL(fuse_sync_release);
+@@ -924,7 +933,7 @@ static void fuse_readpages_end(struct fu
+               put_page(page);
+       }
+       if (ia->ff)
+-              fuse_file_put(ia->ff, false, false);
++              fuse_file_put(ia->ff, false);
+       fuse_io_free(ia);
+ }
+@@ -1666,7 +1675,7 @@ static void fuse_writepage_free(struct f
+               __free_page(ap->pages[i]);
+       if (wpa->ia.ff)
+-              fuse_file_put(wpa->ia.ff, false, false);
++              fuse_file_put(wpa->ia.ff, false);
+       kfree(ap->pages);
+       kfree(wpa);
+@@ -1914,7 +1923,7 @@ int fuse_write_inode(struct inode *inode
+       ff = __fuse_write_file_get(fi);
+       err = fuse_flush_times(inode, ff);
+       if (ff)
+-              fuse_file_put(ff, false, false);
++              fuse_file_put(ff, false);
+       return err;
+ }
+@@ -2312,7 +2321,7 @@ static int fuse_writepages(struct addres
+               fuse_writepages_send(&data);
+       }
+       if (data.ff)
+-              fuse_file_put(data.ff, false, false);
++              fuse_file_put(data.ff, false);
+       kfree(data.orig_pages);
+ out:
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -1036,7 +1036,7 @@ void fuse_read_args_fill(struct fuse_io_
+  */
+ int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
+-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
++struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release);
+ void fuse_file_free(struct fuse_file *ff);
+ void fuse_finish_open(struct inode *inode, struct file *file);
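
The shape of the change, stripped of fuse specifics, is conditional allocation plus NULL-aware teardown: release_args exists only when a RELEASE request will actually be sent, and every consumer checks for NULL. A minimal sketch in plain C with invented names:

#include <stdio.h>
#include <stdlib.h>

struct release_args { int fh; };

struct fuse_file_like {
        struct release_args *release_args; /* NULL: server never implemented open */
};

static struct fuse_file_like *file_alloc(int need_release)
{
        struct fuse_file_like *ff = calloc(1, sizeof(*ff));

        if (!ff)
                return NULL;
        if (need_release) {
                ff->release_args = calloc(1, sizeof(*ff->release_args));
                if (!ff->release_args) {
                        free(ff);
                        return NULL;
                }
        }
        return ff;
}

static void file_put(struct fuse_file_like *ff)
{
        if (!ff)
                return;
        if (ff->release_args)
                printf("sending RELEASE for fh %d\n", ff->release_args->fh);
        /* no release_args: nothing to send, mirroring the !args branch */
        free(ff->release_args);
        free(ff);
}

int main(void)
{
        file_put(file_alloc(1));        /* open supported: RELEASE goes out */
        file_put(file_alloc(0));        /* no-open case: no allocation, no request */
        return 0;
}
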
diff --git a/queue-6.6/fuse-fix-livelock-in-synchronous-file-put-from-fuseblk-workers.patch b/queue-6.6/fuse-fix-livelock-in-synchronous-file-put-from-fuseblk-workers.patch
new file mode 100644 (file)
index 0000000..e75cda4
--- /dev/null
@@ -0,0 +1,94 @@
+From stable+bounces-188826-greg=kroah.com@vger.kernel.org Tue Oct 21 22:16:59 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 21 Oct 2025 16:16:19 -0400
+Subject: fuse: fix livelock in synchronous file put from fuseblk workers
+To: stable@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Miklos Szeredi <mszeredi@redhat.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251021201619.2922630-2-sashal@kernel.org>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 26e5c67deb2e1f42a951f022fdf5b9f7eb747b01 ]
+
+I observed a hang when running generic/323 against a fuseblk server.
+This test opens a file, initiates a lot of AIO writes to that file
+descriptor, and closes the file descriptor before the writes complete.
+Unsurprisingly, the AIO exerciser threads are mostly stuck waiting for
+responses from the fuseblk server:
+
+# cat /proc/372265/task/372313/stack
+[<0>] request_wait_answer+0x1fe/0x2a0 [fuse]
+[<0>] __fuse_simple_request+0xd3/0x2b0 [fuse]
+[<0>] fuse_do_getattr+0xfc/0x1f0 [fuse]
+[<0>] fuse_file_read_iter+0xbe/0x1c0 [fuse]
+[<0>] aio_read+0x130/0x1e0
+[<0>] io_submit_one+0x542/0x860
+[<0>] __x64_sys_io_submit+0x98/0x1a0
+[<0>] do_syscall_64+0x37/0xf0
+[<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+But the /weird/ part is that the fuseblk server threads are waiting for
+responses from itself:
+
+# cat /proc/372210/task/372232/stack
+[<0>] request_wait_answer+0x1fe/0x2a0 [fuse]
+[<0>] __fuse_simple_request+0xd3/0x2b0 [fuse]
+[<0>] fuse_file_put+0x9a/0xd0 [fuse]
+[<0>] fuse_release+0x36/0x50 [fuse]
+[<0>] __fput+0xec/0x2b0
+[<0>] task_work_run+0x55/0x90
+[<0>] syscall_exit_to_user_mode+0xe9/0x100
+[<0>] do_syscall_64+0x43/0xf0
+[<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53
+
+The fuseblk server is fuse2fs so there's nothing all that exciting in
+the server itself.  So why is the fuse server calling fuse_file_put?
+The commit message for the fstest sheds some light on that:
+
+"By closing the file descriptor before calling io_destroy, you pretty
+much guarantee that the last put on the ioctx will be done in interrupt
+context (during I/O completion)."
+
+Aha.  AIO fgets a new struct file from the fd when it queues the ioctx.
+The completion of the FUSE_WRITE command from userspace causes the fuse
+server to call the AIO completion function.  The completion puts the
+struct file, queuing a delayed fput to the fuse server task.  When the
+fuse server task returns to userspace, it has to run the delayed fput,
+which in the case of a fuseblk server, it does synchronously.
+
+Sending the FUSE_RELEASE command synchronously from fuse server threads
+is a bad idea because a client program can initiate enough simultaneous
+AIOs such that all the fuse server threads end up in delayed_fput, and
+now there aren't any threads left to handle the queued fuse commands.
+
+Fix this by only using asynchronous fputs when closing files, and leave
+a comment explaining why.
+
+Cc: stable@vger.kernel.org # v2.6.38
+Fixes: 5a18ec176c934c ("fuse: fix hang of single threaded fuseblk filesystem")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fuse/file.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -339,8 +339,14 @@ void fuse_file_release(struct inode *ino
+        * Make the release synchronous if this is a fuseblk mount,
+        * synchronous RELEASE is allowed (and desirable) in this case
+        * because the server can be trusted not to screw up.
++       *
++       * Always use the asynchronous file put because the current thread
++       * might be the fuse server.  This can happen if a process starts some
++       * aio and closes the fd before the aio completes.  Since aio takes its
++       * own ref to the file, the IO completion has to drop the ref, which is
++       * how the fuse server can end up closing its clients' files.
+        */
+-      fuse_file_put(ff, ff->fm->fc->destroy);
++      fuse_file_put(ff, false);
+ }
+ void fuse_release_common(struct file *file, bool isdir)
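
A compact sketch of why the synchronous final put is dangerous from a server thread, with invented names and the blocking reduced to a printout:

#include <stdbool.h>
#include <stdio.h>

struct fuse_file_like { int refcount; };

static void send_release_async(void)
{
        printf("RELEASE queued; caller does not wait for a reply\n");
}

static void send_release_sync(void)
{
        /* If the caller is itself a server thread, the reply it waits for
         * can only be produced by a server thread.  With enough simultaneous
         * AIO completions, every server thread parks here and none is left
         * to answer: the livelock described above. */
        printf("RELEASE sent; blocking until the server replies\n");
}

static void file_put(struct fuse_file_like *ff, bool sync)
{
        if (--ff->refcount)
                return;
        if (sync)
                send_release_sync();    /* old fuseblk behaviour */
        else
                send_release_async();   /* behaviour after the fix */
}

int main(void)
{
        struct fuse_file_like ff = { .refcount = 1 };

        file_put(&ff, false);           /* the patch hardcodes the async path */
        return 0;
}
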
diff --git a/queue-6.6/s390-cio-update-purge-function-to-unregister-the-unused-subchannels.patch b/queue-6.6/s390-cio-update-purge-function-to-unregister-the-unused-subchannels.patch
new file mode 100644 (file)
index 0000000..4fb36fe
--- /dev/null
@@ -0,0 +1,95 @@
+From stable+bounces-189106-greg=kroah.com@vger.kernel.org Thu Oct 23 13:49:33 2025
+From: Vineeth Vijayan <vneethv@linux.ibm.com>
+Date: Thu, 23 Oct 2025 13:49:13 +0200
+Subject: s390/cio: Update purge function to unregister the unused subchannels
+To: stable@vger.kernel.org
+Cc: hca@linux.ibm.com, oberpar@linux.ibm.com
+Message-ID: <20251023114913.2143450-1-vneethv@linux.ibm.com>
+
+From: Vineeth Vijayan <vneethv@linux.ibm.com>
+
+commit 9daa5a8795865f9a3c93d8d1066785b07ded6073 upstream.
+
+Starting with 'commit 2297791c92d0 ("s390/cio: dont unregister
+subchannel from child-drivers")', cio no longer unregisters
+subchannels when the attached device is invalid or unavailable.
+
+As an unintended side-effect, the cio_ignore purge function no longer
+removes subchannels for devices on the cio_ignore list if no CCW device
+is attached. This situation occurs when a CCW device is non-operational
+or unavailable
+
+To ensure the same outcome of the purge function as when the
+current cio_ignore list had been active during boot, update the purge
+function to remove I/O subchannels without working CCW devices if the
+associated device number is found on the cio_ignore list.
+
+Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers")
+Suggested-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
+Signed-off-by: Vineeth Vijayan <vneethv@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/cio/device.c |   39 +++++++++++++++++++++++++--------------
+ 1 file changed, 25 insertions(+), 14 deletions(-)
+
+--- a/drivers/s390/cio/device.c
++++ b/drivers/s390/cio/device.c
+@@ -1318,23 +1318,34 @@ void ccw_device_schedule_recovery(void)
+       spin_unlock_irqrestore(&recovery_lock, flags);
+ }
+-static int purge_fn(struct device *dev, void *data)
++static int purge_fn(struct subchannel *sch, void *data)
+ {
+-      struct ccw_device *cdev = to_ccwdev(dev);
+-      struct ccw_dev_id *id = &cdev->private->dev_id;
+-      struct subchannel *sch = to_subchannel(cdev->dev.parent);
+-
+-      spin_lock_irq(cdev->ccwlock);
+-      if (is_blacklisted(id->ssid, id->devno) &&
+-          (cdev->private->state == DEV_STATE_OFFLINE) &&
+-          (atomic_cmpxchg(&cdev->private->onoff, 0, 1) == 0)) {
+-              CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", id->ssid,
+-                            id->devno);
++      struct ccw_device *cdev;
++
++      spin_lock_irq(sch->lock);
++      if (sch->st != SUBCHANNEL_TYPE_IO || !sch->schib.pmcw.dnv)
++              goto unlock;
++
++      if (!is_blacklisted(sch->schid.ssid, sch->schib.pmcw.dev))
++              goto unlock;
++
++      cdev = sch_get_cdev(sch);
++      if (cdev) {
++              if (cdev->private->state != DEV_STATE_OFFLINE)
++                      goto unlock;
++
++              if (atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0)
++                      goto unlock;
+               ccw_device_sched_todo(cdev, CDEV_TODO_UNREG);
+-              css_sched_sch_todo(sch, SCH_TODO_UNREG);
+               atomic_set(&cdev->private->onoff, 0);
+       }
+-      spin_unlock_irq(cdev->ccwlock);
++
++      css_sched_sch_todo(sch, SCH_TODO_UNREG);
++      CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x%s\n", sch->schid.ssid,
++                    sch->schib.pmcw.dev, cdev ? "" : " (no cdev)");
++
++unlock:
++      spin_unlock_irq(sch->lock);
+       /* Abort loop in case of pending signal. */
+       if (signal_pending(current))
+               return -EINTR;
+@@ -1350,7 +1361,7 @@ static int purge_fn(struct device *dev,
+ int ccw_purge_blacklisted(void)
+ {
+       CIO_MSG_EVENT(2, "ccw: purging blacklisted devices\n");
+-      bus_for_each_dev(&ccw_bus_type, NULL, NULL, purge_fn);
++      for_each_subchannel_staged(purge_fn, NULL, NULL);
+       return 0;
+ }
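
Stripped of the CIO details, the new purge logic walks subchannels and decides per subchannel, treating an attached CCW device as optional rather than required. A minimal sketch with invented field names:

#include <stdio.h>

struct subch_like {
        int is_io;              /* SUBCHANNEL_TYPE_IO analogue */
        int dev_valid;          /* schib.pmcw.dnv analogue */
        int blacklisted;        /* is_blacklisted() analogue */
        int has_cdev;           /* sch_get_cdev() != NULL analogue */
};

/* analogue of the new purge_fn(): decide per subchannel, cdev optional */
static int purge_one(const struct subch_like *sch)
{
        if (!sch->is_io || !sch->dev_valid)
                return 0;       /* no usable device number to check */
        if (!sch->blacklisted)
                return 0;       /* not on the cio_ignore list */
        printf("unregistering subchannel%s\n",
               sch->has_cdev ? "" : " (no cdev)");
        return 0;               /* nonzero would abort the walk, like -EINTR */
}

int main(void)
{
        const struct subch_like chans[] = {
                { 1, 1, 1, 1 }, /* blacklisted, cdev attached: purged */
                { 1, 1, 1, 0 }, /* blacklisted, no cdev: now also purged */
                { 1, 1, 0, 1 }, /* not blacklisted: kept */
        };

        for (unsigned int i = 0; i < sizeof(chans) / sizeof(chans[0]); i++)
                if (purge_one(&chans[i]))
                        break;
        return 0;
}
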
diff --git a/queue-6.6/series b/queue-6.6/series
index 6aa35fdf3bd58461680f0846ddd5a6649ba020f7..d293453b1bf15457cbe315a1ff55b6061aca6b0b 100644 (file)
--- a/queue-6.6/series
@@ -74,3 +74,10 @@ dt-bindings-usb-dwc3-imx8mp-dma-range-is-required-only-for-imx8mp.patch
 serial-8250_dw-handle-reset-control-deassert-error.patch
 serial-8250_exar-add-support-for-advantech-2-port-card-with-device-id-0x0018.patch
 serial-8250_mtk-enable-baud-clock-and-manage-in-runtime-pm.patch
+devcoredump-fix-circular-locking-dependency-with-devcd-mutex.patch
+xfs-always-warn-about-deprecated-mount-options.patch
+fs-notify-call-exportfs_encode_fid-with-s_umount.patch
+x86-resctrl-fix-miscount-of-bandwidth-event-when-reactivating-previously-unavailable-rmid.patch
+s390-cio-update-purge-function-to-unregister-the-unused-subchannels.patch
+fuse-allocate-ff-release_args-only-if-release-is-needed.patch
+fuse-fix-livelock-in-synchronous-file-put-from-fuseblk-workers.patch
diff --git a/queue-6.6/x86-resctrl-fix-miscount-of-bandwidth-event-when-reactivating-previously-unavailable-rmid.patch b/queue-6.6/x86-resctrl-fix-miscount-of-bandwidth-event-when-reactivating-previously-unavailable-rmid.patch
new file mode 100644 (file)
index 0000000..d8c2c7b
--- /dev/null
@@ -0,0 +1,134 @@
+From stable+bounces-189148-greg=kroah.com@vger.kernel.org Thu Oct 23 18:15:45 2025
+From: Babu Moger <babu.moger@amd.com>
+Date: Thu, 23 Oct 2025 11:12:40 -0500
+Subject: x86/resctrl: Fix miscount of bandwidth event when reactivating previously unavailable RMID
+To: <stable@vger.kernel.org>
+Message-ID: <20251023161240.75240-1-babu.moger@amd.com>
+
+From: Babu Moger <babu.moger@amd.com>
+
+Users can create as many monitoring groups as the number of RMIDs supported
+by the hardware. However, on AMD systems, only a limited number of RMIDs
+are guaranteed to be actively tracked by the hardware. RMIDs that exceed
+this limit are placed in an "Unavailable" state.
+
+When a bandwidth counter is read for such an RMID, the hardware sets
+MSR_IA32_QM_CTR.Unavailable (bit 62). When such an RMID starts being tracked
+again the hardware counter is reset to zero. MSR_IA32_QM_CTR.Unavailable
+remains set on first read after tracking re-starts and is clear on all
+subsequent reads as long as the RMID is tracked.
+
+resctrl miscounts the bandwidth events after an RMID transitions from the
+"Unavailable" state back to being tracked. This happens because when the
+hardware starts counting again after resetting the counter to zero, resctrl
+in turn compares the new count against the counter value stored from the
+previous time the RMID was tracked.
+
+This results in resctrl computing an event value that is either undercounting
+(when new counter is more than stored counter) or a mistaken overflow (when
+new counter is less than stored counter).
+
+Reset the stored value (arch_mbm_state::prev_msr) of MSR_IA32_QM_CTR to
+zero whenever the RMID is in the "Unavailable" state to ensure accurate
+counting after the RMID resets to zero when it starts to be tracked again.
+
+Example scenario that results in mistaken overflow
+==================================================
+1. The resctrl filesystem is mounted, and a task is assigned to a
+   monitoring group.
+
+   $mount -t resctrl resctrl /sys/fs/resctrl
+   $mkdir /sys/fs/resctrl/mon_groups/test1/
+   $echo 1234 > /sys/fs/resctrl/mon_groups/test1/tasks
+
+   $cat /sys/fs/resctrl/mon_groups/test1/mon_data/mon_L3_*/mbm_total_bytes
+   21323            <- Total bytes on domain 0
+   "Unavailable"    <- Total bytes on domain 1
+
+   Task is running on domain 0. Counter on domain 1 is "Unavailable".
+
+2. The task runs on domain 0 for a while and then moves to domain 1. The
+   counter starts incrementing on domain 1.
+
+   $cat /sys/fs/resctrl/mon_groups/test1/mon_data/mon_L3_*/mbm_total_bytes
+   7345357          <- Total bytes on domain 0
+   4545             <- Total bytes on domain 1
+
+3. At some point, the RMID in domain 0 transitions to the "Unavailable"
+   state because the task is no longer executing in that domain.
+
+   $cat /sys/fs/resctrl/mon_groups/test1/mon_data/mon_L3_*/mbm_total_bytes
+   "Unavailable"    <- Total bytes on domain 0
+   434341           <- Total bytes on domain 1
+
+4. Since the task continues to migrate between domains, it may eventually
+   return to domain 0.
+
+   $cat /sys/fs/resctrl/mon_groups/test1/mon_data/mon_L3_*/mbm_total_bytes
+   17592178699059  <- Overflow on domain 0
+   3232332         <- Total bytes on domain 1
+
+In this case, the RMID on domain 0 transitions from "Unavailable" state to
+active state. The hardware sets MSR_IA32_QM_CTR.Unavailable (bit 62) when
+the counter is read and begins tracking the RMID counting from 0.
+
+Subsequent reads succeed but return a value smaller than the previously
+saved MSR value (7345357). Consequently, the resctrl's overflow logic is
+triggered, it compares the previous value (7345357) with the new, smaller
+value and incorrectly interprets this as a counter overflow, adding a large
+delta.
+
+In reality, this is a false positive: the counter did not overflow but was
+simply reset when the RMID transitioned from "Unavailable" back to active
+state.
+
+Here is the text from APM [1] available from [2].
+
+"In PQOS Version 2.0 or higher, the MBM hardware will set the U bit on the
+first QM_CTR read when it begins tracking an RMID that it was not
+previously tracking. The U bit will be zero for all subsequent reads from
+that RMID while it is still tracked by the hardware. Therefore, a QM_CTR
+read with the U bit set when that RMID is in use by a processor can be
+considered 0 when calculating the difference with a subsequent read."
+
+[1] AMD64 Architecture Programmer's Manual Volume 2: System Programming
+    Publication # 24593 Revision 3.41 section 19.3.3 Monitoring L3 Memory
+    Bandwidth (MBM).
+
+  [ bp: Split commit message into smaller paragraph chunks for better
+    consumption. ]
+
+Fixes: 4d05bf71f157d ("x86/resctrl: Introduce AMD QOS feature")
+Signed-off-by: Babu Moger <babu.moger@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
+Tested-by: Reinette Chatre <reinette.chatre@intel.com>
+Cc: stable@vger.kernel.org # needs adjustments for <= v6.17
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2]
+(cherry picked from commit 15292f1b4c55a3a7c940dbcb6cb8793871ed3d92)
+[babu.moger@amd.com: Fix conflict for v6.6 stable]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/resctrl/monitor.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/cpu/resctrl/monitor.c
++++ b/arch/x86/kernel/cpu/resctrl/monitor.c
+@@ -241,11 +241,15 @@ int resctrl_arch_rmid_read(struct rdt_re
+       if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+               return -EINVAL;
++      am = get_arch_mbm_state(hw_dom, rmid, eventid);
++
+       ret = __rmid_read(rmid, eventid, &msr_val);
+-      if (ret)
++      if (ret) {
++              if (am && ret == -EINVAL)
++                      am->prev_msr = 0;
+               return ret;
++      }
+-      am = get_arch_mbm_state(hw_dom, rmid, eventid);
+       if (am) {
+               am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+                                                hw_res->mbm_width);
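
The miscount is easy to reproduce with the modular-difference helper resctrl uses. The sketch below mirrors the shape of mbm_overflow_count(); the 44-bit width and the sample readings are assumptions for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* same shape as the kernel helper: difference modulo 2^width */
static uint64_t mbm_overflow_count(uint64_t prev, uint64_t cur,
                                   unsigned int width)
{
        uint64_t shift = 64 - width;

        return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
        const unsigned int width = 44;  /* assumed counter width */
        uint64_t prev = 7345357;        /* value stored before "Unavailable" */
        uint64_t cur = 12345;           /* hardware restarted from zero */

        /* old behaviour: stale prev looks like a wrap, delta ~= 2^44 */
        printf("stale prev: delta = %" PRIu64 "\n",
               mbm_overflow_count(prev, cur, width));
        /* fixed behaviour: prev_msr zeroed, delta is just the new count */
        printf("reset prev: delta = %" PRIu64 "\n",
               mbm_overflow_count(0, cur, width));
        return 0;
}

With prev left stale, the helper reports a delta of roughly 2^44, the mistaken overflow from step 4 above; with prev_msr zeroed during the "Unavailable" state, the first delta after re-tracking is simply the new raw count.
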
diff --git a/queue-6.6/xfs-always-warn-about-deprecated-mount-options.patch b/queue-6.6/xfs-always-warn-about-deprecated-mount-options.patch
new file mode 100644 (file)
index 0000000..f8f5748
--- /dev/null
@@ -0,0 +1,93 @@
+From stable+bounces-189890-greg=kroah.com@vger.kernel.org Sun Oct 26 23:50:17 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Oct 2025 18:50:08 -0400
+Subject: xfs: always warn about deprecated mount options
+To: stable@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Christoph Hellwig <hch@lst.de>, Carlos Maiolino <cmaiolino@redhat.com>, Carlos Maiolino <cem@kernel.org>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20251026225008.272115-1-sashal@kernel.org>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 630785bfbe12c3ee3ebccd8b530a98d632b7e39d ]
+
+The deprecation of the 'attr2' mount option in 6.18 wasn't entirely
+successful because nobody noticed that the kernel never printed a
+warning about attr2 being set in fstab if the only xfs filesystem is the
+root fs; the initramfs mounts the root fs with no mount options; and the
+init scripts only conveyed the fstab options by remounting the root fs.
+
+Fix this by making it complain all the time.
+
+Cc: stable@vger.kernel.org # v5.13
+Fixes: 92cf7d36384b99 ("xfs: Skip repetitive warnings about mount options")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Signed-off-by: Carlos Maiolino <cem@kernel.org>
+[ Update existing xfs_fs_warn_deprecated() callers ]
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_super.c |   33 +++++++++++++++++++++------------
+ 1 file changed, 21 insertions(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1230,16 +1230,25 @@ suffix_kstrtoint(
+ static inline void
+ xfs_fs_warn_deprecated(
+       struct fs_context       *fc,
+-      struct fs_parameter     *param,
+-      uint64_t                flag,
+-      bool                    value)
++      struct fs_parameter     *param)
+ {
+-      /* Don't print the warning if reconfiguring and current mount point
+-       * already had the flag set
++      /*
++       * Always warn about someone passing in a deprecated mount option.
++       * Previously we wouldn't print the warning if we were reconfiguring
++       * and current mount point already had the flag set, but that was not
++       * the right thing to do.
++       *
++       * Many distributions mount the root filesystem with no options in the
++       * initramfs and rely on mount -a to remount the root fs with the
++       * options in fstab.  However, the old behavior meant that there would
++       * never be a warning about deprecated mount options for the root fs in
++       * /etc/fstab.  On a single-fs system, that means no warning at all.
++       *
++       * Compounding this problem are distribution scripts that copy
++       * /proc/mounts to fstab, which means that we can't remove mount
++       * options unless we're 100% sure they have only ever been advertised
++       * in /proc/mounts in response to explicitly provided mount options.
+        */
+-      if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
+-            !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
+-              return;
+       xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
+ }
+@@ -1378,19 +1387,19 @@ xfs_fs_parse_param(
+ #endif
+       /* Following mount options will be removed in September 2025 */
+       case Opt_ikeep:
+-              xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
++              xfs_fs_warn_deprecated(fc, param);
+               parsing_mp->m_features |= XFS_FEAT_IKEEP;
+               return 0;
+       case Opt_noikeep:
+-              xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
++              xfs_fs_warn_deprecated(fc, param);
+               parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
+               return 0;
+       case Opt_attr2:
+-              xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
++              xfs_fs_warn_deprecated(fc, param);
+               parsing_mp->m_features |= XFS_FEAT_ATTR2;
+               return 0;
+       case Opt_noattr2:
+-              xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
++              xfs_fs_warn_deprecated(fc, param);
+               parsing_mp->m_features |= XFS_FEAT_NOATTR2;
+               return 0;
+       default:
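
The behavioural change reduces to deleting an early return. A sketch of the old and new warning logic with stand-in names, not the real xfs helpers:

#include <stdbool.h>
#include <stdio.h>

/* old helper: stayed silent on remount when the feature bit already matched */
static void warn_deprecated_old(const char *key, bool reconfigure,
                                bool already_set)
{
        if (reconfigure && already_set)
                return; /* the path that swallowed the root-fs warning */
        fprintf(stderr, "%s mount option is deprecated.\n", key);
}

/* new helper: one unconditional warning per parsed deprecated option */
static void warn_deprecated_new(const char *key)
{
        fprintf(stderr, "%s mount option is deprecated.\n", key);
}

int main(void)
{
        /* initramfs mounts root with no options, then init remounts with
         * the fstab options: the old helper printed nothing in that
         * sequence, the new one always warns */
        warn_deprecated_old("attr2", true, true);       /* silent */
        warn_deprecated_new("attr2");                   /* warns */
        return 0;
}
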