--- /dev/null
+From 95b2e31e1752494d477c5da89d6789f769b0d67b Mon Sep 17 00:00:00 2001
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+Date: Sun, 15 Jun 2025 04:23:53 +0800
+Subject: bcache: remove unnecessary select MIN_HEAP
+
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+
+commit 95b2e31e1752494d477c5da89d6789f769b0d67b upstream.
+
+After reverting the transition to the generic min heap library, bcache no
+longer depends on MIN_HEAP. The select entry can be removed to reduce
+code size and shrink the kernel's attack surface.
+
+This change effectively reverts the bcache-related part of commit
+92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API
+functions").
+
+This is part of a series of changes to address a performance regression
+caused by the use of the generic min_heap implementation.
+
+As reported by Robert, bcache now suffers from latency spikes, with P100
+(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes.
+These regressions degrade bcache's effectiveness as a low-latency cache
+layer and lead to frequent timeouts and application stalls in production
+environments.
+
+Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
+Link: https://lkml.kernel.org/r/20250614202353.1632957-4-visitorckw@gmail.com
+Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap")
+Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions")
+Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
+Reported-by: Robert Pang <robertpang@google.com>
+Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
+Acked-by: Coly Li <colyli@kernel.org>
+Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
+Cc: Kent Overstreet <kent.overstreet@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/bcache/Kconfig | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/md/bcache/Kconfig
++++ b/drivers/md/bcache/Kconfig
+@@ -5,7 +5,6 @@ config BCACHE
+ select BLOCK_HOLDER_DEPRECATED if SYSFS
+ select CRC64
+ select CLOSURES
+- select MIN_HEAP
+ help
+ Allows a block device to be used as cache for other devices; uses
+ a btree for indexing and the layout is optimized for SSDs.
--- /dev/null
+From 042bb9603c44620dce98717a2d23235ca57a00d7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= <frederic.danis@collabora.com>
+Date: Thu, 12 Jun 2025 09:50:34 +0200
+Subject: Bluetooth: L2CAP: Fix L2CAP MTU negotiation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Frédéric Danis <frederic.danis@collabora.com>
+
+commit 042bb9603c44620dce98717a2d23235ca57a00d7 upstream.
+
+OBEX download from an iPhone is currently slow because the packet size used
+to transfer data doesn't follow the MTU negotiated during the L2CAP
+connection, i.e. 672 bytes are used instead of 32767:
+
+ < ACL Data TX: Handle 11 flags 0x00 dlen 12
+ L2CAP: Connection Request (0x02) ident 18 len 4
+ PSM: 4103 (0x1007)
+ Source CID: 72
+ > ACL Data RX: Handle 11 flags 0x02 dlen 16
+ L2CAP: Connection Response (0x03) ident 18 len 8
+ Destination CID: 14608
+ Source CID: 72
+ Result: Connection successful (0x0000)
+ Status: No further information available (0x0000)
+ < ACL Data TX: Handle 11 flags 0x00 dlen 27
+ L2CAP: Configure Request (0x04) ident 20 len 19
+ Destination CID: 14608
+ Flags: 0x0000
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 32767
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 63
+ Max transmit: 3
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ > ACL Data RX: Handle 11 flags 0x02 dlen 26
+ L2CAP: Configure Request (0x04) ident 72 len 18
+ Destination CID: 72
+ Flags: 0x0000
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 32
+ Max transmit: 255
+ Retransmission timeout: 0
+ Monitor timeout: 0
+ Maximum PDU size: 65527
+ Option: Frame Check Sequence (0x05) [mandatory]
+ FCS: 16-bit FCS (0x01)
+ < ACL Data TX: Handle 11 flags 0x00 dlen 29
+ L2CAP: Configure Response (0x05) ident 72 len 21
+ Source CID: 14608
+ Flags: 0x0000
+ Result: Success (0x0000)
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 672
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 32
+ Max transmit: 255
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ > ACL Data RX: Handle 11 flags 0x02 dlen 32
+ L2CAP: Configure Response (0x05) ident 20 len 24
+ Source CID: 72
+ Flags: 0x0000
+ Result: Success (0x0000)
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 32767
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 63
+ Max transmit: 3
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ Option: Frame Check Sequence (0x05) [mandatory]
+ FCS: 16-bit FCS (0x01)
+ ...
+ > ACL Data RX: Handle 11 flags 0x02 dlen 680
+ Channel: 72 len 676 ctrl 0x0202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8}
+ I-frame: Unsegmented TxSeq 1 ReqSeq 2
+ < ACL Data TX: Handle 11 flags 0x00 dlen 13
+ Channel: 14608 len 9 ctrl 0x0204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8}
+ I-frame: Unsegmented TxSeq 2 ReqSeq 2
+ > ACL Data RX: Handle 11 flags 0x02 dlen 680
+ Channel: 72 len 676 ctrl 0x0304 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8}
+ I-frame: Unsegmented TxSeq 2 ReqSeq 3
+
+The MTUs are negotiated for each direction. In these traces it is 32767
+for iPhone->localhost and no MTU for localhost->iPhone, which, based on
+'4.4 L2CAP_CONFIGURATION_REQ' (Core specification v5.4, Vol. 3, Part
+A):
+
+ The only parameters that should be included in the
+ L2CAP_CONFIGURATION_REQ packet are those that require different
+ values than the default or previously agreed values.
+ ...
+ Any missing configuration parameters are assumed to have their
+ most recently explicitly or implicitly accepted values.
+
+and '5.1 Maximum transmission unit (MTU)':
+
+ If the remote device sends a positive L2CAP_CONFIGURATION_RSP
+ packet it should include the actual MTU to be used on this channel
+ for traffic flowing into the local device.
+ ...
+ The default value is 672 octets.
+
+is set by BlueZ to 672 bytes.
+
+It seems that the iPhone used the lowest negotiated value to transfer data
+to localhost instead of the value negotiated for the incoming direction.
+
+This could be fixed by using the MTU negotiated for the other direction,
+if it exists, in the L2CAP_CONFIGURATION_RSP.
+This allows the use of segmented packets, as in the following traces:
+
+ < ACL Data TX: Handle 11 flags 0x00 dlen 12
+ L2CAP: Connection Request (0x02) ident 22 len 4
+ PSM: 4103 (0x1007)
+ Source CID: 72
+ < ACL Data TX: Handle 11 flags 0x00 dlen 27
+ L2CAP: Configure Request (0x04) ident 24 len 19
+ Destination CID: 2832
+ Flags: 0x0000
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 32767
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 63
+ Max transmit: 3
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ > ACL Data RX: Handle 11 flags 0x02 dlen 26
+ L2CAP: Configure Request (0x04) ident 15 len 18
+ Destination CID: 72
+ Flags: 0x0000
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 32
+ Max transmit: 255
+ Retransmission timeout: 0
+ Monitor timeout: 0
+ Maximum PDU size: 65527
+ Option: Frame Check Sequence (0x05) [mandatory]
+ FCS: 16-bit FCS (0x01)
+ < ACL Data TX: Handle 11 flags 0x00 dlen 29
+ L2CAP: Configure Response (0x05) ident 15 len 21
+ Source CID: 2832
+ Flags: 0x0000
+ Result: Success (0x0000)
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 32767
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 32
+ Max transmit: 255
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ > ACL Data RX: Handle 11 flags 0x02 dlen 32
+ L2CAP: Configure Response (0x05) ident 24 len 24
+ Source CID: 72
+ Flags: 0x0000
+ Result: Success (0x0000)
+ Option: Maximum Transmission Unit (0x01) [mandatory]
+ MTU: 32767
+ Option: Retransmission and Flow Control (0x04) [mandatory]
+ Mode: Enhanced Retransmission (0x03)
+ TX window size: 63
+ Max transmit: 3
+ Retransmission timeout: 2000
+ Monitor timeout: 12000
+ Maximum PDU size: 1009
+ Option: Frame Check Sequence (0x05) [mandatory]
+ FCS: 16-bit FCS (0x01)
+ ...
+ > ACL Data RX: Handle 11 flags 0x02 dlen 1009
+ Channel: 72 len 1005 ctrl 0x4202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8}
+ I-frame: Start (len 21884) TxSeq 1 ReqSeq 2
+ > ACL Data RX: Handle 11 flags 0x02 dlen 1009
+ Channel: 72 len 1005 ctrl 0xc204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8}
+ I-frame: Continuation TxSeq 2 ReqSeq 2
+
+This has been tested with kernel 5.4 and BlueZ 5.77.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Frédéric Danis <frederic.danis@collabora.com>
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bluetooth/l2cap_core.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/net/bluetooth/l2cap_core.c
++++ b/net/bluetooth/l2cap_core.c
+@@ -3415,7 +3415,7 @@ static int l2cap_parse_conf_req(struct l
+ struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+ struct l2cap_conf_efs efs;
+ u8 remote_efs = 0;
+- u16 mtu = L2CAP_DEFAULT_MTU;
++ u16 mtu = 0;
+ u16 result = L2CAP_CONF_SUCCESS;
+ u16 size;
+
+@@ -3520,6 +3520,13 @@ done:
+ /* Configure output options and let the other side know
+ * which ones we don't like. */
+
++ /* If MTU is not provided in configure request, use the most recently
++ * explicitly or implicitly accepted value for the other direction,
++ * or the default value.
++ */
++ if (mtu == 0)
++ mtu = chan->imtu ? chan->imtu : L2CAP_DEFAULT_MTU;
++
+ if (mtu < L2CAP_DEFAULT_MIN_MTU)
+ result = L2CAP_CONF_UNACCEPT;
+ else {
--- /dev/null
+From 3ca864de852bc91007b32d2a0d48993724f4abad Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 28 May 2025 12:28:27 +0100
+Subject: btrfs: fix a race between renames and directory logging
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 3ca864de852bc91007b32d2a0d48993724f4abad upstream.
+
+There is a race between a rename and directory inode logging: if the race
+happens and we crash/power fail before the rename completes, then the next
+time the filesystem is mounted, the log replay code will end up deleting the
+file that was being renamed.
+
+This is best explained by a step-by-step analysis of an interleaving of
+operations that leads to this situation.
+
+Consider the initial conditions:
+
+1) We are at transaction N;
+
+2) We have directories A and B created in a past transaction (< N);
+
+3) We have inode X corresponding to a file that has 2 hardlinks, one in
+ directory A and the other in directory B, so we'll name them as
+ "A/foo_link1" and "B/foo_link2". Both hard links were persisted in a
+ past transaction (< N);
+
+4) We have inode Y corresponding to a file that has a single hard link and
+   is located in directory A, we'll name it "A/bar". This file was also
+   persisted in a past transaction (< N).
+
+The steps leading to a file loss are the following and for all of them we
+are under transaction N:
+
+ 1) Link "A/foo_link1" is removed, so inode's X last_unlink_trans field
+ is updated to N, through btrfs_unlink() -> btrfs_record_unlink_dir();
+
+ 2) Task A starts a rename for inode Y, with the goal of renaming from
+ "A/bar" to "A/baz", so we enter btrfs_rename();
+
+ 3) Task A inserts the new BTRFS_INODE_REF_KEY for inode Y by calling
+ btrfs_insert_inode_ref();
+
+ 4) Because the rename happens in the same directory, we don't set the
+    last_unlink_trans field of directory A's inode to the current
+    transaction id, that is, we don't call btrfs_record_unlink_dir();
+
+ 5) Task A then removes the entries from directory A (BTRFS_DIR_ITEM_KEY
+ and BTRFS_DIR_INDEX_KEY items) when calling __btrfs_unlink_inode()
+ (actually the dir index item is added as a delayed item, but the
+ effect is the same);
+
+ 6) Now before task A adds the new entry "A/baz" to directory A by
+ calling btrfs_add_link(), another task, task B is logging inode X;
+
+ 7) Task B starts a fsync of inode X and after logging inode X, at
+ btrfs_log_inode_parent() it calls btrfs_log_all_parents(), since
+ inode X has a last_unlink_trans value of N, set at in step 1;
+
+ 8) At btrfs_log_all_parents() we search for all parent directories of
+    inode X using the commit root, so we find directories A and B and log
+    them. But when logging directory A, we don't have a dir index item for
+    inode Y anymore, neither for the old name "A/bar" nor for the new name
+    "A/baz", since the rename has deleted the old name but has not yet
+    inserted the new name - task A hasn't yet called btrfs_add_link() to
+    do that.
+
+    Note that logging directory A doesn't fall back to a transaction
+ commit because its last_unlink_trans has a lower value than the
+ current transaction's id (see step 4);
+
+ 9) Task B finishes logging directories A and B and gets back to
+ btrfs_sync_file() where it calls btrfs_sync_log() to persist the log
+ tree;
+
+10) Task B successfully persisted the log tree, btrfs_sync_log() completed
+ with success, and a power failure happened.
+
+ We have a log tree without any directory entry for inode Y, so the
+ log replay code deletes the entry for inode Y, name "A/bar", from the
+ subvolume tree since it doesn't exist in the log tree and the log
+    tree is authoritative for its index (we logged a BTRFS_DIR_LOG_INDEX_KEY
+ item that covers the index range for the dentry that corresponds to
+ "A/bar").
+
+ Since there's no other hard link for inode Y and the log replay code
+ deletes the name "A/bar", the file is lost.
+
+The issue wouldn't happen if task B synced the log only after task A
+called btrfs_log_new_name(), which would update the log with the new name
+for inode Y ("A/bar").
+
+Fix this by pinning the log root during renames before removing the old
+directory entry, and unpinning after btrfs_log_new_name() is called.
+
+Fixes: 259c4b96d78d ("btrfs: stop doing unnecessary log updates during a rename")
+CC: stable@vger.kernel.org # 5.18+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 64 insertions(+), 17 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -7979,6 +7979,7 @@ static int btrfs_rename_exchange(struct
+ int ret;
+ int ret2;
+ bool need_abort = false;
++ bool logs_pinned = false;
+ struct fscrypt_name old_fname, new_fname;
+ struct fscrypt_str *old_name, *new_name;
+
+@@ -8102,6 +8103,31 @@ static int btrfs_rename_exchange(struct
+ inode_inc_iversion(new_inode);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+
++ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
++ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
++ /*
++ * If we are renaming in the same directory (and it's not for
++ * root entries) pin the log early to prevent any concurrent
++ * task from logging the directory after we removed the old
++ * entries and before we add the new entries, otherwise that
++ * task can sync a log without any entry for the inodes we are
++ * renaming and therefore replaying that log, if a power failure
++ * happens after syncing the log, would result in deleting the
++ * inodes.
++ *
++ * If the rename affects two different directories, we want to
++ * make sure the that there's no log commit that contains
++ * updates for only one of the directories but not for the
++ * other.
++ *
++ * If we are renaming an entry for a root, we don't care about
++ * log updates since we called btrfs_set_log_full_commit().
++ */
++ btrfs_pin_log_trans(root);
++ btrfs_pin_log_trans(dest);
++ logs_pinned = true;
++ }
++
+ if (old_dentry->d_parent != new_dentry->d_parent) {
+ btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
+ BTRFS_I(old_inode), true);
+@@ -8173,30 +8199,23 @@ static int btrfs_rename_exchange(struct
+ BTRFS_I(new_inode)->dir_index = new_idx;
+
+ /*
+- * Now pin the logs of the roots. We do it to ensure that no other task
+- * can sync the logs while we are in progress with the rename, because
+- * that could result in an inconsistency in case any of the inodes that
+- * are part of this rename operation were logged before.
++ * Do the log updates for all inodes.
++ *
++ * If either entry is for a root we don't need to update the logs since
++ * we've called btrfs_set_log_full_commit() before.
+ */
+- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+- btrfs_pin_log_trans(root);
+- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+- btrfs_pin_log_trans(dest);
+-
+- /* Do the log updates for all inodes. */
+- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
++ if (logs_pinned) {
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
++ }
+
+- /* Now unpin the logs. */
+- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
++out_fail:
++ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+-out_fail:
++ }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ out_notrans:
+@@ -8246,6 +8265,7 @@ static int btrfs_rename(struct mnt_idmap
+ int ret2;
+ u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
+ struct fscrypt_name old_fname, new_fname;
++ bool logs_pinned = false;
+
+ if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+@@ -8380,6 +8400,29 @@ static int btrfs_rename(struct mnt_idmap
+ inode_inc_iversion(old_inode);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+
++ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
++ /*
++ * If we are renaming in the same directory (and it's not a
++ * root entry) pin the log to prevent any concurrent task from
++ * logging the directory after we removed the old entry and
++ * before we add the new entry, otherwise that task can sync
++ * a log without any entry for the inode we are renaming and
++ * therefore replaying that log, if a power failure happens
++ * after syncing the log, would result in deleting the inode.
++ *
++ * If the rename affects two different directories, we want to
++ * make sure the that there's no log commit that contains
++ * updates for only one of the directories but not for the
++ * other.
++ *
++ * If we are renaming an entry for a root, we don't care about
++ * log updates since we called btrfs_set_log_full_commit().
++ */
++ btrfs_pin_log_trans(root);
++ btrfs_pin_log_trans(dest);
++ logs_pinned = true;
++ }
++
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
+ BTRFS_I(old_inode), true);
+@@ -8444,7 +8487,7 @@ static int btrfs_rename(struct mnt_idmap
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = index;
+
+- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
++ if (logs_pinned)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
+
+@@ -8460,6 +8503,10 @@ static int btrfs_rename(struct mnt_idmap
+ }
+ }
+ out_fail:
++ if (logs_pinned) {
++ btrfs_end_log_trans(root);
++ btrfs_end_log_trans(dest);
++ }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ out_notrans:
--- /dev/null
+From 2dcf838cf5c2f0f4501edaa1680fcad03618d760 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 3 Jun 2025 19:29:01 +0100
+Subject: btrfs: fix invalid inode pointer dereferences during log replay
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 2dcf838cf5c2f0f4501edaa1680fcad03618d760 upstream.
+
+In a few places where we call read_one_inode(), if we get a NULL pointer
+we end up jumping into an error path, or falling through in the case of
+__add_inode_ref(), where we then do something like this:
+
+ iput(&inode->vfs_inode);
+
+which results in an invalid inode pointer that triggers an invalid memory
+access, resulting in a crash.
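+
+For illustration (a minimal sketch, assuming vfs_inode is not the first
+member of struct btrfs_inode):
+
+        inode = read_one_inode(root, key->objectid);    /* may return NULL */
+        ...
+        /*
+         * With inode == NULL, &inode->vfs_inode evaluates to
+         * NULL + offsetof(struct btrfs_inode, vfs_inode), a non-NULL garbage
+         * pointer, so iput() cannot detect the NULL case and dereferences
+         * invalid memory.
+         */
+        iput(&inode->vfs_inode);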
+
+Fix this by making sure we don't do such dereferences.
+
+Fixes: b4c50cbb01a1 ("btrfs: return a btrfs_inode from read_one_inode()")
+CC: stable@vger.kernel.org # 6.15+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 97e933113b82..21d2f3dded51 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -668,15 +668,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ extent_end = ALIGN(start + size,
+ fs_info->sectorsize);
+ } else {
+- ret = 0;
+- goto out;
++ return 0;
+ }
+
+ inode = read_one_inode(root, key->objectid);
+- if (!inode) {
+- ret = -EIO;
+- goto out;
+- }
++ if (!inode)
++ return -EIO;
+
+ /*
+ * first check to see if we already have this extent in the
+@@ -961,7 +958,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ out:
+ kfree(name.name);
+- iput(&inode->vfs_inode);
++ if (inode)
++ iput(&inode->vfs_inode);
+ return ret;
+ }
+
+@@ -1176,8 +1174,8 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+ ret = unlink_inode_for_log_replay(trans,
+ victim_parent,
+ inode, &victim_name);
++ iput(&victim_parent->vfs_inode);
+ }
+- iput(&victim_parent->vfs_inode);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+--
+2.50.0
+
--- /dev/null
+From ae4477f937569d097ca5dbce92a89ba384b49bc6 Mon Sep 17 00:00:00 2001
+From: Mark Harmstone <maharmstone@fb.com>
+Date: Thu, 29 May 2025 10:37:44 +0100
+Subject: btrfs: update superblock's device bytes_used when dropping chunk
+
+From: Mark Harmstone <maharmstone@fb.com>
+
+commit ae4477f937569d097ca5dbce92a89ba384b49bc6 upstream.
+
+Each superblock contains a copy of the device item for that device. In a
+transaction which drops a chunk but doesn't create any new ones, we were
+correctly updating the device item in the chunk tree but not copying
+over the new bytes_used value to the superblock.
+
+This can be seen by doing the following:
+
+ # dd if=/dev/zero of=test bs=4096 count=2621440
+ # mkfs.btrfs test
+ # mount test /root/temp
+
+ # cd /root/temp
+ # for i in {00..10}; do dd if=/dev/zero of=$i bs=4096 count=32768; done
+ # sync
+ # rm *
+ # sync
+ # btrfs balance start -dusage=0 .
+ # sync
+
+ # cd
+ # umount /root/temp
+ # btrfs check test
+
+For btrfs-check to detect this, you will also need my patch at
+https://github.com/kdave/btrfs-progs/pull/991.
+
+Change btrfs_remove_dev_extents() so that it adds the devices to the
+fs_info->post_commit_list if they're not there already. This causes
+btrfs_commit_device_sizes() to be called, which updates the bytes_used
+value in the superblock.
+
+Fixes: bbbf7243d62d ("btrfs: combine device update operations during transaction commit")
+CC: stable@vger.kernel.org # 5.10+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Mark Harmstone <maharmstone@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/volumes.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -3281,6 +3281,12 @@ int btrfs_remove_chunk(struct btrfs_tran
+ device->bytes_used - dev_extent_len);
+ atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
+ btrfs_clear_space_info_full(fs_info);
++
++ if (list_empty(&device->post_commit_list)) {
++ list_add_tail(&device->post_commit_list,
++ &trans->transaction->dev_update_list);
++ }
++
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+ }
--- /dev/null
+From db53805156f1e0aa6d059c0d3f9ac660d4ef3eb4 Mon Sep 17 00:00:00 2001
+From: Heinz Mauelshagen <heinzm@redhat.com>
+Date: Tue, 10 Jun 2025 20:53:30 +0200
+Subject: dm-raid: fix variable in journal device check
+
+From: Heinz Mauelshagen <heinzm@redhat.com>
+
+commit db53805156f1e0aa6d059c0d3f9ac660d4ef3eb4 upstream.
+
+Replace "rdev" with correct loop variable name "r".
+
+Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
+Cc: stable@vger.kernel.org
+Fixes: 63c32ed4afc2 ("dm raid: add raid4/5/6 journaling support")
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-raid.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -2410,7 +2410,7 @@ static int super_init_validation(struct
+ */
+ sb_retrieve_failed_devices(sb, failed_devices);
+ rdev_for_each(r, mddev) {
+- if (test_bit(Journal, &rdev->flags) ||
++ if (test_bit(Journal, &r->flags) ||
+ !r->sb_page)
+ continue;
+ sb2 = page_address(r->sb_page);
--- /dev/null
+From cfb05257ae168a0496c7637e1d9e3ab8a25cbffe Mon Sep 17 00:00:00 2001
+From: Jay Cornwall <jay.cornwall@amd.com>
+Date: Wed, 11 Jun 2025 09:52:14 -0500
+Subject: drm/amdkfd: Fix race in GWS queue scheduling
+
+From: Jay Cornwall <jay.cornwall@amd.com>
+
+commit cfb05257ae168a0496c7637e1d9e3ab8a25cbffe upstream.
+
+q->gws is not updated atomically with qpd->mapped_gws_queue. If a
+runlist is created between pqm_set_gws and update_queue it will
+contain a queue which uses GWS in a process with no GWS allocated.
+This will result in a scheduler hang.
+
+Use q->properties.is_gws which is changed while holding the DQM lock.
+
+Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
+Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit b98370220eb3110e82248e3354e16a489a492cfb)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+@@ -237,7 +237,7 @@ static int pm_map_queues_v9(struct packe
+
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_map_queues__compute_vi;
+- packet->bitfields2.gws_control_queue = q->gws ? 1 : 0;
++ packet->bitfields2.gws_control_queue = q->properties.is_gws ? 1 : 0;
+ packet->bitfields2.extended_engine_sel =
+ extended_engine_sel__mes_map_queues__legacy_engine_sel;
+ packet->bitfields2.queue_type =
--- /dev/null
+From 7cce65f3789e04c0f7668a66563e680d81d54493 Mon Sep 17 00:00:00 2001
+From: Thomas Zimmermann <tzimmermann@suse.de>
+Date: Mon, 24 Mar 2025 10:44:09 +0100
+Subject: drm/ast: Fix comment on modeset lock
+
+From: Thomas Zimmermann <tzimmermann@suse.de>
+
+commit 7cce65f3789e04c0f7668a66563e680d81d54493 upstream.
+
+The ast driver protects the commit tail against concurrent reads
+of the display modes by acquiring a lock. The comment is misleading
+as the lock is not released in atomic_flush, but at the end of the
+commit-tail helper. Rewrite the comment.
+
+Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
+Fixes: 1fe182154984 ("drm/ast: Acquire I/O-register lock in atomic_commit_tail function")
+Cc: Thomas Zimmermann <tzimmermann@suse.de>
+Cc: Jocelyn Falempe <jfalempe@redhat.com>
+Cc: Dave Airlie <airlied@redhat.com>
+Cc: dri-devel@lists.freedesktop.org
+Cc: <stable@vger.kernel.org> # v6.2+
+Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
+Link: https://lore.kernel.org/r/20250324094520.192974-2-tzimmermann@suse.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/ast/ast_mode.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/gpu/drm/ast/ast_mode.c
++++ b/drivers/gpu/drm/ast/ast_mode.c
+@@ -922,9 +922,9 @@ static void ast_mode_config_helper_atomi
+
+ /*
+ * Concurrent operations could possibly trigger a call to
+- * drm_connector_helper_funcs.get_modes by trying to read the
+- * display modes. Protect access to I/O registers by acquiring
+- * the I/O-register lock. Released in atomic_flush().
++ * drm_connector_helper_funcs.get_modes by reading the display
++ * modes. Protect access to registers by acquiring the modeset
++ * lock.
+ */
+ mutex_lock(&ast->modeset_lock);
+ drm_atomic_helper_commit_tail(state);
--- /dev/null
+From 4bfb389a0136a13f0802eeb5e97a0e76d88f77ae Mon Sep 17 00:00:00 2001
+From: Thomas Zimmermann <tzimmermann@suse.de>
+Date: Fri, 28 Mar 2025 10:17:05 +0100
+Subject: drm/cirrus-qemu: Fix pitch programming
+
+From: Thomas Zimmermann <tzimmermann@suse.de>
+
+commit 4bfb389a0136a13f0802eeb5e97a0e76d88f77ae upstream.
+
+Do not set CR1B[6] when programming the pitch. The bit affects VGA
+text mode and is not interpreted by qemu. [1] It has no effect on
+the scanline pitch.
+
+The scanline bit that is set into CR1B[6] belongs in CR13[7], which
+the driver sets up correctly.
+
+This bug goes back to the driver's initial commit.
+
+Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
+Acked-by: Gerd Hoffmann <kraxel@redhat.com>
+Link: https://gitlab.com/qemu-project/qemu/-/blob/stable-9.2/hw/display/cirrus_vga.c?ref_type=heads#L1112 # 1
+Fixes: f9aa76a85248 ("drm/kms: driver for virtual cirrus under qemu")
+Cc: Adam Jackson <ajax@redhat.com>
+Cc: Dave Airlie <airlied@redhat.com>
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Maxime Ripard <mripard@kernel.org>
+Cc: Thomas Zimmermann <tzimmermann@suse.de>
+Cc: <stable@vger.kernel.org> # v3.5+
+Link: https://lore.kernel.org/r/20250328091821.195061-2-tzimmermann@suse.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/tiny/cirrus-qemu.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/gpu/drm/tiny/cirrus-qemu.c
++++ b/drivers/gpu/drm/tiny/cirrus-qemu.c
+@@ -318,7 +318,6 @@ static void cirrus_pitch_set(struct cirr
+ /* Enable extended blanking and pitch bits, and enable full memory */
+ cr1b = 0x22;
+ cr1b |= (pitch >> 7) & 0x10;
+- cr1b |= (pitch >> 6) & 0x40;
+ wreg_crt(cirrus, 0x1b, cr1b);
+
+ cirrus_set_start_address(cirrus, 0);
--- /dev/null
+From 61ee19dedb8d753249e20308782bf4e9e2fb7344 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <mcanal@igalia.com>
+Date: Mon, 2 Jun 2025 10:22:16 -0300
+Subject: drm/etnaviv: Protect the scheduler's pending list with its lock
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Maíra Canal <mcanal@igalia.com>
+
+commit 61ee19dedb8d753249e20308782bf4e9e2fb7344 upstream.
+
+Commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when GPU is still
+active") ensured that active jobs are returned to the pending list when
+extending the timeout. However, it didn't use the pending list's lock to
+manipulate the list, which causes a race condition as the scheduler's
+workqueues are running.
+
+Hold the lock while manipulating the scheduler's pending list to prevent
+a race.
+
+Cc: stable@vger.kernel.org
+Fixes: 704d3d60fec4 ("drm/etnaviv: don't block scheduler when GPU is still active")
+Reported-by: Philipp Stanner <phasta@kernel.org>
+Closes: https://lore.kernel.org/dri-devel/964e59ba1539083ef29b06d3c78f5e2e9b138ab8.camel@mailbox.org/
+Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
+Reviewed-by: Philipp Stanner <phasta@kernel.org>
+Link: https://lore.kernel.org/r/20250602132240.93314-1-mcanal@igalia.com
+Signed-off-by: Maíra Canal <mcanal@igalia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/etnaviv/etnaviv_sched.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
++++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+@@ -35,6 +35,7 @@ static enum drm_gpu_sched_stat etnaviv_s
+ *sched_job)
+ {
+ struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
++ struct drm_gpu_scheduler *sched = sched_job->sched;
+ struct etnaviv_gpu *gpu = submit->gpu;
+ u32 dma_addr, primid = 0;
+ int change;
+@@ -89,7 +90,9 @@ static enum drm_gpu_sched_stat etnaviv_s
+ return DRM_GPU_SCHED_STAT_NOMINAL;
+
+ out_no_timeout:
+- list_add(&sched_job->list, &sched_job->sched->pending_list);
++ spin_lock(&sched->job_list_lock);
++ list_add(&sched_job->list, &sched->pending_list);
++ spin_unlock(&sched->job_list_lock);
+ return DRM_GPU_SCHED_STAT_NOMINAL;
+ }
+
--- /dev/null
+From b71717735be48d7743a34897e9e44a0b53e30c0e Mon Sep 17 00:00:00 2001
+From: Stephan Gerhold <stephan.gerhold@linaro.org>
+Date: Tue, 29 Apr 2025 10:33:56 +0200
+Subject: drm/msm/gpu: Fix crash when throttling GPU immediately during boot
+
+From: Stephan Gerhold <stephan.gerhold@linaro.org>
+
+commit b71717735be48d7743a34897e9e44a0b53e30c0e upstream.
+
+There is a small chance that the GPU is already hot during boot. In that
+case, the call to of_devfreq_cooling_register() will immediately try to
+apply devfreq cooling, as seen in the following crash:
+
+ Unable to handle kernel paging request at virtual address 0000000000014110
+ pc : a6xx_gpu_busy+0x1c/0x58 [msm]
+ lr : msm_devfreq_get_dev_status+0xbc/0x140 [msm]
+ Call trace:
+ a6xx_gpu_busy+0x1c/0x58 [msm] (P)
+ devfreq_simple_ondemand_func+0x3c/0x150
+ devfreq_update_target+0x44/0xd8
+ qos_max_notifier_call+0x30/0x84
+ blocking_notifier_call_chain+0x6c/0xa0
+ pm_qos_update_target+0xd0/0x110
+ freq_qos_apply+0x3c/0x74
+ apply_constraint+0x88/0x148
+ __dev_pm_qos_update_request+0x7c/0xcc
+ dev_pm_qos_update_request+0x38/0x5c
+ devfreq_cooling_set_cur_state+0x98/0xf0
+ __thermal_cdev_update+0x64/0xb4
+ thermal_cdev_update+0x4c/0x58
+ step_wise_manage+0x1f0/0x318
+ __thermal_zone_device_update+0x278/0x424
+ __thermal_cooling_device_register+0x2bc/0x308
+ thermal_of_cooling_device_register+0x10/0x1c
+ of_devfreq_cooling_register_power+0x240/0x2bc
+ of_devfreq_cooling_register+0x14/0x20
+ msm_devfreq_init+0xc4/0x1a0 [msm]
+ msm_gpu_init+0x304/0x574 [msm]
+ adreno_gpu_init+0x1c4/0x2e0 [msm]
+ a6xx_gpu_init+0x5c8/0x9c8 [msm]
+ adreno_bind+0x2a8/0x33c [msm]
+ ...
+
+At this point we haven't initialized the GMU at all yet, so we cannot read
+the GMU registers inside a6xx_gpu_busy(). A similar issue was fixed before
+in commit 6694482a70e9 ("drm/msm: Avoid unclocked GMU register access in
+6xx gpu_busy"): msm_devfreq_init() does call devfreq_suspend_device(), but
+unlike msm_devfreq_suspend(), it doesn't set the df->suspended flag
+accordingly. This means the df->suspended flag does not match the actual
+devfreq state after initialization and msm_devfreq_get_dev_status() will
+end up accessing GMU registers, causing the crash.
+
+Fix this by setting df->suspended correctly during initialization.
+
+Cc: stable@vger.kernel.org
+Fixes: 6694482a70e9 ("drm/msm: Avoid unclocked GMU register access in 6xx gpu_busy")
+Signed-off-by: Stephan Gerhold <stephan.gerhold@linaro.org>
+Reviewed-by: Douglas Anderson <dianders@chromium.org>
+Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
+Patchwork: https://patchwork.freedesktop.org/patch/650772/
+Signed-off-by: Rob Clark <robdclark@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/msm/msm_gpu_devfreq.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c
++++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c
+@@ -156,6 +156,7 @@ void msm_devfreq_init(struct msm_gpu *gp
+ priv->gpu_devfreq_config.downdifferential = 10;
+
+ mutex_init(&df->lock);
++ df->suspended = true;
+
+ ret = dev_pm_qos_add_request(&gpu->pdev->dev, &df->boost_freq,
+ DEV_PM_QOS_MIN_FREQUENCY, 0);
--- /dev/null
+From 716c75afd83c837f14042309126e838de040658b Mon Sep 17 00:00:00 2001
+From: Luca Ceresoli <luca.ceresoli@bootlin.com>
+Date: Fri, 11 Apr 2025 21:19:45 +0200
+Subject: drm/panel: simple: Tianma TM070JDHG34-00: add delays
+
+From: Luca Ceresoli <luca.ceresoli@bootlin.com>
+
+commit 716c75afd83c837f14042309126e838de040658b upstream.
+
+Add power on/off delays for the Tianma TM070JDHG34-00.
+
+Fixes: bf6daaa281f7 ("drm/panel: simple: Add Tianma TM070JDHG34-00 panel support")
+Cc: stable@vger.kernel.org
+Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
+Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
+Link: https://lore.kernel.org/r/20250411-tianma-p0700wxf1mbaa-v3-2-acbefe9ea669@bootlin.com
+Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
+Link: https://lore.kernel.org/r/20250411-tianma-p0700wxf1mbaa-v3-2-acbefe9ea669@bootlin.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/panel/panel-simple.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/gpu/drm/panel/panel-simple.c
++++ b/drivers/gpu/drm/panel/panel-simple.c
+@@ -4455,6 +4455,12 @@ static const struct panel_desc tianma_tm
+ .width = 150, /* 149.76 */
+ .height = 94, /* 93.60 */
+ },
++ .delay = {
++ .prepare = 15, /* Tp1 */
++ .enable = 150, /* Tp2 */
++ .disable = 150, /* Tp4 */
++ .unprepare = 120, /* Tp3 */
++ },
+ .bus_format = MEDIA_BUS_FMT_RGB888_1X7X4_SPWG,
+ .connector_type = DRM_MODE_CONNECTOR_LVDS,
+ };
--- /dev/null
+From d231cde7c84359fb18fb268cf6cff03b5bce48ff Mon Sep 17 00:00:00 2001
+From: Thomas Zimmermann <tzimmermann@suse.de>
+Date: Mon, 7 Apr 2025 15:47:24 +0200
+Subject: drm/simpledrm: Do not upcast in release helpers
+
+From: Thomas Zimmermann <tzimmermann@suse.de>
+
+commit d231cde7c84359fb18fb268cf6cff03b5bce48ff upstream.
+
+The res pointer passed to simpledrm_device_release_clocks() and
+simpledrm_device_release_regulators() points to an instance of
+struct simpledrm_device. No need to upcast from struct drm_device.
+The upcast is harmless, as DRM device is the first field in struct
+simpledrm_device.
+
+Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
+Fixes: 11e8f5fd223b ("drm: Add simpledrm driver")
+Cc: <stable@vger.kernel.org> # v5.14+
+Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
+Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
+Link: https://lore.kernel.org/r/20250407134753.985925-2-tzimmermann@suse.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/tiny/simpledrm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/tiny/simpledrm.c
++++ b/drivers/gpu/drm/tiny/simpledrm.c
+@@ -284,7 +284,7 @@ static struct simpledrm_device *simpledr
+
+ static void simpledrm_device_release_clocks(void *res)
+ {
+- struct simpledrm_device *sdev = simpledrm_device_of_dev(res);
++ struct simpledrm_device *sdev = res;
+ unsigned int i;
+
+ for (i = 0; i < sdev->clk_count; ++i) {
+@@ -382,7 +382,7 @@ static int simpledrm_device_init_clocks(
+
+ static void simpledrm_device_release_regulators(void *res)
+ {
+- struct simpledrm_device *sdev = simpledrm_device_of_dev(res);
++ struct simpledrm_device *sdev = res;
+ unsigned int i;
+
+ for (i = 0; i < sdev->regulator_count; ++i) {
--- /dev/null
+From 9ff4fdf4f44b69237c0afc1d3a8dac916ce66f3e Mon Sep 17 00:00:00 2001
+From: Thierry Reding <treding@nvidia.com>
+Date: Mon, 21 Apr 2025 11:13:05 -0500
+Subject: drm/tegra: Assign plane type before registration
+
+From: Thierry Reding <treding@nvidia.com>
+
+commit 9ff4fdf4f44b69237c0afc1d3a8dac916ce66f3e upstream.
+
+Changes to a plane's type after it has been registered aren't propagated
+to userspace automatically. This could possibly be achieved by updating
+the property, but since we can already determine which type this should
+be before the registration, passing in the right type from the start is
+a much better solution.
+
+Suggested-by: Aaron Kling <webgeek1234@gmail.com>
+Signed-off-by: Thierry Reding <treding@nvidia.com>
+Cc: stable@vger.kernel.org
+Fixes: 473079549f27 ("drm/tegra: dc: Add Tegra186 support")
+Signed-off-by: Aaron Kling <webgeek1234@gmail.com>
+Signed-off-by: Thierry Reding <treding@nvidia.com>
+Link: https://lore.kernel.org/r/20250421-tegra-drm-primary-v2-1-7f740c4c2121@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/tegra/dc.c | 12 ++++++++----
+ drivers/gpu/drm/tegra/hub.c | 4 ++--
+ drivers/gpu/drm/tegra/hub.h | 3 ++-
+ 3 files changed, 12 insertions(+), 7 deletions(-)
+
+--- a/drivers/gpu/drm/tegra/dc.c
++++ b/drivers/gpu/drm/tegra/dc.c
+@@ -1321,10 +1321,16 @@ static struct drm_plane *tegra_dc_add_sh
+ if (wgrp->dc == dc->pipe) {
+ for (j = 0; j < wgrp->num_windows; j++) {
+ unsigned int index = wgrp->windows[j];
++ enum drm_plane_type type;
++
++ if (primary)
++ type = DRM_PLANE_TYPE_OVERLAY;
++ else
++ type = DRM_PLANE_TYPE_PRIMARY;
+
+ plane = tegra_shared_plane_create(drm, dc,
+ wgrp->index,
+- index);
++ index, type);
+ if (IS_ERR(plane))
+ return plane;
+
+@@ -1332,10 +1338,8 @@ static struct drm_plane *tegra_dc_add_sh
+ * Choose the first shared plane owned by this
+ * head as the primary plane.
+ */
+- if (!primary) {
+- plane->type = DRM_PLANE_TYPE_PRIMARY;
++ if (!primary)
+ primary = plane;
+- }
+ }
+ }
+ }
+--- a/drivers/gpu/drm/tegra/hub.c
++++ b/drivers/gpu/drm/tegra/hub.c
+@@ -755,9 +755,9 @@ static const struct drm_plane_helper_fun
+ struct drm_plane *tegra_shared_plane_create(struct drm_device *drm,
+ struct tegra_dc *dc,
+ unsigned int wgrp,
+- unsigned int index)
++ unsigned int index,
++ enum drm_plane_type type)
+ {
+- enum drm_plane_type type = DRM_PLANE_TYPE_OVERLAY;
+ struct tegra_drm *tegra = drm->dev_private;
+ struct tegra_display_hub *hub = tegra->hub;
+ struct tegra_shared_plane *plane;
+--- a/drivers/gpu/drm/tegra/hub.h
++++ b/drivers/gpu/drm/tegra/hub.h
+@@ -80,7 +80,8 @@ void tegra_display_hub_cleanup(struct te
+ struct drm_plane *tegra_shared_plane_create(struct drm_device *drm,
+ struct tegra_dc *dc,
+ unsigned int wgrp,
+- unsigned int index);
++ unsigned int index,
++ enum drm_plane_type type);
+
+ int tegra_display_hub_atomic_check(struct drm_device *drm,
+ struct drm_atomic_state *state);
--- /dev/null
+From 780351a5f61416ed2ba1199cc57e4a076fca644d Mon Sep 17 00:00:00 2001
+From: Qiu-ji Chen <chenqiuji666@gmail.com>
+Date: Wed, 6 Nov 2024 17:59:06 +0800
+Subject: drm/tegra: Fix a possible null pointer dereference
+
+From: Qiu-ji Chen <chenqiuji666@gmail.com>
+
+commit 780351a5f61416ed2ba1199cc57e4a076fca644d upstream.
+
+In tegra_crtc_reset(), new memory is allocated with kzalloc(), but the
+result is not checked. Before calling __drm_atomic_helper_crtc_reset(),
+state should be checked to prevent a possible null pointer dereference.
+
+Fixes: b7e0b04ae450 ("drm/tegra: Convert to using __drm_atomic_helper_crtc_reset() for reset.")
+Cc: stable@vger.kernel.org
+Signed-off-by: Qiu-ji Chen <chenqiuji666@gmail.com>
+Signed-off-by: Thierry Reding <treding@nvidia.com>
+Link: https://lore.kernel.org/r/20241106095906.15247-1-chenqiuji666@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/tegra/dc.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/tegra/dc.c
++++ b/drivers/gpu/drm/tegra/dc.c
+@@ -1393,7 +1393,10 @@ static void tegra_crtc_reset(struct drm_
+ if (crtc->state)
+ tegra_crtc_atomic_destroy_state(crtc, crtc->state);
+
+- __drm_atomic_helper_crtc_reset(crtc, &state->base);
++ if (state)
++ __drm_atomic_helper_crtc_reset(crtc, &state->base);
++ else
++ __drm_atomic_helper_crtc_reset(crtc, NULL);
+ }
+
+ static struct drm_crtc_state *
--- /dev/null
+From ff9cb6d2035c586ea7c8f1754d4409eec7a2d26d Mon Sep 17 00:00:00 2001
+From: Thomas Zimmermann <tzimmermann@suse.de>
+Date: Mon, 3 Mar 2025 15:52:56 +0100
+Subject: drm/udl: Unregister device before cleaning up on disconnect
+
+From: Thomas Zimmermann <tzimmermann@suse.de>
+
+commit ff9cb6d2035c586ea7c8f1754d4409eec7a2d26d upstream.
+
+Disconnecting a DisplayLink device results in the following kernel
+error messages
+
+[ 93.041748] [drm:udl_urb_completion [udl]] *ERROR* udl_urb_completion - nonzero write bulk status received: -115
+[ 93.055299] [drm:udl_submit_urb [udl]] *ERROR* usb_submit_urb error fffffffe
+[ 93.065363] [drm:udl_urb_completion [udl]] *ERROR* udl_urb_completion - nonzero write bulk status received: -115
+[ 93.078207] [drm:udl_submit_urb [udl]] *ERROR* usb_submit_urb error fffffffe
+
+coming from KMS poll helpers. Shutting down poll helpers runs them
+one final time when the USB device is already gone.
+
+Run drm_dev_unplug() first in udl's USB disconnect handler. Udl's
+polling code already handles disconnects gracefully if the device has
+been marked as unplugged.
+
+Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
+Fixes: b1a981bd5576 ("drm/udl: drop drm_driver.release hook")
+Cc: dri-devel@lists.freedesktop.org
+Cc: <stable@vger.kernel.org> # v5.8+
+Reviewed-by: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20250303145604.62962-2-tzimmermann@suse.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/udl/udl_drv.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/udl/udl_drv.c
++++ b/drivers/gpu/drm/udl/udl_drv.c
+@@ -127,9 +127,9 @@ static void udl_usb_disconnect(struct us
+ {
+ struct drm_device *dev = usb_get_intfdata(interface);
+
++ drm_dev_unplug(dev);
+ drm_kms_helper_poll_fini(dev);
+ udl_drop_usb(dev);
+- drm_dev_unplug(dev);
+ }
+
+ /*
--- /dev/null
+From 09812134071b3941fb81def30b61ed36d3a5dfb5 Mon Sep 17 00:00:00 2001
+From: Yao Zi <ziyao@disroot.org>
+Date: Mon, 23 Jun 2025 09:34:45 +0000
+Subject: dt-bindings: serial: 8250: Make clocks and clock-frequency exclusive
+
+From: Yao Zi <ziyao@disroot.org>
+
+commit 09812134071b3941fb81def30b61ed36d3a5dfb5 upstream.
+
+The 8250 binding before converting to json-schema states,
+
+ - clock-frequency : the input clock frequency for the UART
+ or
+ - clocks phandle to refer to the clk used as per Documentation/devicetree
+
+for clock-related properties, where "or" indicates these properties
+shouldn't exist at the same time.
+
+Additionally, the behavior of Linux's driver is strange when both clocks
+and clock-frequency are specified: it ignores clocks and obtains the
+frequency from clock-frequency, leaving the specified clock unclaimed. It
+may even be disabled, which is undesirable most of the time.
+
+But "anyOf" doesn't prevent these two properties from coexisting, as it
+considers the object valid as long as there's at LEAST one match.
+
+Let's switch to "oneOf", which disallows the other property if one exists,
+precisely matching the original binding and avoiding future confusion about
+the driver's behavior.
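+
+For example (a hypothetical node, purely illustrative), with "anyOf" the
+following node validates even though the driver will ignore "clocks":
+
+  serial@10000000 {
+          compatible = "ns16550a";
+          reg = <0x10000000 0x100>;
+          clocks = <&uart_clk>;
+          clock-frequency = <1843200>;
+  };
+
+With "oneOf", both required-property subschemas match for such a node, so
+validation fails and the two properties become mutually exclusive.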
+
+Fixes: e69f5dc623f9 ("dt-bindings: serial: Convert 8250 to json-schema")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Yao Zi <ziyao@disroot.org>
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Link: https://lore.kernel.org/r/20250623093445.62327-1-ziyao@disroot.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/devicetree/bindings/serial/8250.yaml | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Documentation/devicetree/bindings/serial/8250.yaml
++++ b/Documentation/devicetree/bindings/serial/8250.yaml
+@@ -45,7 +45,7 @@ allOf:
+ - ns16550
+ - ns16550a
+ then:
+- anyOf:
++ oneOf:
+ - required: [ clock-frequency ]
+ - required: [ clocks ]
+
--- /dev/null
+From a3f3040657417aeadb9622c629d4a0c2693a0f93 Mon Sep 17 00:00:00 2001
+From: Avadhut Naik <avadhut.naik@amd.com>
+Date: Thu, 29 May 2025 20:50:04 +0000
+Subject: EDAC/amd64: Fix size calculation for Non-Power-of-Two DIMMs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Avadhut Naik <avadhut.naik@amd.com>
+
+commit a3f3040657417aeadb9622c629d4a0c2693a0f93 upstream.
+
+Each Chip-Select (CS) of a Unified Memory Controller (UMC) on AMD Zen-based
+SOCs has an Address Mask and a Secondary Address Mask register associated with
+it. The amd64_edac module logs DIMM sizes on a per-UMC per-CS granularity
+during init using these two registers.
+
+Currently, the module primarily considers only the Address Mask register for
+computing DIMM sizes. The Secondary Address Mask register is only considered
+for odd CS. Additionally, if it has been considered, the Address Mask register
+is ignored altogether for that CS. For power-of-two DIMMs i.e. DIMMs whose
+total capacity is a power of two (32GB, 64GB, etc), this is not an issue
+since only the Address Mask register is used.
+
+For non-power-of-two DIMMs i.e., DIMMs whose total capacity is not a power of
+two (48GB, 96GB, etc), however, the Secondary Address Mask register is used
+in conjunction with the Address Mask register. However, since the module only
+considers either of the two registers for a CS, the size computed by the
+module is incorrect. The Secondary Address Mask register is not considered for
+even CS, and the Address Mask register is not considered for odd CS.
+
+Introduce a new helper function so that both Address Mask and Secondary
+Address Mask registers are considered, when valid, for computing DIMM sizes.
+Furthermore, also rename some variables for greater clarity.
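+
+For illustration (hypothetical deinterleaved mask values, not taken from the
+report): a 48GB chip select can be described by a 32GB primary mask plus a
+16GB secondary mask. With sizes computed as (mask >> 2) + 1 in kBs (register
+bits [31:1] map to address bits [39:9]):
+
+  Primary AddrMask:   0x7fffffe -> (0x7fffffe >> 2) + 1 = 0x2000000 kB = 32GB
+  Secondary AddrMask: 0x3fffffe -> (0x3fffffe >> 2) + 1 = 0x1000000 kB = 16GB
+
+Summing both, as the new helper does for each valid mask, reports the correct
+48GB instead of only one of the two contributions.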
+
+Fixes: 81f5090db843 ("EDAC/amd64: Support asymmetric dual-rank DIMMs")
+Closes: https://lore.kernel.org/dbec22b6-00f2-498b-b70d-ab6f8a5ec87e@natrix.lt
+Reported-by: Žilvinas Žaltiena <zilvinas@natrix.lt>
+Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
+Tested-by: Žilvinas Žaltiena <zilvinas@natrix.lt>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/20250529205013.403450-1-avadhut.naik@amd.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/edac/amd64_edac.c | 57 +++++++++++++++++++++++++++++-----------------
+ 1 file changed, 36 insertions(+), 21 deletions(-)
+
+--- a/drivers/edac/amd64_edac.c
++++ b/drivers/edac/amd64_edac.c
+@@ -1209,7 +1209,9 @@ static int umc_get_cs_mode(int dimm, u8
+ if (csrow_enabled(2 * dimm + 1, ctrl, pvt))
+ cs_mode |= CS_ODD_PRIMARY;
+
+- /* Asymmetric dual-rank DIMM support. */
++ if (csrow_sec_enabled(2 * dimm, ctrl, pvt))
++ cs_mode |= CS_EVEN_SECONDARY;
++
+ if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt))
+ cs_mode |= CS_ODD_SECONDARY;
+
+@@ -1230,12 +1232,13 @@ static int umc_get_cs_mode(int dimm, u8
+ return cs_mode;
+ }
+
+-static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode,
+- int csrow_nr, int dimm)
++static int calculate_cs_size(u32 mask, unsigned int cs_mode)
+ {
+- u32 msb, weight, num_zero_bits;
+- u32 addr_mask_deinterleaved;
+- int size = 0;
++ int msb, weight, num_zero_bits;
++ u32 deinterleaved_mask;
++
++ if (!mask)
++ return 0;
+
+ /*
+ * The number of zero bits in the mask is equal to the number of bits
+@@ -1248,19 +1251,30 @@ static int __addr_mask_to_cs_size(u32 ad
+ * without swapping with the most significant bit. This can be handled
+ * by keeping the MSB where it is and ignoring the single zero bit.
+ */
+- msb = fls(addr_mask_orig) - 1;
+- weight = hweight_long(addr_mask_orig);
++ msb = fls(mask) - 1;
++ weight = hweight_long(mask);
+ num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
+
+ /* Take the number of zero bits off from the top of the mask. */
+- addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1);
++ deinterleaved_mask = GENMASK(msb - num_zero_bits, 1);
++ edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", deinterleaved_mask);
++
++ return (deinterleaved_mask >> 2) + 1;
++}
++
++static int __addr_mask_to_cs_size(u32 addr_mask, u32 addr_mask_sec,
++ unsigned int cs_mode, int csrow_nr, int dimm)
++{
++ int size;
+
+ edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm);
+- edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig);
+- edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved);
++ edac_dbg(1, " Primary AddrMask: 0x%x\n", addr_mask);
+
+ /* Register [31:1] = Address [39:9]. Size is in kBs here. */
+- size = (addr_mask_deinterleaved >> 2) + 1;
++ size = calculate_cs_size(addr_mask, cs_mode);
++
++ edac_dbg(1, " Secondary AddrMask: 0x%x\n", addr_mask_sec);
++ size += calculate_cs_size(addr_mask_sec, cs_mode);
+
+ /* Return size in MBs. */
+ return size >> 10;
+@@ -1269,8 +1283,8 @@ static int __addr_mask_to_cs_size(u32 ad
+ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc,
+ unsigned int cs_mode, int csrow_nr)
+ {
++ u32 addr_mask = 0, addr_mask_sec = 0;
+ int cs_mask_nr = csrow_nr;
+- u32 addr_mask_orig;
+ int dimm, size = 0;
+
+ /* No Chip Selects are enabled. */
+@@ -1308,13 +1322,13 @@ static int umc_addr_mask_to_cs_size(stru
+ if (!pvt->flags.zn_regs_v2)
+ cs_mask_nr >>= 1;
+
+- /* Asymmetric dual-rank DIMM support. */
+- if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY))
+- addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr];
+- else
+- addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr];
++ if (cs_mode & (CS_EVEN_PRIMARY | CS_ODD_PRIMARY))
++ addr_mask = pvt->csels[umc].csmasks[cs_mask_nr];
++
++ if (cs_mode & (CS_EVEN_SECONDARY | CS_ODD_SECONDARY))
++ addr_mask_sec = pvt->csels[umc].csmasks_sec[cs_mask_nr];
+
+- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, dimm);
++ return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, dimm);
+ }
+
+ static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
+@@ -3512,9 +3526,10 @@ static void gpu_get_err_info(struct mce
+ static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc,
+ unsigned int cs_mode, int csrow_nr)
+ {
+- u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr];
++ u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr];
++ u32 addr_mask_sec = pvt->csels[umc].csmasks_sec[csrow_nr];
+
+- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1);
++ return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, csrow_nr >> 1);
+ }
+
+ static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
--- /dev/null
+From ba8dac350faf16afc129ce6303ca4feaf083ccb1 Mon Sep 17 00:00:00 2001
+From: Chao Yu <chao@kernel.org>
+Date: Thu, 5 Jun 2025 11:26:33 +0800
+Subject: f2fs: fix to zero post-eof page
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Chao Yu <chao@kernel.org>
+
+commit ba8dac350faf16afc129ce6303ca4feaf083ccb1 upstream.
+
+fstest reports a f2fs bug:
+
+#generic/363 42s ... [failed, exit status 1]- output mismatch (see /share/git/fstests/results//generic/363.out.bad)
+# --- tests/generic/363.out 2025-01-12 21:57:40.271440542 +0800
+# +++ /share/git/fstests/results//generic/363.out.bad 2025-05-19 19:55:58.000000000 +0800
+# @@ -1,2 +1,78 @@
+# QA output created by 363
+# fsx -q -S 0 -e 1 -N 100000
+# +READ BAD DATA: offset = 0xd6fb, size = 0xf044, fname = /mnt/f2fs/junk
+# +OFFSET GOOD BAD RANGE
+# +0x1540d 0x0000 0x2a25 0x0
+# +operation# (mod 256) for the bad data may be 37
+# +0x1540e 0x0000 0x2527 0x1
+# ...
+# (Run 'diff -u /share/git/fstests/tests/generic/363.out /share/git/fstests/results//generic/363.out.bad' to see the entire diff)
+Ran: generic/363
+Failures: generic/363
+Failed 1 of 1 tests
+
+The root cause is that a user can update the post-EOF page via mmap [1];
+however, f2fs misses zeroing the post-EOF page in the operations below, so
+once i_size is expanded, the file will include dummy data located in the
+previous post-EOF page. Hence, during the operations below, we need to zero
+the post-EOF page (see the example sequence after the list).
+
+Operations which can include dummy data beyond the previous i_size after
+expanding i_size:
+- write
+- mapwrite [1]
+- truncate
+- fallocate
+ * preallocate
+ * zero_range
+ * insert_range
+ * collapse_range
+- clone_range (not supported in f2fs)
+- copy_range (not supported in f2fs)
+
+[1] https://man7.org/linux/man-pages/man2/mmap.2.html 'BUG section'
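+
+For example (an illustrative sequence, not taken from the fsx log above):
+
+  1) i_size is 5000 bytes, so EOF falls inside page index 1
+  2) the file is mmap'ed and a store is done at offset 6000; the modified
+     bytes sit in the page cache beyond EOF
+  3) a truncate()/fallocate() expands i_size to 8192
+  4) a read at offset 6000 now returns the stale bytes instead of zeroes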
+
+Cc: stable@kernel.org
+Signed-off-by: Chao Yu <chao@kernel.org>
+Reviewed-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/file.c | 38 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 38 insertions(+)
+
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -35,6 +35,17 @@
+ #include <trace/events/f2fs.h>
+ #include <uapi/linux/f2fs.h>
+
++static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size)
++{
++ loff_t old_size = i_size_read(inode);
++
++ if (old_size >= new_size)
++ return;
++
++ /* zero or drop pages only in range of [old_size, new_size] */
++ truncate_pagecache(inode, old_size);
++}
++
+ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
+ {
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+@@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(s
+
+ f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
+
++ filemap_invalidate_lock(inode->i_mapping);
++ f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT);
++ filemap_invalidate_unlock(inode->i_mapping);
++
+ file_update_time(vmf->vma->vm_file);
+ filemap_invalidate_lock_shared(inode->i_mapping);
++
+ folio_lock(folio);
+ if (unlikely(folio->mapping != inode->i_mapping ||
+ folio_pos(folio) > i_size_read(inode) ||
+@@ -1106,6 +1122,8 @@ int f2fs_setattr(struct mnt_idmap *idmap
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
+ filemap_invalidate_lock(inode->i_mapping);
+
++ if (attr->ia_size > old_size)
++ f2fs_zero_post_eof_page(inode, attr->ia_size);
+ truncate_setsize(inode, attr->ia_size);
+
+ if (attr->ia_size <= old_size)
+@@ -1224,6 +1242,10 @@ static int f2fs_punch_hole(struct inode
+ if (ret)
+ return ret;
+
++ filemap_invalidate_lock(inode->i_mapping);
++ f2fs_zero_post_eof_page(inode, offset + len);
++ filemap_invalidate_unlock(inode->i_mapping);
++
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
+
+@@ -1507,6 +1529,8 @@ static int f2fs_do_collapse(struct inode
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ filemap_invalidate_lock(inode->i_mapping);
+
++ f2fs_zero_post_eof_page(inode, offset + len);
++
+ f2fs_lock_op(sbi);
+ f2fs_drop_extent_tree(inode);
+ truncate_pagecache(inode, offset);
+@@ -1628,6 +1652,10 @@ static int f2fs_zero_range(struct inode
+ if (ret)
+ return ret;
+
++ filemap_invalidate_lock(mapping);
++ f2fs_zero_post_eof_page(inode, offset + len);
++ filemap_invalidate_unlock(mapping);
++
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
+
+@@ -1759,6 +1787,8 @@ static int f2fs_insert_range(struct inod
+ /* avoid gc operation during block exchange */
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ filemap_invalidate_lock(mapping);
++
++ f2fs_zero_post_eof_page(inode, offset + len);
+ truncate_pagecache(inode, offset);
+
+ while (!ret && idx > pg_start) {
+@@ -1816,6 +1846,10 @@ static int f2fs_expand_inode_data(struct
+ if (err)
+ return err;
+
++ filemap_invalidate_lock(inode->i_mapping);
++ f2fs_zero_post_eof_page(inode, offset + len);
++ filemap_invalidate_unlock(inode->i_mapping);
++
+ f2fs_balance_fs(sbi, true);
+
+ pg_start = ((unsigned long long)offset) >> PAGE_SHIFT;
+@@ -4846,6 +4880,10 @@ static ssize_t f2fs_write_checks(struct
+ err = file_modified(file);
+ if (err)
+ return err;
++
++ filemap_invalidate_lock(inode->i_mapping);
++ f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from));
++ filemap_invalidate_unlock(inode->i_mapping);
+ return count;
+ }
+
--- /dev/null
+From 4540e41e753a7d69ecd3f5bad51fe620205c3a18 Mon Sep 17 00:00:00 2001
+From: Qasim Ijaz <qasdev00@gmail.com>
+Date: Sun, 15 Jun 2025 23:59:41 +0100
+Subject: HID: appletb-kbd: fix "appletb_backlight" backlight device reference counting
+
+From: Qasim Ijaz <qasdev00@gmail.com>
+
+commit 4540e41e753a7d69ecd3f5bad51fe620205c3a18 upstream.
+
+During appletb_kbd_probe, the probe attempts to get the backlight device
+by name. When this happens, backlight_device_get_by_name looks for a
+device in the backlight class named "appletb_backlight" and, upon finding
+a match, increments the reference count for the device and returns it to
+the caller. However, this reference is never released, leading to a
+reference leak.
+
+Fix this by decrementing the backlight device reference count on removal
+via put_device and on probe failure.
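+
+For illustration only (simplified, not the driver code itself), the expected
+pairing is that every successful backlight_device_get_by_name() is balanced
+by a put_device() on the embedded struct device:
+
+	#include <linux/backlight.h>
+	#include <linux/device.h>
+
+	static void example_use_backlight(void)
+	{
+		struct backlight_device *bl;
+
+		bl = backlight_device_get_by_name("appletb_backlight");
+		if (!bl)
+			return;
+		/* ... use bl ... */
+		put_device(&bl->dev);	/* drop the reference taken above */
+	}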
+
+Fixes: 93a0fc489481 ("HID: hid-appletb-kbd: add support for automatic brightness control while using the touchbar")
+Cc: stable@vger.kernel.org
+Signed-off-by: Qasim Ijaz <qasdev00@gmail.com>
+Reviewed-by: Aditya Garg <gargaditya08@live.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/hid-appletb-kbd.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/hid/hid-appletb-kbd.c
++++ b/drivers/hid/hid-appletb-kbd.c
+@@ -435,6 +435,8 @@ static int appletb_kbd_probe(struct hid_
+ return 0;
+
+ close_hw:
++ if (kbd->backlight_dev)
++ put_device(&kbd->backlight_dev->dev);
+ hid_hw_close(hdev);
+ stop_hw:
+ hid_hw_stop(hdev);
+@@ -450,6 +452,9 @@ static void appletb_kbd_remove(struct hi
+ input_unregister_handler(&kbd->inp_handler);
+ timer_delete_sync(&kbd->inactivity_timer);
+
++ if (kbd->backlight_dev)
++ put_device(&kbd->backlight_dev->dev);
++
+ hid_hw_close(hdev);
+ hid_hw_stop(hdev);
+ }
--- /dev/null
+From 9327e3ee5b077c4ab4495a09b67624f670ed88b6 Mon Sep 17 00:00:00 2001
+From: Iusico Maxim <iusico.maxim@libero.it>
+Date: Thu, 5 Jun 2025 19:55:50 +0200
+Subject: HID: lenovo: Restrict F7/9/11 mode to compact keyboards only
+
+From: Iusico Maxim <iusico.maxim@libero.it>
+
+commit 9327e3ee5b077c4ab4495a09b67624f670ed88b6 upstream.
+
+Commit 2f2bd7cbd1d1 ("hid: lenovo: Resend all settings on reset_resume
+for compact keyboards") introduced a regression for ThinkPad TrackPoint
+Keyboard II by removing the conditional check for enabling F7/9/11 mode
+needed for compact keyboards only. As a result, the non-compact
+keyboards can no longer toggle Fn-lock via Fn+Esc, although it can be
+controlled via sysfs knob that directly sends raw commands.
+
+This patch restores the previous conditional check without any
+additions.
+
+Cc: stable@vger.kernel.org
+Fixes: 2f2bd7cbd1d1 ("hid: lenovo: Resend all settings on reset_resume for compact keyboards")
+Signed-off-by: Iusico Maxim <iusico.maxim@libero.it>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/hid-lenovo.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/drivers/hid/hid-lenovo.c
++++ b/drivers/hid/hid-lenovo.c
+@@ -548,11 +548,14 @@ static void lenovo_features_set_cptkbd(s
+
+ /*
+ * Tell the keyboard a driver understands it, and turn F7, F9, F11 into
+- * regular keys
++ * regular keys (Compact only)
+ */
+- ret = lenovo_send_cmd_cptkbd(hdev, 0x01, 0x03);
+- if (ret)
+- hid_warn(hdev, "Failed to switch F7/9/11 mode: %d\n", ret);
++ if (hdev->product == USB_DEVICE_ID_LENOVO_CUSBKBD ||
++ hdev->product == USB_DEVICE_ID_LENOVO_CBTKBD) {
++ ret = lenovo_send_cmd_cptkbd(hdev, 0x01, 0x03);
++ if (ret)
++ hid_warn(hdev, "Failed to switch F7/9/11 mode: %d\n", ret);
++ }
+
+ /* Switch middle button to native mode */
+ ret = lenovo_send_cmd_cptkbd(hdev, 0x09, 0x01);
--- /dev/null
+From 85a720f4337f0ddf1603c8b75a8f1ffbbe022ef9 Mon Sep 17 00:00:00 2001
+From: Qasim Ijaz <qasdev00@gmail.com>
+Date: Fri, 6 Jun 2025 19:49:59 +0100
+Subject: HID: wacom: fix kobject reference count leak
+
+From: Qasim Ijaz <qasdev00@gmail.com>
+
+commit 85a720f4337f0ddf1603c8b75a8f1ffbbe022ef9 upstream.
+
+When sysfs_create_files() fails in wacom_initialize_remotes() the error
+is returned and the cleanup action will not have been registered yet.
+
+As a result the kobject's refcount is never dropped, so the
+kobject can never be freed leading to a reference leak.
+
+Fix this by calling kobject_put() before returning.
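+
+For illustration only (names are made up, not the wacom code):
+kobject_create_and_add() returns a kobject with one reference held, so an
+error after it must drop that reference with kobject_put():
+
+	#include <linux/errno.h>
+	#include <linux/kobject.h>
+	#include <linux/sysfs.h>
+
+	static int example_add_dir(struct kobject *parent,
+				   const struct attribute * const *attrs)
+	{
+		struct kobject *dir;
+		int error;
+
+		dir = kobject_create_and_add("example_dir", parent);
+		if (!dir)
+			return -ENOMEM;
+
+		error = sysfs_create_files(dir, attrs);
+		if (error) {
+			kobject_put(dir);	/* drop the reference again */
+			return error;
+		}
+		return 0;
+	}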
+
+Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated")
+Acked-by: Ping Cheng <ping.cheng@wacom.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Qasim Ijaz <qasdev00@gmail.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/wacom_sys.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/hid/wacom_sys.c
++++ b/drivers/hid/wacom_sys.c
+@@ -2059,6 +2059,7 @@ static int wacom_initialize_remotes(stru
+ hid_err(wacom->hdev,
+ "cannot create sysfs group err: %d\n", error);
+ kfifo_free(&remote->remote_fifo);
++ kobject_put(remote->remote_dir);
+ return error;
+ }
+
--- /dev/null
+From 5ae416c5b1e2e816aee7b3fc8347adf70afabb4c Mon Sep 17 00:00:00 2001
+From: Qasim Ijaz <qasdev00@gmail.com>
+Date: Fri, 6 Jun 2025 19:49:57 +0100
+Subject: HID: wacom: fix memory leak on kobject creation failure
+
+From: Qasim Ijaz <qasdev00@gmail.com>
+
+commit 5ae416c5b1e2e816aee7b3fc8347adf70afabb4c upstream.
+
+During wacom_initialize_remotes() a fifo buffer is allocated
+with kfifo_alloc() and a cleanup action is later registered
+via devm_add_action_or_reset() to clean it up.
+
+However, if the code fails to create a kobject and register it
+with sysfs, it simply returns -ENOMEM before the cleanup
+action is registered, leading to a memory leak.
+
+Fix this by ensuring the fifo is freed when the kobject creation
+and registration process fails.
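+
+For illustration only (names are made up, not the wacom code): once
+kfifo_alloc() has succeeded, every early-return error path must free the
+fifo itself until a cleanup action owns it:
+
+	#include <linux/errno.h>
+	#include <linux/kfifo.h>
+	#include <linux/kobject.h>
+
+	static int example_setup(struct kfifo *fifo, struct kobject *parent)
+	{
+		struct kobject *dir;
+		int err;
+
+		err = kfifo_alloc(fifo, PAGE_SIZE, GFP_KERNEL);
+		if (err)
+			return err;
+
+		dir = kobject_create_and_add("example_dir", parent);
+		if (!dir) {
+			kfifo_free(fifo);	/* nothing else frees it yet */
+			return -ENOMEM;
+		}
+		/* only now hand cleanup over to devm_add_action_or_reset() */
+		return 0;
+	}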
+
+Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated")
+Reviewed-by: Ping Cheng <ping.cheng@wacom.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Qasim Ijaz <qasdev00@gmail.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/wacom_sys.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/hid/wacom_sys.c
++++ b/drivers/hid/wacom_sys.c
+@@ -2048,8 +2048,10 @@ static int wacom_initialize_remotes(stru
+
+ remote->remote_dir = kobject_create_and_add("wacom_remote",
+ &wacom->hdev->dev.kobj);
+- if (!remote->remote_dir)
++ if (!remote->remote_dir) {
++ kfifo_free(&remote->remote_fifo);
+ return -ENOMEM;
++ }
+
+ error = sysfs_create_files(remote->remote_dir, remote_unpair_attrs);
+
--- /dev/null
+From 1a19ae437ca5d5c7d9ec2678946fb339b1c706bf Mon Sep 17 00:00:00 2001
+From: Qasim Ijaz <qasdev00@gmail.com>
+Date: Fri, 6 Jun 2025 19:49:58 +0100
+Subject: HID: wacom: fix memory leak on sysfs attribute creation failure
+
+From: Qasim Ijaz <qasdev00@gmail.com>
+
+commit 1a19ae437ca5d5c7d9ec2678946fb339b1c706bf upstream.
+
+When sysfs_create_files() fails during wacom_initialize_remotes(), the
+fifo buffer is not freed, leading to a memory leak.
+
+Fix this by calling kfifo_free() before returning.
+
+Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated")
+Reviewed-by: Ping Cheng <ping.cheng@wacom.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Qasim Ijaz <qasdev00@gmail.com>
+Signed-off-by: Jiri Kosina <jkosina@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hid/wacom_sys.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/hid/wacom_sys.c
++++ b/drivers/hid/wacom_sys.c
+@@ -2058,6 +2058,7 @@ static int wacom_initialize_remotes(stru
+ if (error) {
+ hid_err(wacom->hdev,
+ "cannot create sysfs group err: %d\n", error);
++ kfifo_free(&remote->remote_fifo);
+ return error;
+ }
+
--- /dev/null
+From fba46a5d83ca8decb338722fb4899026d8d9ead2 Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Mon, 16 Jun 2025 14:45:20 -0400
+Subject: maple_tree: fix MA_STATE_PREALLOC flag in mas_preallocate()
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit fba46a5d83ca8decb338722fb4899026d8d9ead2 upstream.
+
+Temporarily clear the preallocation flag when explicitly requesting
+allocations. Pre-existing allocations are already counted against the
+request through mas_node_count_gfp(), but the allocations will not happen
+if the MA_STATE_PREALLOC flag is set. This flag is meant to avoid
+re-allocating in bulk allocation mode, and to detect issues with
+preallocation calculations.
+
+The MA_STATE_PREALLOC flag should also always be set on zero allocations
+so that detection of underflow allocations will print a WARN_ON() during
+consumption.
+
+The user-visible effect of this flaw is a WARN_ON() followed by a null
+pointer dereference when subsequent requests for a larger number of nodes
+are ignored, such as the vma merge retry in mmap_region() caused by drivers
+altering the vma flags (which happens in v6.6, at least).
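+
+For context, an illustrative sketch of the mas_preallocate() /
+mas_store_prealloc() pattern whose node accounting this patch corrects
+(caller-side code, not from this patch; locking reduced to a comment):
+
+	#include <linux/maple_tree.h>
+
+	static int example_store(struct maple_tree *mt, unsigned long index,
+				 void *entry)
+	{
+		MA_STATE(mas, mt, index, index);
+
+		if (mas_preallocate(&mas, entry, GFP_KERNEL))
+			return -ENOMEM;		/* reserve nodes up front */
+		/* ... section that must not allocate, e.g. under a lock ... */
+		mas_store_prealloc(&mas, entry);	/* consume the reservation */
+		return 0;
+	}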
+
+Link: https://lkml.kernel.org/r/20250616184521.3382795-3-Liam.Howlett@oracle.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reported-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
+Reported-by: Hailong Liu <hailong.liu@oppo.com>
+Link: https://lore.kernel.org/all/1652f7eb-a51b-4fee-8058-c73af63bacd1@oppo.com/
+Link: https://lore.kernel.org/all/20250428184058.1416274-1-Liam.Howlett@oracle.com/
+Link: https://lore.kernel.org/all/20250429014754.1479118-1-Liam.Howlett@oracle.com/
+Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Hailong Liu <hailong.liu@oppo.com>
+Cc: zhangpeng.00@bytedance.com <zhangpeng.00@bytedance.com>
+Cc: Steve Kang <Steve.Kang@unisoc.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -5496,8 +5496,9 @@ int mas_preallocate(struct ma_state *mas
+ mas->store_type = mas_wr_store_type(&wr_mas);
+ request = mas_prealloc_calc(mas, entry);
+ if (!request)
+- return ret;
++ goto set_flag;
+
++ mas->mas_flags &= ~MA_STATE_PREALLOC;
+ mas_node_count_gfp(mas, request, gfp);
+ if (mas_is_err(mas)) {
+ mas_set_alloc_req(mas, 0);
+@@ -5507,6 +5508,7 @@ int mas_preallocate(struct ma_state *mas
+ return ret;
+ }
+
++set_flag:
+ mas->mas_flags |= MA_STATE_PREALLOC;
+ return ret;
+ }
--- /dev/null
+From 517f496e1e61bd169d585dab4dd77e7147506322 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Wed, 11 Jun 2025 15:13:14 +0200
+Subject: mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked"
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 517f496e1e61bd169d585dab4dd77e7147506322 upstream.
+
+After commit 1aaf8c122918 ("mm: gup: fix infinite loop within
+__get_longterm_locked") we are able to longterm pin folios that are not
+supposed to get longterm pinned, simply because they temporarily have the
+LRU flag cleared (esp. temporarily isolated).
+
+For example, two __get_longterm_locked() callers can race, or
+__get_longterm_locked() can race with anything else that temporarily
+isolates folios.
+
+The introducing commit mentions the use case of a driver that uses
+vm_ops->fault to insert pages allocated through cma_alloc() into the page
+tables, assuming they can later get longterm pinned. These pages/folios
+would never have the LRU flag set and consequently cannot get isolated.
+There is no known in-tree user making use of that so far, fortunately.
+
+To handle that in the future -- and avoid retrying forever to
+isolate/migrate them -- we will need a different mechanism for the CMA
+area *owner* to indicate that it actually already allocated the page and
+is fine with longterm pinning it. The LRU flag is not suitable for that.
+
+Probably we can look up the relevant CMA area and query the bitmap; we only
+have to care about some races, probably. If already allocated, we could
+just allow longterm pinning.
+
+Anyhow, let's fix the "must not be longterm pinned" problem first by
+reverting the original commit.
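+
+For context, an illustrative sketch of the kind of caller that reaches this
+path (not code from this patch): a FOLL_LONGTERM pin is what ends up in the
+collect/migrate logic touched below:
+
+	#include <linux/mm.h>
+
+	static long example_longterm_pin(unsigned long addr, struct page **pages)
+	{
+		/* a longterm pin must first migrate unpinnable folios away */
+		return pin_user_pages_fast(addr, 1,
+					   FOLL_WRITE | FOLL_LONGTERM, pages);
+	}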
+
+Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com
+Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/
+Reported-by: Hyesoo Yu <hyesoo.yu@samsung.com>
+Reviewed-by: John Hubbard <jhubbard@nvidia.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
+Cc: Aijun Sun <aijun.sun@unisoc.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -2320,13 +2320,13 @@ static void pofs_unpin(struct pages_or_f
+ /*
+ * Returns the number of collected folios. Return value is always >= 0.
+ */
+-static void collect_longterm_unpinnable_folios(
++static unsigned long collect_longterm_unpinnable_folios(
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
+ {
++ unsigned long i, collected = 0;
+ struct folio *prev_folio = NULL;
+ bool drain_allow = true;
+- unsigned long i;
+
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
+@@ -2338,6 +2338,8 @@ static void collect_longterm_unpinnable_
+ if (folio_is_longterm_pinnable(folio))
+ continue;
+
++ collected++;
++
+ if (folio_is_device_coherent(folio))
+ continue;
+
+@@ -2359,6 +2361,8 @@ static void collect_longterm_unpinnable_
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ }
++
++ return collected;
+ }
+
+ /*
+@@ -2435,9 +2439,11 @@ static long
+ check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+ {
+ LIST_HEAD(movable_folio_list);
++ unsigned long collected;
+
+- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+- if (list_empty(&movable_folio_list))
++ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
++ pofs);
++ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
--- /dev/null
+From a05dd8ae5cbb1cb45f349922cfea4f548a5e5d6f Mon Sep 17 00:00:00 2001
+From: Kairui Song <kasong@tencent.com>
+Date: Tue, 10 Jun 2025 01:17:51 +0800
+Subject: mm/shmem, swap: fix softlockup with mTHP swapin
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Kairui Song <kasong@tencent.com>
+
+commit a05dd8ae5cbb1cb45f349922cfea4f548a5e5d6f upstream.
+
+The following softlockup can easily be reproduced on my test machine with:
+
+echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
+swapon /dev/zram0 # zram0 is a 48G swap device
+mkdir -p /sys/fs/cgroup/memory/test
+echo 1G > /sys/fs/cgroup/test/memory.max
+echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs
+while true; do
+ dd if=/dev/zero of=/tmp/test.img bs=1M count=5120
+ cat /tmp/test.img > /dev/null
+ rm /tmp/test.img
+done
+
+Then after a while:
+watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787]
+Modules linked in: zram virtiofs
+CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)
+Tainted: [L]=SOFTLOCKUP
+Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
+RIP: 0010:mpol_shared_policy_lookup+0xd/0x70
+Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8
+RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202
+RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001
+RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518
+RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001
+R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000
+FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+PKRU: 55555554
+Call Trace:
+ <TASK>
+ shmem_alloc_folio+0x31/0xc0
+ shmem_swapin_folio+0x309/0xcf0
+ ? filemap_get_entry+0x117/0x1e0
+ ? xas_load+0xd/0xb0
+ ? filemap_get_entry+0x101/0x1e0
+ shmem_get_folio_gfp+0x2ed/0x5b0
+ shmem_file_read_iter+0x7f/0x2e0
+ vfs_read+0x252/0x330
+ ksys_read+0x68/0xf0
+ do_syscall_64+0x4c/0x1c0
+ entry_SYSCALL_64_after_hwframe+0x76/0x7e
+RIP: 0033:0x7f03f9a46991
+Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec
+RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
+RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991
+RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003
+RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380
+R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000
+R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000
+ </TASK>
+
+The reason is simple: readahead brought some order-0 folios into the swap
+cache, and the mTHP folio being allocated for swapin conflicts with them, so
+swapcache_prepare fails and causes shmem_swap_alloc_folio to return
+-EEXIST, and shmem simply retries again and again, causing this loop.
+
+Fix it by applying a fix similar to the one used for anon mTHP swapin.
+
+The performance change is very slight; time to swap in 10G of zero folios
+with shmem (tested 12 times):
+Before: 2.47s
+After: 2.48s
+
+[kasong@tencent.com: add comment]
+ Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com
+Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com
+Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com
+Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device")
+Signed-off-by: Kairui Song <kasong@tencent.com>
+Reviewed-by: Barry Song <baohua@kernel.org>
+Acked-by: Nhat Pham <nphamcs@gmail.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Baoquan He <bhe@redhat.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Kemeng Shi <shikemeng@huaweicloud.com>
+Cc: Usama Arif <usamaarif642@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 20 --------------------
+ mm/shmem.c | 6 +++++-
+ mm/swap.h | 23 +++++++++++++++++++++++
+ 3 files changed, 28 insertions(+), 21 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -4224,26 +4224,6 @@ static struct folio *__alloc_swap_folio(
+ }
+
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+-{
+- struct swap_info_struct *si = swp_swap_info(entry);
+- pgoff_t offset = swp_offset(entry);
+- int i;
+-
+- /*
+- * While allocating a large folio and doing swap_read_folio, which is
+- * the case the being faulted pte doesn't have swapcache. We need to
+- * ensure all PTEs have no cache as well, otherwise, we might go to
+- * swap devices while the content is in swapcache.
+- */
+- for (i = 0; i < max_nr; i++) {
+- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+- return i;
+- }
+-
+- return i;
+-}
+-
+ /*
+ * Check if the PTEs within a range are contiguous swap entries
+ * and have consistent swapcache, zeromap.
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2262,6 +2262,7 @@ static int shmem_swapin_folio(struct ino
+ folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
+ if (!folio) {
++ int nr_pages = 1 << order;
+ bool fallback_order0 = false;
+
+ /* Or update major stats only when swapin succeeds?? */
+@@ -2275,9 +2276,12 @@ static int shmem_swapin_folio(struct ino
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, then fallback
+ * to swapin order-0 folio, as well as for zswap case.
++ * Any existing sub folio in the swap cache also blocks
++ * mTHP swapin.
+ */
+ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
+- !zswap_never_enabled()))
++ !zswap_never_enabled() ||
++ non_swapcache_batch(swap, nr_pages) != nr_pages))
+ fallback_order0 = true;
+
+ /* Skip swapcache for synchronous device. */
+--- a/mm/swap.h
++++ b/mm/swap.h
+@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp
+ return find_next_bit(sis->zeromap, end, start) - start;
+ }
+
++static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
++{
++ struct swap_info_struct *si = swp_swap_info(entry);
++ pgoff_t offset = swp_offset(entry);
++ int i;
++
++ /*
++ * While allocating a large folio and doing mTHP swapin, we need to
++ * ensure all entries are not cached, otherwise, the mTHP folio will
++ * be in conflict with the folio in swap cache.
++ */
++ for (i = 0; i < max_nr; i++) {
++ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
++ return i;
++ }
++
++ return i;
++}
++
+ #else /* CONFIG_SWAP */
+ struct swap_iocb;
+ static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+@@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp
+ return 0;
+ }
+
++static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
++{
++ return 0;
++}
+ #endif /* CONFIG_SWAP */
+
+ #endif /* _MM_SWAP_H */
--- /dev/null
+From 0ea148a799198518d8ebab63ddd0bb6114a103bc Mon Sep 17 00:00:00 2001
+From: Kairui Song <kasong@tencent.com>
+Date: Wed, 4 Jun 2025 23:10:38 +0800
+Subject: mm: userfaultfd: fix race of userfaultfd_move and swap cache
+
+From: Kairui Song <kasong@tencent.com>
+
+commit 0ea148a799198518d8ebab63ddd0bb6114a103bc upstream.
+
+This commit fixes two kinds of races, they may have different results:
+
+Barry reported a BUG_ON in commit c50f8e6053b0; we may see the same
+BUG_ON if the filemap lookup returns NULL and a folio is added to the swap
+cache after that.
+
+If another kind of race is triggered (the folio changed after the lookup) we
+may see the RSS counter corrupted:
+
+[ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
+type:MM_ANONPAGES val:-1
+[ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
+type:MM_SHMEMPAGES val:1
+
+Because the folio is being accounted to the wrong VMA.
+
+I'm not sure if there will be any data corruption though; it seems not.
+The issues above are already critical.
+
+
+On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache
+lookup, and tries to move the found folio to the faulting vma. Currently,
+it relies on checking the PTE value to ensure that the moved folio still
+belongs to the src swap entry and that no new folio has been added to the
+swap cache, which turns out to be unreliable.
+
+While working and reviewing the swap table series with Barry, following
+existing races are observed and reproduced [1]:
+
+In the example below, move_pages_pte is moving src_pte to dst_pte, where
+src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the
+swap cache:
+
+CPU1 CPU2
+userfaultfd_move
+ move_pages_pte()
+ entry = pte_to_swp_entry(orig_src_pte);
+ // Here it got entry = S1
+ ... < interrupted> ...
+ <swapin src_pte, alloc and use folio A>
+ // folio A is a new allocated folio
+ // and get installed into src_pte
+ <frees swap entry S1>
+ // src_pte now points to folio A, S1
+ // has swap count == 0, it can be freed
+ // by folio_swap_swap or swap
+ // allocator's reclaim.
+ <try to swap out another folio B>
+ // folio B is a folio in another VMA.
+ <put folio B to swap cache using S1 >
+ // S1 is freed, folio B can use it
+ // for swap out with no problem.
+ ...
+ folio = filemap_get_folio(S1)
+ // Got folio B here !!!
+ ... < interrupted again> ...
+ <swapin folio B and free S1>
+ // Now S1 is free to be used again.
+ <swapout src_pte & folio A using S1>
+ // Now src_pte is a swap entry PTE
+ // holding S1 again.
+ folio_trylock(folio)
+ move_swap_pte
+ double_pt_lock
+ is_pte_pages_stable
+ // Check passed because src_pte == S1
+ folio_move_anon_rmap(...)
+ // Moved invalid folio B here !!!
+
+The race window is very short and requires multiple collisions of multiple
+rare events, so it's very unlikely to happen, but with a deliberately
+constructed reproducer and increased time window, it can be reproduced
+easily.
+
+This can be fixed by checking if the folio returned by filemap is the
+valid swap cache folio after acquiring the folio lock.
+
+Another similar race is possible: filemap_get_folio may return NULL, but
+folio (A) could be swapped in and then swapped out again using the same
+swap entry after the lookup. In such a case, folio (A) may remain in the
+swap cache, so it must be moved too:
+
+CPU1 CPU2
+userfaultfd_move
+ move_pages_pte()
+ entry = pte_to_swp_entry(orig_src_pte);
+ // Here it got entry = S1, and S1 is not in swap cache
+ folio = filemap_get_folio(S1)
+ // Got NULL
+ ... < interrupted again> ...
+ <swapin folio A and free S1>
+ <swapout folio A re-using S1>
+ move_swap_pte
+ double_pt_lock
+ is_pte_pages_stable
+ // Check passed because src_pte == S1
+ folio_move_anon_rmap(...)
+ // folio A is ignored !!!
+
+Fix this by checking the swap cache again after acquiring the src_pte
+lock. And to avoid the filemap overhead, we check swap_map directly [2].
+
+The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far
+we don't need to worry about that, since folios can only be exposed to the
+swap cache in the swap out path, and this is covered in this patch by
+checking the swap cache again after acquiring the src_pte lock.
+
+Testing with a simple C program that allocates and moves several GB of
+memory did not show any observable performance change.
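+
+For reference, an illustrative userspace fragment of the uABI entry point
+whose kernel-side swap handling is fixed here (the userfaultfd setup is
+omitted and names are made up):
+
+	#include <linux/userfaultfd.h>
+	#include <sys/ioctl.h>
+
+	/* 'uffd' must already have UFFDIO_API done and the dst range registered */
+	static int example_move(int uffd, unsigned long dst, unsigned long src,
+				unsigned long len)
+	{
+		struct uffdio_move mv = {
+			.dst = dst,
+			.src = src,
+			.len = len,
+			.mode = 0,
+		};
+
+		return ioctl(uffd, UFFDIO_MOVE, &mv);
+	}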
+
+Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com
+Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
+Signed-off-by: Kairui Song <kasong@tencent.com>
+Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoSt=9+JwP7Q@mail.gmail.com/ [1]
+Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8eT=A@mail.gmail.com/ [2]
+Reviewed-by: Lokesh Gidra <lokeshgidra@google.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Suren Baghdasaryan <surenb@google.com>
+Reviewed-by: Barry Song <baohua@kernel.org>
+Reviewed-by: Chris Li <chrisl@kernel.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Kairui Song <kasong@tencent.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++--
+ 1 file changed, 31 insertions(+), 2 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struc
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+- struct folio *src_folio)
++ struct folio *src_folio,
++ struct swap_info_struct *si, swp_entry_t entry)
+ {
++ /*
++ * Check if the folio still belongs to the target swap entry after
++ * acquiring the lock. Folio can be freed in the swap cache while
++ * not locked.
++ */
++ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
++ entry.val != src_folio->swap.val))
++ return -EAGAIN;
++
+ double_pt_lock(dst_ptl, src_ptl);
+
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struc
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
++ } else {
++ /*
++ * Check if the swap entry is cached after acquiring the src_pte
++ * lock. Otherwise, we might miss a newly loaded swap cache folio.
++ *
++ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
++ * We are trying to catch newly added swap cache, the only possible case is
++ * when a folio is swapped in and out again staying in swap cache, using the
++ * same entry before the PTE check above. The PTL is acquired and released
++ * twice, each time after updating the swap_map's flag. So holding
++ * the PTL here ensures we see the updated value. False positive is possible,
++ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
++ * cache, or during the tiny synchronization window between swap cache and
++ * swap_map, but it will be gone very quickly, worst result is retry jitters.
++ */
++ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
++ double_pt_unlock(dst_ptl, src_ptl);
++ return -EAGAIN;
++ }
+ }
+
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+@@ -1412,7 +1441,7 @@ retry:
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+- dst_ptl, src_ptl, src_folio);
++ dst_ptl, src_ptl, src_folio, si, entry);
+ }
+
+ out:
--- /dev/null
+From 85720e04d9af0b77f8092b12a06661a8d459d4a0 Mon Sep 17 00:00:00 2001
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+Date: Wed, 25 Jun 2025 10:39:24 +0800
+Subject: net: libwx: fix the creation of page_pool
+
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+
+commit 85720e04d9af0b77f8092b12a06661a8d459d4a0 upstream.
+
+'rx_ring->size' is the count of ring descriptors multiplied by the
+size of one descriptor. When the count of ring descriptors is increased,
+'rx_ring->size' may exceed the pool size limit.
+
+[ 864.209610] page_pool_create_percpu() gave up with errno -7
+[ 864.209613] txgbe 0000:11:00.0: Page pool creation failed: -7
+
+Fix this by setting pool_size to the count of ring descriptors.
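+
+For example (descriptor size assumed for illustration, not taken from this
+patch): with 8192 descriptors of 16 bytes each, rx_ring->size is 131072,
+far beyond what page_pool accepts for pool_size and hence the -7 (-E2BIG)
+above, while rx_ring->count is just 8192.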
+
+Fixes: 850b971110b2 ("net: libwx: Allocate Rx and Tx resources")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Mina Almasry <almasrymina@google.com>
+Link: https://patch.msgid.link/434C72BFB40E350A+20250625023924.21821-1-jiawenwu@trustnetic.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/wangxun/libwx/wx_lib.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c
++++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c
+@@ -2496,7 +2496,7 @@ static int wx_alloc_page_pool(struct wx_
+ struct page_pool_params pp_params = {
+ .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
+ .order = 0,
+- .pool_size = rx_ring->size,
++ .pool_size = rx_ring->count,
+ .nid = dev_to_node(rx_ring->dev),
+ .dev = rx_ring->dev,
+ .dma_dir = DMA_FROM_DEVICE,
--- /dev/null
+From 48fd7ebe00c1cdc782b42576548b25185902f64c Mon Sep 17 00:00:00 2001
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+Date: Sun, 15 Jun 2025 04:23:52 +0800
+Subject: Revert "bcache: remove heap-related macros and switch to generic min_heap"
+
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+
+commit 48fd7ebe00c1cdc782b42576548b25185902f64c upstream.
+
+This reverts commit 866898efbb25bb44fd42848318e46db9e785973a.
+
+The generic bottom-up min_heap implementation causes a performance
+regression in invalidate_buckets_lru(), a hot path in bcache. Before the
+cache is fully populated, new_bucket_prio() often returns zero, leading to
+many equal comparisons. In such cases, bottom-up sift_down performs up to
+2 * log2(n) comparisons, while the original top-down approach completes
+with just O(1) comparisons, resulting in a measurable performance gap.
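+
+As a rough illustration (numbers chosen for the example, not measured): for
+a heap of n = 1024 buckets whose priorities are all equal, bottom-up
+sift_down spends about 2 * log2(1024) = 20 comparisons per call, while the
+top-down variant stops after comparing the element with its two children,
+i.e. roughly 2 comparisons.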
+
+The performance degradation is further worsened by the non-inlined
+min_heap API functions introduced in commit 92a8b224b833 ("lib/min_heap:
+introduce non-inline versions of min heap API functions"), adding function
+call overhead to this critical path.
+
+As reported by Robert, bcache now suffers from latency spikes, with P100
+(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes.
+These regressions degrade bcache's effectiveness as a low-latency cache
+layer and lead to frequent timeouts and application stalls in production
+environments.
+
+This revert aims to restore bcache's original low-latency behavior.
+
+Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
+Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com
+Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap")
+Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions")
+Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
+Reported-by: Robert Pang <robertpang@google.com>
+Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
+Acked-by: Coly Li <colyli@kernel.org>
+Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
+Cc: Kent Overstreet <kent.overstreet@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/bcache/alloc.c | 64 +++++----------------
+ drivers/md/bcache/bcache.h | 2
+ drivers/md/bcache/bset.c | 124 +++++++++++++++---------------------------
+ drivers/md/bcache/bset.h | 42 ++++++++------
+ drivers/md/bcache/btree.c | 69 +++++++++--------------
+ drivers/md/bcache/extents.c | 51 ++++++-----------
+ drivers/md/bcache/movinggc.c | 41 +++----------
+ drivers/md/bcache/super.c | 3 -
+ drivers/md/bcache/sysfs.c | 4 -
+ drivers/md/bcache/util.h | 67 ++++++++++++++++++++++
+ drivers/md/bcache/writeback.c | 13 +---
+ 11 files changed, 217 insertions(+), 263 deletions(-)
+
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -164,68 +164,40 @@ static void bch_invalidate_one_bucket(st
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+-static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b)
+-{
+- unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;
+-
+- return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);
+-}
+-
+-static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args)
+-{
+- struct bucket **lhs = (struct bucket **)l;
+- struct bucket **rhs = (struct bucket **)r;
+- struct cache *ca = args;
+-
+- return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs);
+-}
+-
+-static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args)
+-{
+- struct bucket **lhs = (struct bucket **)l;
+- struct bucket **rhs = (struct bucket **)r;
+- struct cache *ca = args;
+-
+- return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs);
+-}
+-
+-static inline void new_bucket_swap(void *l, void *r, void __always_unused *args)
+-{
+- struct bucket **lhs = l, **rhs = r;
++#define bucket_prio(b) \
++({ \
++ unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
++ \
++ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
++})
+
+- swap(*lhs, *rhs);
+-}
++#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
++#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
+
+ static void invalidate_buckets_lru(struct cache *ca)
+ {
+ struct bucket *b;
+- const struct min_heap_callbacks bucket_max_cmp_callback = {
+- .less = new_bucket_max_cmp,
+- .swp = new_bucket_swap,
+- };
+- const struct min_heap_callbacks bucket_min_cmp_callback = {
+- .less = new_bucket_min_cmp,
+- .swp = new_bucket_swap,
+- };
++ ssize_t i;
+
+- ca->heap.nr = 0;
++ ca->heap.used = 0;
+
+ for_each_bucket(b, ca) {
+ if (!bch_can_invalidate_bucket(ca, b))
+ continue;
+
+- if (!min_heap_full(&ca->heap))
+- min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca);
+- else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) {
++ if (!heap_full(&ca->heap))
++ heap_add(&ca->heap, b, bucket_max_cmp);
++ else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+ ca->heap.data[0] = b;
+- min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca);
++ heap_sift(&ca->heap, 0, bucket_max_cmp);
+ }
+ }
+
+- min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca);
++ for (i = ca->heap.used / 2 - 1; i >= 0; --i)
++ heap_sift(&ca->heap, i, bucket_min_cmp);
+
+ while (!fifo_full(&ca->free_inc)) {
+- if (!ca->heap.nr) {
++ if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
+ /*
+ * We don't want to be calling invalidate_buckets()
+ * multiple times when it can't do anything
+@@ -234,8 +206,6 @@ static void invalidate_buckets_lru(struc
+ wake_up_gc(ca->set);
+ return;
+ }
+- b = min_heap_peek(&ca->heap)[0];
+- min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca);
+
+ bch_invalidate_one_bucket(ca, b);
+ }
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -458,7 +458,7 @@ struct cache {
+ /* Allocation stuff: */
+ struct bucket *buckets;
+
+- DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap;
++ DECLARE_HEAP(struct bucket *, heap);
+
+ /*
+ * If nonzero, we know we aren't going to find any buckets to invalidate
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *
+ int __bch_count_data(struct btree_keys *b)
+ {
+ unsigned int ret = 0;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct bkey *k;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ if (b->ops->is_extents)
+ for_each_key(b, k, &iter)
+ ret += KEY_SIZE(k);
+@@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys
+ {
+ va_list args;
+ struct bkey *k, *p = NULL;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ const char *err;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ for_each_key(b, k, &iter) {
+ if (b->ops->is_extents) {
+ err = "Keys out of order";
+@@ -114,9 +110,9 @@ bug:
+
+ static void bch_btree_iter_next_check(struct btree_iter *iter)
+ {
+- struct bkey *k = iter->heap.data->k, *next = bkey_next(k);
++ struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+- if (next < iter->heap.data->end &&
++ if (next < iter->data->end &&
+ bkey_cmp(k, iter->b->ops->is_extents ?
+ &START_KEY(next) : next) > 0) {
+ bch_dump_bucket(iter->b);
+@@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct
+ unsigned int status = BTREE_INSERT_STATUS_NO_INSERT;
+ struct bset *i = bset_tree_last(b)->data;
+ struct bkey *m, *prev = NULL;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
+
+ BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ /*
+ * If k has preceding key, preceding_key_p will be set to address
+ * of k's preceding key; otherwise preceding_key_p will be set
+@@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct
+ else
+ preceding_key(k, &preceding_key_p);
+
+- m = bch_btree_iter_init(b, &iter, preceding_key_p);
++ m = bch_btree_iter_stack_init(b, &iter, preceding_key_p);
+
+- if (b->ops->insert_fixup(b, k, &iter, replace_key))
++ if (b->ops->insert_fixup(b, k, &iter.iter, replace_key))
+ return status;
+
+ status = BTREE_INSERT_STATUS_INSERT;
+@@ -1083,102 +1077,79 @@ struct bkey *__bch_bset_search(struct bt
+
+ /* Btree iterator */
+
+-typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *);
+-
+-static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args)
+-{
+- const struct btree_iter_set *_l = l;
+- const struct btree_iter_set *_r = r;
+-
+- return bkey_cmp(_l->k, _r->k) <= 0;
+-}
++typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
++ struct btree_iter_set);
+
+-static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
++static inline bool btree_iter_cmp(struct btree_iter_set l,
++ struct btree_iter_set r)
+ {
+- struct btree_iter_set *_iter1 = iter1;
+- struct btree_iter_set *_iter2 = iter2;
+-
+- swap(*_iter1, *_iter2);
++ return bkey_cmp(l.k, r.k) > 0;
+ }
+
+ static inline bool btree_iter_end(struct btree_iter *iter)
+ {
+- return !iter->heap.nr;
++ return !iter->used;
+ }
+
+ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+ struct bkey *end)
+ {
+- const struct min_heap_callbacks callbacks = {
+- .less = new_btree_iter_cmp,
+- .swp = new_btree_iter_swap,
+- };
+-
+ if (k != end)
+- BUG_ON(!min_heap_push(&iter->heap,
+- &((struct btree_iter_set) { k, end }),
+- &callbacks,
+- NULL));
++ BUG_ON(!heap_add(iter,
++ ((struct btree_iter_set) { k, end }),
++ btree_iter_cmp));
+ }
+
+-static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
+- struct btree_iter *iter,
+- struct bkey *search,
+- struct bset_tree *start)
++static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b,
++ struct btree_iter_stack *iter,
++ struct bkey *search,
++ struct bset_tree *start)
+ {
+ struct bkey *ret = NULL;
+
+- iter->heap.size = ARRAY_SIZE(iter->heap.preallocated);
+- iter->heap.nr = 0;
++ iter->iter.size = ARRAY_SIZE(iter->stack_data);
++ iter->iter.used = 0;
+
+ #ifdef CONFIG_BCACHE_DEBUG
+- iter->b = b;
++ iter->iter.b = b;
+ #endif
+
+ for (; start <= bset_tree_last(b); start++) {
+ ret = bch_bset_search(b, start, search);
+- bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
++ bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data));
+ }
+
+ return ret;
+ }
+
+-struct bkey *bch_btree_iter_init(struct btree_keys *b,
+- struct btree_iter *iter,
++struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
++ struct btree_iter_stack *iter,
+ struct bkey *search)
+ {
+- return __bch_btree_iter_init(b, iter, search, b->set);
++ return __bch_btree_iter_stack_init(b, iter, search, b->set);
+ }
+
+ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+- new_btree_iter_cmp_fn *cmp)
++ btree_iter_cmp_fn *cmp)
+ {
+ struct btree_iter_set b __maybe_unused;
+ struct bkey *ret = NULL;
+- const struct min_heap_callbacks callbacks = {
+- .less = cmp,
+- .swp = new_btree_iter_swap,
+- };
+
+ if (!btree_iter_end(iter)) {
+ bch_btree_iter_next_check(iter);
+
+- ret = iter->heap.data->k;
+- iter->heap.data->k = bkey_next(iter->heap.data->k);
++ ret = iter->data->k;
++ iter->data->k = bkey_next(iter->data->k);
+
+- if (iter->heap.data->k > iter->heap.data->end) {
++ if (iter->data->k > iter->data->end) {
+ WARN_ONCE(1, "bset was corrupt!\n");
+- iter->heap.data->k = iter->heap.data->end;
++ iter->data->k = iter->data->end;
+ }
+
+- if (iter->heap.data->k == iter->heap.data->end) {
+- if (iter->heap.nr) {
+- b = min_heap_peek(&iter->heap)[0];
+- min_heap_pop(&iter->heap, &callbacks, NULL);
+- }
+- }
++ if (iter->data->k == iter->data->end)
++ heap_pop(iter, b, cmp);
+ else
+- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
++ heap_sift(iter, 0, cmp);
+ }
+
+ return ret;
+@@ -1186,7 +1157,7 @@ static inline struct bkey *__bch_btree_i
+
+ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+ {
+- return __bch_btree_iter_next(iter, new_btree_iter_cmp);
++ return __bch_btree_iter_next(iter, btree_iter_cmp);
+
+ }
+
+@@ -1224,18 +1195,16 @@ static void btree_mergesort(struct btree
+ struct btree_iter *iter,
+ bool fixup, bool remove_stale)
+ {
++ int i;
+ struct bkey *k, *last = NULL;
+ BKEY_PADDED(k) tmp;
+ bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
+ ? bch_ptr_bad
+ : bch_ptr_invalid;
+- const struct min_heap_callbacks callbacks = {
+- .less = b->ops->sort_cmp,
+- .swp = new_btree_iter_swap,
+- };
+
+ /* Heapify the iterator, using our comparison function */
+- min_heapify_all(&iter->heap, &callbacks, NULL);
++ for (i = iter->used / 2 - 1; i >= 0; --i)
++ heap_sift(iter, i, b->ops->sort_cmp);
+
+ while (!btree_iter_end(iter)) {
+ if (b->ops->sort_fixup && fixup)
+@@ -1324,11 +1293,10 @@ void bch_btree_sort_partial(struct btree
+ struct bset_sort_state *state)
+ {
+ size_t order = b->page_order, keys = 0;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ int oldsize = bch_count_data(b);
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+- __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
++ __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]);
+
+ if (start) {
+ unsigned int i;
+@@ -1339,7 +1307,7 @@ void bch_btree_sort_partial(struct btree
+ order = get_order(__set_bytes(b->set->data, keys));
+ }
+
+- __btree_sort(b, &iter, start, order, false, state);
++ __btree_sort(b, &iter.iter, start, order, false, state);
+
+ EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
+ }
+@@ -1355,13 +1323,11 @@ void bch_btree_sort_into(struct btree_ke
+ struct bset_sort_state *state)
+ {
+ uint64_t start_time = local_clock();
+- struct btree_iter iter;
+-
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
++ struct btree_iter_stack iter;
+
+- bch_btree_iter_init(b, &iter, NULL);
++ bch_btree_iter_stack_init(b, &iter, NULL);
+
+- btree_mergesort(b, new->set->data, &iter, false, true);
++ btree_mergesort(b, new->set->data, &iter.iter, false, true);
+
+ bch_time_stats_update(&state->time, start_time);
+
+--- a/drivers/md/bcache/bset.h
++++ b/drivers/md/bcache/bset.h
+@@ -187,9 +187,8 @@ struct bset_tree {
+ };
+
+ struct btree_keys_ops {
+- bool (*sort_cmp)(const void *l,
+- const void *r,
+- void *args);
++ bool (*sort_cmp)(struct btree_iter_set l,
++ struct btree_iter_set r);
+ struct bkey *(*sort_fixup)(struct btree_iter *iter,
+ struct bkey *tmp);
+ bool (*insert_fixup)(struct btree_keys *b,
+@@ -313,17 +312,23 @@ enum {
+ BTREE_INSERT_STATUS_FRONT_MERGE,
+ };
+
+-struct btree_iter_set {
+- struct bkey *k, *end;
+-};
+-
+ /* Btree key iteration */
+
+ struct btree_iter {
++ size_t size, used;
+ #ifdef CONFIG_BCACHE_DEBUG
+ struct btree_keys *b;
+ #endif
+- MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap;
++ struct btree_iter_set {
++ struct bkey *k, *end;
++ } data[];
++};
++
++/* Fixed-size btree_iter that can be allocated on the stack */
++
++struct btree_iter_stack {
++ struct btree_iter iter;
++ struct btree_iter_set stack_data[MAX_BSETS];
+ };
+
+ typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
+@@ -335,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(
+
+ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+ struct bkey *end);
+-struct bkey *bch_btree_iter_init(struct btree_keys *b,
+- struct btree_iter *iter,
+- struct bkey *search);
++struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
++ struct btree_iter_stack *iter,
++ struct bkey *search);
+
+ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
+ const struct bkey *search);
+@@ -352,13 +357,14 @@ static inline struct bkey *bch_bset_sear
+ return search ? __bch_bset_search(b, t, search) : t->data->start;
+ }
+
+-#define for_each_key_filter(b, k, iter, filter) \
+- for (bch_btree_iter_init((b), (iter), NULL); \
+- ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+-
+-#define for_each_key(b, k, iter) \
+- for (bch_btree_iter_init((b), (iter), NULL); \
+- ((k) = bch_btree_iter_next(iter));)
++#define for_each_key_filter(b, k, stack_iter, filter) \
++ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
++ ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \
++ filter));)
++
++#define for_each_key(b, k, stack_iter) \
++ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
++ ((k) = bch_btree_iter_next(&((stack_iter)->iter)));)
+
+ /* Sorting */
+
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btr
+ {
+ const char *err = "bad btree header";
+ struct bset *i = btree_bset_first(b);
+- struct btree_iter iter;
++ struct btree_iter *iter;
+
+ /*
+ * c->fill_iter can allocate an iterator with more memory space
+ * than static MAX_BSETS.
+ * See the comment arount cache_set->fill_iter.
+ */
+- iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
+- iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
+- iter.heap.nr = 0;
++ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
++ iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
++ iter->used = 0;
+
+ #ifdef CONFIG_BCACHE_DEBUG
+- iter.b = &b->keys;
++ iter->b = &b->keys;
+ #endif
+
+ if (!i->seq)
+@@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btr
+ if (i != b->keys.set[0].data && !i->keys)
+ goto err;
+
+- bch_btree_iter_push(&iter, i->start, bset_bkey_last(i));
++ bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
+
+ b->written += set_blocks(i, block_bytes(b->c->cache));
+ }
+@@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btr
+ if (i->seq == b->keys.set[0].data->seq)
+ goto err;
+
+- bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort);
++ bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
+
+ i = b->keys.set[0].data;
+ err = "short btree key";
+@@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btr
+ bch_bset_init_next(&b->keys, write_block(b),
+ bset_magic(&b->c->cache->sb));
+ out:
+- mempool_free(iter.heap.data, &b->c->fill_iter);
++ mempool_free(iter, &b->c->fill_iter);
+ return;
+ err:
+ set_btree_node_io_error(b);
+@@ -1309,11 +1309,9 @@ static bool btree_gc_mark_node(struct bt
+ uint8_t stale = 0;
+ unsigned int keys = 0, good_keys = 0;
+ struct bkey *k;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct bset_tree *t;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ gc->nodes++;
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
+@@ -1572,11 +1570,9 @@ static int btree_gc_rewrite_node(struct
+ static unsigned int btree_gc_count_keys(struct btree *b)
+ {
+ struct bkey *k;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ unsigned int ret = 0;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+ ret += bkey_u64s(k);
+
+@@ -1615,18 +1611,18 @@ static int btree_gc_recurse(struct btree
+ int ret = 0;
+ bool should_rewrite;
+ struct bkey *k;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct gc_merge_info r[GC_MERGE_NODES];
+ struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+- bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
++ bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done);
+
+ for (i = r; i < r + ARRAY_SIZE(r); i++)
+ i->b = ERR_PTR(-EINTR);
+
+ while (1) {
+- k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
++ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
++ bch_ptr_bad);
+ if (k) {
+ r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+ true, b);
+@@ -1921,9 +1917,7 @@ static int bch_btree_check_recurse(struc
+ {
+ int ret = 0;
+ struct bkey *k, *p = NULL;
+- struct btree_iter iter;
+-
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
++ struct btree_iter_stack iter;
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+ bch_initial_mark_key(b->c, b->level, k);
+@@ -1931,10 +1925,10 @@ static int bch_btree_check_recurse(struc
+ bch_initial_mark_key(b->c, b->level + 1, &b->key);
+
+ if (b->level) {
+- bch_btree_iter_init(&b->keys, &iter, NULL);
++ bch_btree_iter_stack_init(&b->keys, &iter, NULL);
+
+ do {
+- k = bch_btree_iter_next_filter(&iter, &b->keys,
++ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad);
+ if (k) {
+ btree_node_prefetch(b, k);
+@@ -1962,7 +1956,7 @@ static int bch_btree_check_thread(void *
+ struct btree_check_info *info = arg;
+ struct btree_check_state *check_state = info->state;
+ struct cache_set *c = check_state->c;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct bkey *k, *p;
+ int cur_idx, prev_idx, skip_nr;
+
+@@ -1970,11 +1964,9 @@ static int bch_btree_check_thread(void *
+ cur_idx = prev_idx = 0;
+ ret = 0;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ /* root node keys are checked before thread created */
+- bch_btree_iter_init(&c->root->keys, &iter, NULL);
+- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
++ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
++ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
+ BUG_ON(!k);
+
+ p = k;
+@@ -1992,7 +1984,7 @@ static int bch_btree_check_thread(void *
+ skip_nr = cur_idx - prev_idx;
+
+ while (skip_nr) {
+- k = bch_btree_iter_next_filter(&iter,
++ k = bch_btree_iter_next_filter(&iter.iter,
+ &c->root->keys,
+ bch_ptr_bad);
+ if (k)
+@@ -2065,11 +2057,9 @@ int bch_btree_check(struct cache_set *c)
+ int ret = 0;
+ int i;
+ struct bkey *k = NULL;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct btree_check_state check_state;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ /* check and mark root node keys */
+ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+ bch_initial_mark_key(c, c->root->level, k);
+@@ -2563,12 +2553,11 @@ static int bch_btree_map_nodes_recurse(s
+
+ if (b->level) {
+ struct bkey *k;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+- bch_btree_iter_init(&b->keys, &iter, from);
++ bch_btree_iter_stack_init(&b->keys, &iter, from);
+
+- while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
++ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad))) {
+ ret = bcache_btree(map_nodes_recurse, k, b,
+ op, from, fn, flags);
+@@ -2597,12 +2586,12 @@ int bch_btree_map_keys_recurse(struct bt
+ {
+ int ret = MAP_CONTINUE;
+ struct bkey *k;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+- bch_btree_iter_init(&b->keys, &iter, from);
++ bch_btree_iter_stack_init(&b->keys, &iter, from);
+
+- while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
++ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
++ bch_ptr_bad))) {
+ ret = !b->level
+ ? fn(op, b, k)
+ : bcache_btree(map_keys_recurse, k,
+--- a/drivers/md/bcache/extents.c
++++ b/drivers/md/bcache/extents.c
+@@ -33,16 +33,15 @@ static void sort_key_next(struct btree_i
+ i->k = bkey_next(i->k);
+
+ if (i->k == i->end)
+- *i = iter->heap.data[--iter->heap.nr];
++ *i = iter->data[--iter->used];
+ }
+
+-static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args)
++static bool bch_key_sort_cmp(struct btree_iter_set l,
++ struct btree_iter_set r)
+ {
+- struct btree_iter_set *_l = (struct btree_iter_set *)l;
+- struct btree_iter_set *_r = (struct btree_iter_set *)r;
+- int64_t c = bkey_cmp(_l->k, _r->k);
++ int64_t c = bkey_cmp(l.k, r.k);
+
+- return !(c ? c > 0 : _l->k < _r->k);
++ return c ? c > 0 : l.k < r.k;
+ }
+
+ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
+@@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(s
+ }
+
+ const struct btree_keys_ops bch_btree_keys_ops = {
+- .sort_cmp = new_bch_key_sort_cmp,
++ .sort_cmp = bch_key_sort_cmp,
+ .insert_fixup = bch_btree_ptr_insert_fixup,
+ .key_invalid = bch_btree_ptr_invalid,
+ .key_bad = bch_btree_ptr_bad,
+@@ -256,36 +255,22 @@ const struct btree_keys_ops bch_btree_ke
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+-
+-static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args)
+-{
+- struct btree_iter_set *_l = (struct btree_iter_set *)l;
+- struct btree_iter_set *_r = (struct btree_iter_set *)r;
+- int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k));
+-
+- return !(c ? c > 0 : _l->k < _r->k);
+-}
+-
+-static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
++static bool bch_extent_sort_cmp(struct btree_iter_set l,
++ struct btree_iter_set r)
+ {
+- struct btree_iter_set *_iter1 = iter1;
+- struct btree_iter_set *_iter2 = iter2;
++ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+- swap(*_iter1, *_iter2);
++ return c ? c > 0 : l.k < r.k;
+ }
+
+ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+ struct bkey *tmp)
+ {
+- const struct min_heap_callbacks callbacks = {
+- .less = new_bch_extent_sort_cmp,
+- .swp = new_btree_iter_swap,
+- };
+- while (iter->heap.nr > 1) {
+- struct btree_iter_set *top = iter->heap.data, *i = top + 1;
++ while (iter->used > 1) {
++ struct btree_iter_set *top = iter->data, *i = top + 1;
+
+- if (iter->heap.nr > 2 &&
+- !new_bch_extent_sort_cmp(&i[0], &i[1], NULL))
++ if (iter->used > 2 &&
++ bch_extent_sort_cmp(i[0], i[1]))
+ i++;
+
+ if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
+@@ -293,7 +278,7 @@ static struct bkey *bch_extent_sort_fixu
+
+ if (!KEY_SIZE(i->k)) {
+ sort_key_next(iter, i);
+- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
++ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ continue;
+ }
+
+@@ -303,7 +288,7 @@ static struct bkey *bch_extent_sort_fixu
+ else
+ bch_cut_front(top->k, i->k);
+
+- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
++ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ } else {
+ /* can't happen because of comparison func */
+ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+@@ -313,7 +298,7 @@ static struct bkey *bch_extent_sort_fixu
+
+ bch_cut_back(&START_KEY(i->k), tmp);
+ bch_cut_front(i->k, top->k);
+- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
++ heap_sift(iter, 0, bch_extent_sort_cmp);
+
+ return tmp;
+ } else {
+@@ -633,7 +618,7 @@ static bool bch_extent_merge(struct btre
+ }
+
+ const struct btree_keys_ops bch_extent_keys_ops = {
+- .sort_cmp = new_bch_extent_sort_cmp,
++ .sort_cmp = bch_extent_sort_cmp,
+ .sort_fixup = bch_extent_sort_fixup,
+ .insert_fixup = bch_extent_insert_fixup,
+ .key_invalid = bch_extent_invalid,
+--- a/drivers/md/bcache/movinggc.c
++++ b/drivers/md/bcache/movinggc.c
+@@ -182,27 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private))
+ closure_sync(&cl);
+ }
+
+-static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args)
++static bool bucket_cmp(struct bucket *l, struct bucket *r)
+ {
+- struct bucket **_l = (struct bucket **)l;
+- struct bucket **_r = (struct bucket **)r;
+-
+- return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r);
+-}
+-
+-static void new_bucket_swap(void *l, void *r, void __always_unused *args)
+-{
+- struct bucket **_l = l;
+- struct bucket **_r = r;
+-
+- swap(*_l, *_r);
++ return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+ }
+
+ static unsigned int bucket_heap_top(struct cache *ca)
+ {
+ struct bucket *b;
+
+- return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0;
++ return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
+ }
+
+ void bch_moving_gc(struct cache_set *c)
+@@ -210,10 +199,6 @@ void bch_moving_gc(struct cache_set *c)
+ struct cache *ca = c->cache;
+ struct bucket *b;
+ unsigned long sectors_to_move, reserve_sectors;
+- const struct min_heap_callbacks callbacks = {
+- .less = new_bucket_cmp,
+- .swp = new_bucket_swap,
+- };
+
+ if (!c->copy_gc_enabled)
+ return;
+@@ -224,7 +209,7 @@ void bch_moving_gc(struct cache_set *c)
+ reserve_sectors = ca->sb.bucket_size *
+ fifo_used(&ca->free[RESERVE_MOVINGGC]);
+
+- ca->heap.nr = 0;
++ ca->heap.used = 0;
+
+ for_each_bucket(b, ca) {
+ if (GC_MARK(b) == GC_MARK_METADATA ||
+@@ -233,31 +218,25 @@ void bch_moving_gc(struct cache_set *c)
+ atomic_read(&b->pin))
+ continue;
+
+- if (!min_heap_full(&ca->heap)) {
++ if (!heap_full(&ca->heap)) {
+ sectors_to_move += GC_SECTORS_USED(b);
+- min_heap_push(&ca->heap, &b, &callbacks, NULL);
+- } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) {
++ heap_add(&ca->heap, b, bucket_cmp);
++ } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+ sectors_to_move -= bucket_heap_top(ca);
+ sectors_to_move += GC_SECTORS_USED(b);
+
+ ca->heap.data[0] = b;
+- min_heap_sift_down(&ca->heap, 0, &callbacks, NULL);
++ heap_sift(&ca->heap, 0, bucket_cmp);
+ }
+ }
+
+ while (sectors_to_move > reserve_sectors) {
+- if (ca->heap.nr) {
+- b = min_heap_peek(&ca->heap)[0];
+- min_heap_pop(&ca->heap, &callbacks, NULL);
+- }
++ heap_pop(&ca->heap, b, bucket_cmp);
+ sectors_to_move -= GC_SECTORS_USED(b);
+ }
+
+- while (ca->heap.nr) {
+- b = min_heap_peek(&ca->heap)[0];
+- min_heap_pop(&ca->heap, &callbacks, NULL);
++ while (heap_pop(&ca->heap, b, bucket_cmp))
+ SET_GC_MOVE(b, 1);
+- }
+
+ mutex_unlock(&c->bucket_lock);
+
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1912,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(st
+ INIT_LIST_HEAD(&c->btree_cache_freed);
+ INIT_LIST_HEAD(&c->data_buckets);
+
+- iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
++ iter_size = sizeof(struct btree_iter) +
++ ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
+ sizeof(struct btree_iter_set);
+
+ c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -660,9 +660,7 @@ static unsigned int bch_root_usage(struc
+ unsigned int bytes = 0;
+ struct bkey *k;
+ struct btree *b;
+- struct btree_iter iter;
+-
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
++ struct btree_iter_stack iter;
+
+ goto lock_root;
+
+--- a/drivers/md/bcache/util.h
++++ b/drivers/md/bcache/util.h
+@@ -9,7 +9,6 @@
+ #include <linux/kernel.h>
+ #include <linux/sched/clock.h>
+ #include <linux/llist.h>
+-#include <linux/min_heap.h>
+ #include <linux/ratelimit.h>
+ #include <linux/vmalloc.h>
+ #include <linux/workqueue.h>
+@@ -31,10 +30,16 @@ struct closure;
+
+ #endif
+
++#define DECLARE_HEAP(type, name) \
++ struct { \
++ size_t size, used; \
++ type *data; \
++ } name
++
+ #define init_heap(heap, _size, gfp) \
+ ({ \
+ size_t _bytes; \
+- (heap)->nr = 0; \
++ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
+@@ -47,6 +52,64 @@ do { \
+ (heap)->data = NULL; \
+ } while (0)
+
++#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
++
++#define heap_sift(h, i, cmp) \
++do { \
++ size_t _r, _j = i; \
++ \
++ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
++ _r = _j * 2 + 1; \
++ if (_r + 1 < (h)->used && \
++ cmp((h)->data[_r], (h)->data[_r + 1])) \
++ _r++; \
++ \
++ if (cmp((h)->data[_r], (h)->data[_j])) \
++ break; \
++ heap_swap(h, _r, _j); \
++ } \
++} while (0)
++
++#define heap_sift_down(h, i, cmp) \
++do { \
++ while (i) { \
++ size_t p = (i - 1) / 2; \
++ if (cmp((h)->data[i], (h)->data[p])) \
++ break; \
++ heap_swap(h, i, p); \
++ i = p; \
++ } \
++} while (0)
++
++#define heap_add(h, d, cmp) \
++({ \
++ bool _r = !heap_full(h); \
++ if (_r) { \
++ size_t _i = (h)->used++; \
++ (h)->data[_i] = d; \
++ \
++ heap_sift_down(h, _i, cmp); \
++ heap_sift(h, _i, cmp); \
++ } \
++ _r; \
++})
++
++#define heap_pop(h, d, cmp) \
++({ \
++ bool _r = (h)->used; \
++ if (_r) { \
++ (d) = (h)->data[0]; \
++ (h)->used--; \
++ heap_swap(h, 0, (h)->used); \
++ heap_sift(h, 0, cmp); \
++ } \
++ _r; \
++})
++
++#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
++
++#define heap_full(h) ((h)->used == (h)->size)
++
+ #define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size, mask; \
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -908,16 +908,15 @@ static int bch_dirty_init_thread(void *a
+ struct dirty_init_thrd_info *info = arg;
+ struct bch_dirty_init_state *state = info->state;
+ struct cache_set *c = state->c;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct bkey *k, *p;
+ int cur_idx, prev_idx, skip_nr;
+
+ k = p = NULL;
+ prev_idx = 0;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+- bch_btree_iter_init(&c->root->keys, &iter, NULL);
+- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
++ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
++ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
+ BUG_ON(!k);
+
+ p = k;
+@@ -931,7 +930,7 @@ static int bch_dirty_init_thread(void *a
+ skip_nr = cur_idx - prev_idx;
+
+ while (skip_nr) {
+- k = bch_btree_iter_next_filter(&iter,
++ k = bch_btree_iter_next_filter(&iter.iter,
+ &c->root->keys,
+ bch_ptr_bad);
+ if (k)
+@@ -980,13 +979,11 @@ void bch_sectors_dirty_init(struct bcach
+ int i;
+ struct btree *b = NULL;
+ struct bkey *k = NULL;
+- struct btree_iter iter;
++ struct btree_iter_stack iter;
+ struct sectors_dirty_init op;
+ struct cache_set *c = d->c;
+ struct bch_dirty_init_state state;
+
+- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+-
+ retry_lock:
+ b = c->root;
+ rw_lock(0, b, b->level);
--- /dev/null
+From 845f1f2d69f3f49b3d8c142265952c8257e3368c Mon Sep 17 00:00:00 2001
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+Date: Sun, 15 Jun 2025 04:23:51 +0800
+Subject: Revert "bcache: update min_heap_callbacks to use default builtin swap"
+
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+
+commit 845f1f2d69f3f49b3d8c142265952c8257e3368c upstream.
+
+Patch series "bcache: Revert min_heap migration due to performance
+regression".
+
+This patch series reverts the migration of bcache from its original heap
+implementation to the generic min_heap library. While the original change
+aimed to simplify the code and improve maintainability, it introduced a
+severe performance regression in real-world scenarios.
+
+As reported by Robert, systems using bcache now suffer from periodic
+latency spikes, with P100 (max) latency increasing from 600 ms to 2.4
+seconds every 5 minutes. This degrades bcache's value as a low-latency
+caching layer, and leads to frequent timeouts and application stalls in
+production environments.
+
+The primary cause of this regression is the behavior of the generic
+min_heap implementation's bottom-up sift_down, which performs up to 2 *
+log2(n) comparisons when many elements are equal. The original top-down
+variant used by bcache only required O(1) comparisons in such cases. The
+issue was further exacerbated by commit 92a8b224b833 ("lib/min_heap:
+introduce non-inline versions of min heap API functions"), which
+introduced non-inlined versions of the min_heap API, adding function call
+overhead to a performance-critical hot path.
+
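+A rough userspace sketch of the effect (editorial addition, not part of
+the upstream commit; the counts are estimates for equal elements, not
+measurements):
+
+	#include <math.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		/*
+		 * With all elements equal, a top-down sift_down stops after
+		 * comparing the root against its two children, while the
+		 * bottom-up variant walks down to a leaf and then sifts the
+		 * element back up, costing about 2 * log2(n) comparisons.
+		 */
+		for (unsigned int n = 16; n <= 4096; n *= 16) {
+			unsigned int top_down = 2;
+			unsigned int bottom_up = 2 * (unsigned int)log2(n);
+
+			printf("n=%4u: ~%u vs ~%u comparisons per sift\n",
+			       n, top_down, bottom_up);
+		}
+		return 0;
+	}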
+
+This patch (of 3):
+
+This reverts commit 3d8a9a1c35227c3f1b0bd132c9f0a80dbda07b65.
+
+Although removing the custom swap function simplified the code, this
+change is part of a broader migration to the generic min_heap API that
+introduced significant performance regressions in bcache.
+
+As reported by Robert, bcache now suffers from latency spikes, with P100
+(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes.
+These regressions degrade bcache's effectiveness as a low-latency cache
+layer and lead to frequent timeouts and application stalls in production
+environments.
+
+This revert is part of a series of changes to restore previous performance
+by undoing the min_heap transition.
+
+Link: https://lkml.kernel.org/r/20250614202353.1632957-1-visitorckw@gmail.com
+Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
+Link: https://lkml.kernel.org/r/20250614202353.1632957-2-visitorckw@gmail.com
+Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap")
+Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions")
+Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
+Reported-by: Robert Pang <robertpang@google.com>
+Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
+Acked-by: Coly Li <colyli@kernel.org>
+Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
+Cc: Kent Overstreet <kent.overstreet@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/bcache/alloc.c | 11 +++++++++--
+ drivers/md/bcache/bset.c | 14 +++++++++++---
+ drivers/md/bcache/extents.c | 10 +++++++++-
+ drivers/md/bcache/movinggc.c | 10 +++++++++-
+ 4 files changed, 38 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
+index 8998e61efa40..da50f6661bae 100644
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -189,16 +189,23 @@ static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args)
+ return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs);
+ }
+
++static inline void new_bucket_swap(void *l, void *r, void __always_unused *args)
++{
++ struct bucket **lhs = l, **rhs = r;
++
++ swap(*lhs, *rhs);
++}
++
+ static void invalidate_buckets_lru(struct cache *ca)
+ {
+ struct bucket *b;
+ const struct min_heap_callbacks bucket_max_cmp_callback = {
+ .less = new_bucket_max_cmp,
+- .swp = NULL,
++ .swp = new_bucket_swap,
+ };
+ const struct min_heap_callbacks bucket_min_cmp_callback = {
+ .less = new_bucket_min_cmp,
+- .swp = NULL,
++ .swp = new_bucket_swap,
+ };
+
+ ca->heap.nr = 0;
+diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
+index 68258a16e125..bd97d8626887 100644
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -1093,6 +1093,14 @@ static inline bool new_btree_iter_cmp(const void *l, const void *r, void __alway
+ return bkey_cmp(_l->k, _r->k) <= 0;
+ }
+
++static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
++{
++ struct btree_iter_set *_iter1 = iter1;
++ struct btree_iter_set *_iter2 = iter2;
++
++ swap(*_iter1, *_iter2);
++}
++
+ static inline bool btree_iter_end(struct btree_iter *iter)
+ {
+ return !iter->heap.nr;
+@@ -1103,7 +1111,7 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+ {
+ const struct min_heap_callbacks callbacks = {
+ .less = new_btree_iter_cmp,
+- .swp = NULL,
++ .swp = new_btree_iter_swap,
+ };
+
+ if (k != end)
+@@ -1149,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+ struct bkey *ret = NULL;
+ const struct min_heap_callbacks callbacks = {
+ .less = cmp,
+- .swp = NULL,
++ .swp = new_btree_iter_swap,
+ };
+
+ if (!btree_iter_end(iter)) {
+@@ -1223,7 +1231,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
+ : bch_ptr_invalid;
+ const struct min_heap_callbacks callbacks = {
+ .less = b->ops->sort_cmp,
+- .swp = NULL,
++ .swp = new_btree_iter_swap,
+ };
+
+ /* Heapify the iterator, using our comparison function */
+diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
+index 4b84fda1530a..a7221e5dbe81 100644
+--- a/drivers/md/bcache/extents.c
++++ b/drivers/md/bcache/extents.c
+@@ -266,12 +266,20 @@ static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_
+ return !(c ? c > 0 : _l->k < _r->k);
+ }
+
++static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
++{
++ struct btree_iter_set *_iter1 = iter1;
++ struct btree_iter_set *_iter2 = iter2;
++
++ swap(*_iter1, *_iter2);
++}
++
+ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+ struct bkey *tmp)
+ {
+ const struct min_heap_callbacks callbacks = {
+ .less = new_bch_extent_sort_cmp,
+- .swp = NULL,
++ .swp = new_btree_iter_swap,
+ };
+ while (iter->heap.nr > 1) {
+ struct btree_iter_set *top = iter->heap.data, *i = top + 1;
+diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
+index 45ca134cbf02..d6c73dd8eb2b 100644
+--- a/drivers/md/bcache/movinggc.c
++++ b/drivers/md/bcache/movinggc.c
+@@ -190,6 +190,14 @@ static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *a
+ return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r);
+ }
+
++static void new_bucket_swap(void *l, void *r, void __always_unused *args)
++{
++ struct bucket **_l = l;
++ struct bucket **_r = r;
++
++ swap(*_l, *_r);
++}
++
+ static unsigned int bucket_heap_top(struct cache *ca)
+ {
+ struct bucket *b;
+@@ -204,7 +212,7 @@ void bch_moving_gc(struct cache_set *c)
+ unsigned long sectors_to_move, reserve_sectors;
+ const struct min_heap_callbacks callbacks = {
+ .less = new_bucket_cmp,
+- .swp = NULL,
++ .swp = new_bucket_swap,
+ };
+
+ if (!c->copy_gc_enabled)
+--
+2.50.0
+
--- /dev/null
+From a35b29bdedb4d2ae3160d4d6684a6f1ecd9ca7c2 Mon Sep 17 00:00:00 2001
+From: Karan Tilak Kumar <kartilak@cisco.com>
+Date: Tue, 17 Jun 2025 17:34:28 -0700
+Subject: scsi: fnic: Fix crash in fnic_wq_cmpl_handler when FDMI times out
+
+From: Karan Tilak Kumar <kartilak@cisco.com>
+
+commit a35b29bdedb4d2ae3160d4d6684a6f1ecd9ca7c2 upstream.
+
+When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to
+send ABTS for each of them. On send completion, this causes an attempt to
+free the same frame twice, which leads to a crash.
+
+Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS
+logic accordingly.
+
+Tested by checking MDS for FDMI information.
+
+Tested by using instrumented driver to:
+
+ - Drop PLOGI response
+ - Drop RHBA response
+ - Drop RPA response
+ - Drop RHBA and RPA response
+ - Drop PLOGI response + ABTS response
+ - Drop RHBA response + ABTS response
+ - Drop RPA response + ABTS response
+ - Drop RHBA and RPA response + ABTS response for both of them
+
+Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI")
+Reviewed-by: Sesidhar Baddela <sebaddel@cisco.com>
+Reviewed-by: Arulprabhu Ponnusamy <arulponn@cisco.com>
+Reviewed-by: Gian Carlo Boffa <gcboffa@cisco.com>
+Tested-by: Arun Easi <aeasi@cisco.com>
+Co-developed-by: Arun Easi <aeasi@cisco.com>
+Signed-off-by: Arun Easi <aeasi@cisco.com>
+Tested-by: Karan Tilak Kumar <kartilak@cisco.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Karan Tilak Kumar <kartilak@cisco.com>
+Link: https://lore.kernel.org/r/20250618003431.6314-1-kartilak@cisco.com
+Reviewed-by: John Meneghini <jmeneghi@redhat.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/fnic/fdls_disc.c | 113 +++++++++++++++++++++++++++++++-----------
+ drivers/scsi/fnic/fnic.h | 2
+ drivers/scsi/fnic/fnic_fdls.h | 1
+ 3 files changed, 87 insertions(+), 29 deletions(-)
+
+--- a/drivers/scsi/fnic/fdls_disc.c
++++ b/drivers/scsi/fnic/fdls_disc.c
+@@ -763,47 +763,69 @@ static void fdls_send_fabric_abts(struct
+ iport->fabric.timer_pending = 1;
+ }
+
+-static void fdls_send_fdmi_abts(struct fnic_iport_s *iport)
++static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport,
++ uint16_t oxid)
+ {
+- uint8_t *frame;
++ struct fc_frame_header *pfdmi_abts;
+ uint8_t d_id[3];
++ uint8_t *frame;
+ struct fnic *fnic = iport->fnic;
+- struct fc_frame_header *pfabric_abts;
+- unsigned long fdmi_tov;
+- uint16_t oxid;
+- uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET +
+- sizeof(struct fc_frame_header);
+
+ frame = fdls_alloc_frame(iport);
+ if (frame == NULL) {
+ FNIC_FCS_DBG(KERN_ERR, fnic->host, fnic->fnic_num,
+ "Failed to allocate frame to send FDMI ABTS");
+- return;
++ return NULL;
+ }
+
+- pfabric_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET);
++ pfdmi_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET);
+ fdls_init_fabric_abts_frame(frame, iport);
+
+ hton24(d_id, FC_FID_MGMT_SERV);
+- FNIC_STD_SET_D_ID(*pfabric_abts, d_id);
++ FNIC_STD_SET_D_ID(*pfdmi_abts, d_id);
++ FNIC_STD_SET_OX_ID(*pfdmi_abts, oxid);
++
++ return frame;
++}
++
++static void fdls_send_fdmi_abts(struct fnic_iport_s *iport)
++{
++ uint8_t *frame;
++ unsigned long fdmi_tov;
++ uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET +
++ sizeof(struct fc_frame_header);
+
+ if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) {
+- oxid = iport->active_oxid_fdmi_plogi;
+- FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
++ frame = fdls_alloc_init_fdmi_abts_frame(iport,
++ iport->active_oxid_fdmi_plogi);
++ if (frame == NULL)
++ return;
++
+ fnic_send_fcoe_frame(iport, frame, frame_size);
+ } else {
+ if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) {
+- oxid = iport->active_oxid_fdmi_rhba;
+- FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
++ frame = fdls_alloc_init_fdmi_abts_frame(iport,
++ iport->active_oxid_fdmi_rhba);
++ if (frame == NULL)
++ return;
++
+ fnic_send_fcoe_frame(iport, frame, frame_size);
+ }
+ if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) {
+- oxid = iport->active_oxid_fdmi_rpa;
+- FNIC_STD_SET_OX_ID(*pfabric_abts, oxid);
++ frame = fdls_alloc_init_fdmi_abts_frame(iport,
++ iport->active_oxid_fdmi_rpa);
++ if (frame == NULL) {
++ if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING)
++ goto arm_timer;
++ else
++ return;
++ }
++
+ fnic_send_fcoe_frame(iport, frame, frame_size);
+ }
+ }
+
++arm_timer:
+ fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov);
+ mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov));
+ iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING;
+@@ -2244,6 +2266,21 @@ void fdls_fabric_timer_callback(struct t
+ spin_unlock_irqrestore(&fnic->fnic_lock, flags);
+ }
+
++void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport)
++{
++ struct fnic *fnic = iport->fnic;
++
++ iport->fabric.fdmi_pending = 0;
++ /* If max retries not exhausted, start over from fdmi plogi */
++ if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) {
++ iport->fabric.fdmi_retry++;
++ FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
++ "Retry FDMI PLOGI. FDMI retry: %d",
++ iport->fabric.fdmi_retry);
++ fdls_send_fdmi_plogi(iport);
++ }
++}
++
+ void fdls_fdmi_timer_callback(struct timer_list *t)
+ {
+ struct fnic_fdls_fabric_s *fabric = from_timer(fabric, t, fdmi_timer);
+@@ -2289,14 +2326,7 @@ void fdls_fdmi_timer_callback(struct tim
+ FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
+ "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending);
+
+- iport->fabric.fdmi_pending = 0;
+- /* If max retries not exhaused, start over from fdmi plogi */
+- if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) {
+- iport->fabric.fdmi_retry++;
+- FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
+- "retry fdmi timer %d", iport->fabric.fdmi_retry);
+- fdls_send_fdmi_plogi(iport);
+- }
++ fdls_fdmi_retry_plogi(iport);
+ FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
+ "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending);
+ spin_unlock_irqrestore(&fnic->fnic_lock, flags);
+@@ -3714,11 +3744,32 @@ static void fdls_process_fdmi_abts_rsp(s
+ switch (FNIC_FRAME_TYPE(oxid)) {
+ case FNIC_FRAME_TYPE_FDMI_PLOGI:
+ fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi);
++
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING;
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
+ break;
+ case FNIC_FRAME_TYPE_FDMI_RHBA:
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING;
++
++ /* If RPA is still pending, don't turn off ABORT PENDING.
++ * We count on the timer to detect the ABTS timeout and take
++ * corrective action.
++ */
++ if (!(iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING))
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
++
+ fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba);
+ break;
+ case FNIC_FRAME_TYPE_FDMI_RPA:
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING;
++
++ /* If RHBA is still pending, don't turn off ABORT PENDING.
++ * We count on the timer to detect the ABTS timeout and take
++ * corrective action.
++ */
++ if (!(iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING))
++ iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
++
+ fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa);
+ break;
+ default:
+@@ -3728,10 +3779,16 @@ static void fdls_process_fdmi_abts_rsp(s
+ break;
+ }
+
+- timer_delete_sync(&iport->fabric.fdmi_timer);
+- iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING;
+-
+- fdls_send_fdmi_plogi(iport);
++ /*
++ * Only if ABORT PENDING is off, delete the timer, and if no other
++ * operations are pending, retry FDMI.
++ * Otherwise, let the timer pop and take the appropriate action.
++ */
++ if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) {
++ timer_delete_sync(&iport->fabric.fdmi_timer);
++ if (!iport->fabric.fdmi_pending)
++ fdls_fdmi_retry_plogi(iport);
++ }
+ }
+
+ static void
+--- a/drivers/scsi/fnic/fnic.h
++++ b/drivers/scsi/fnic/fnic.h
+@@ -30,7 +30,7 @@
+
+ #define DRV_NAME "fnic"
+ #define DRV_DESCRIPTION "Cisco FCoE HBA Driver"
+-#define DRV_VERSION "1.8.0.0"
++#define DRV_VERSION "1.8.0.1"
+ #define PFX DRV_NAME ": "
+ #define DFX DRV_NAME "%d: "
+
+--- a/drivers/scsi/fnic/fnic_fdls.h
++++ b/drivers/scsi/fnic/fnic_fdls.h
+@@ -394,6 +394,7 @@ void fdls_send_tport_abts(struct fnic_ip
+ bool fdls_delete_tport(struct fnic_iport_s *iport,
+ struct fnic_tport_s *tport);
+ void fdls_fdmi_timer_callback(struct timer_list *t);
++void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport);
+
+ /* fnic_fcs.c */
+ void fnic_fdls_init(struct fnic *fnic, int usefip);
--- /dev/null
+From 74f46a0524f8d2f01dc7ca95bb5fc463a8603e72 Mon Sep 17 00:00:00 2001
+From: Karan Tilak Kumar <kartilak@cisco.com>
+Date: Tue, 17 Jun 2025 17:34:29 -0700
+Subject: scsi: fnic: Turn off FDMI ACTIVE flags on link down
+
+From: Karan Tilak Kumar <kartilak@cisco.com>
+
+commit 74f46a0524f8d2f01dc7ca95bb5fc463a8603e72 upstream.
+
+When the link goes down and comes up, FDMI requests are not sent out
+anymore.
+
+Fix bug by turning off FNIC_FDMI_ACTIVE when the link goes down.
+
+Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI")
+Reviewed-by: Sesidhar Baddela <sebaddel@cisco.com>
+Reviewed-by: Arulprabhu Ponnusamy <arulponn@cisco.com>
+Reviewed-by: Gian Carlo Boffa <gcboffa@cisco.com>
+Reviewed-by: Arun Easi <aeasi@cisco.com>
+Tested-by: Karan Tilak Kumar <kartilak@cisco.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Karan Tilak Kumar <kartilak@cisco.com>
+Link: https://lore.kernel.org/r/20250618003431.6314-2-kartilak@cisco.com
+Reviewed-by: John Meneghini <jmeneghi@redhat.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/fnic/fdls_disc.c | 9 ++++++---
+ drivers/scsi/fnic/fnic.h | 2 +-
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+--- a/drivers/scsi/fnic/fdls_disc.c
++++ b/drivers/scsi/fnic/fdls_disc.c
+@@ -5027,9 +5027,12 @@ void fnic_fdls_link_down(struct fnic_ipo
+ fdls_delete_tport(iport, tport);
+ }
+
+- if ((fnic_fdmi_support == 1) && (iport->fabric.fdmi_pending > 0)) {
+- timer_delete_sync(&iport->fabric.fdmi_timer);
+- iport->fabric.fdmi_pending = 0;
++ if (fnic_fdmi_support == 1) {
++ if (iport->fabric.fdmi_pending > 0) {
++ timer_delete_sync(&iport->fabric.fdmi_timer);
++ iport->fabric.fdmi_pending = 0;
++ }
++ iport->flags &= ~FNIC_FDMI_ACTIVE;
+ }
+
+ FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num,
+--- a/drivers/scsi/fnic/fnic.h
++++ b/drivers/scsi/fnic/fnic.h
+@@ -30,7 +30,7 @@
+
+ #define DRV_NAME "fnic"
+ #define DRV_DESCRIPTION "Cisco FCoE HBA Driver"
+-#define DRV_VERSION "1.8.0.1"
++#define DRV_VERSION "1.8.0.2"
+ #define PFX DRV_NAME ": "
+ #define DFX DRV_NAME "%d: "
+
--- /dev/null
+From 752eb816b55adb0673727ba0ed96609a17895654 Mon Sep 17 00:00:00 2001
+From: Chen Yu <yu.c.chen@intel.com>
+Date: Wed, 4 Jun 2025 12:25:56 +0800
+Subject: scsi: megaraid_sas: Fix invalid node index
+
+From: Chen Yu <yu.c.chen@intel.com>
+
+commit 752eb816b55adb0673727ba0ed96609a17895654 upstream.
+
+On a system with DRAM interleave enabled, an out-of-bounds access is
+detected:
+
+megaraid_sas 0000:3f:00.0: requested/available msix 128/128 poll_queue 0
+------------[ cut here ]------------
+UBSAN: array-index-out-of-bounds in ./arch/x86/include/asm/topology.h:72:28
+index -1 is out of range for type 'cpumask *[1024]'
+dump_stack_lvl+0x5d/0x80
+ubsan_epilogue+0x5/0x2b
+__ubsan_handle_out_of_bounds.cold+0x46/0x4b
+megasas_alloc_irq_vectors+0x149/0x190 [megaraid_sas]
+megasas_probe_one.cold+0xa4d/0x189c [megaraid_sas]
+local_pci_probe+0x42/0x90
+pci_device_probe+0xdc/0x290
+really_probe+0xdb/0x340
+__driver_probe_device+0x78/0x110
+driver_probe_device+0x1f/0xa0
+__driver_attach+0xba/0x1c0
+bus_for_each_dev+0x8b/0xe0
+bus_add_driver+0x142/0x220
+driver_register+0x72/0xd0
+megasas_init+0xdf/0xff0 [megaraid_sas]
+do_one_initcall+0x57/0x310
+do_init_module+0x90/0x250
+init_module_from_file+0x85/0xc0
+idempotent_init_module+0x114/0x310
+__x64_sys_finit_module+0x65/0xc0
+do_syscall_64+0x82/0x170
+entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+Fix it by falling back to node 0 when dev_to_node() returns NUMA_NO_NODE.
+
+Signed-off-by: Chen Yu <yu.c.chen@intel.com>
+Link: https://lore.kernel.org/r/20250604042556.3731059-1-yu.c.chen@intel.com
+Fixes: 8049da6f3943 ("scsi: megaraid_sas: Use irq_set_affinity_and_hint()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/megaraid/megaraid_sas_base.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/scsi/megaraid/megaraid_sas_base.c
++++ b/drivers/scsi/megaraid/megaraid_sas_base.c
+@@ -5910,7 +5910,11 @@ megasas_set_high_iops_queue_affinity_and
+ const struct cpumask *mask;
+
+ if (instance->perf_mode == MR_BALANCED_PERF_MODE) {
+- mask = cpumask_of_node(dev_to_node(&instance->pdev->dev));
++ int nid = dev_to_node(&instance->pdev->dev);
++
++ if (nid == NUMA_NO_NODE)
++ nid = 0;
++ mask = cpumask_of_node(nid);
+
+ for (i = 0; i < instance->low_latency_index_start; i++) {
+ irq = pci_irq_vector(instance->pdev, i);
--- /dev/null
+From 2e083cd802294693a5414e4557a183dd7e442e71 Mon Sep 17 00:00:00 2001
+From: anvithdosapati <anvithdosapati@google.com>
+Date: Mon, 16 Jun 2025 08:57:34 +0000
+Subject: scsi: ufs: core: Fix clk scaling to be conditional in reset and restore
+
+From: anvithdosapati <anvithdosapati@google.com>
+
+commit 2e083cd802294693a5414e4557a183dd7e442e71 upstream.
+
+In ufshcd_host_reset_and_restore(), scale up clocks only when clock
+scaling is supported. Without this change, CPU latency is unconditionally
+voted for 0 (via ufshcd_pm_qos_update()) during resume.
+
+Signed-off-by: anvithdosapati <anvithdosapati@google.com>
+Link: https://lore.kernel.org/r/20250616085734.2133581-1-anvithdosapati@google.com
+Fixes: a3cd5ec55f6c ("scsi: ufs: add load based scaling of UFS gear")
+Cc: stable@vger.kernel.org
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/ufs/core/ufshcd.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/ufs/core/ufshcd.c
++++ b/drivers/ufs/core/ufshcd.c
+@@ -7753,7 +7753,8 @@ static int ufshcd_host_reset_and_restore
+ hba->silence_err_logs = false;
+
+ /* scale up clocks to max frequency before full reinitialization */
+- ufshcd_scale_clks(hba, ULONG_MAX, true);
++ if (ufshcd_is_clkscaling_supported(hba))
++ ufshcd_scale_clks(hba, ULONG_MAX, true);
+
+ err = ufshcd_hba_enable(hba);
+
--- /dev/null
+From fde46f60f6c5138ee422087addbc5bf5b4968bf1 Mon Sep 17 00:00:00 2001
+From: Stephen Smalley <stephen.smalley.work@gmail.com>
+Date: Tue, 10 Jun 2025 15:48:27 -0400
+Subject: selinux: change security_compute_sid to return the ssid or tsid on match
+
+From: Stephen Smalley <stephen.smalley.work@gmail.com>
+
+commit fde46f60f6c5138ee422087addbc5bf5b4968bf1 upstream.
+
+If the end result of a security_compute_sid() computation matches the
+ssid or tsid, return that SID rather than looking it up again. This
+avoids the problem of multiple initial SIDs that map to the same
+context.
+
+Cc: stable@vger.kernel.org
+Reported-by: Guido Trentalancia <guido@trentalancia.com>
+Fixes: ae254858ce07 ("selinux: introduce an initial SID for early boot processes")
+Signed-off-by: Stephen Smalley <stephen.smalley.work@gmail.com>
+Tested-by: Guido Trentalancia <guido@trentalancia.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/selinux/ss/services.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/security/selinux/ss/services.c
++++ b/security/selinux/ss/services.c
+@@ -1909,11 +1909,17 @@ retry:
+ goto out_unlock;
+ }
+ /* Obtain the sid for the context. */
+- rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
+- if (rc == -ESTALE) {
+- rcu_read_unlock();
+- context_destroy(&newcontext);
+- goto retry;
++ if (context_equal(scontext, &newcontext))
++ *out_sid = ssid;
++ else if (context_equal(tcontext, &newcontext))
++ *out_sid = tsid;
++ else {
++ rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
++ if (rc == -ESTALE) {
++ rcu_read_unlock();
++ context_destroy(&newcontext);
++ goto retry;
++ }
+ }
+ out_unlock:
+ rcu_read_unlock();
--- /dev/null
+From d36f0e9a0002f04f4d6dd9be908d58fe5bd3a279 Mon Sep 17 00:00:00 2001
+From: Aidan Stewart <astewart@tektelic.com>
+Date: Tue, 17 Jun 2025 10:48:19 -0600
+Subject: serial: core: restore of_node information in sysfs
+
+From: Aidan Stewart <astewart@tektelic.com>
+
+commit d36f0e9a0002f04f4d6dd9be908d58fe5bd3a279 upstream.
+
+Since v6.8-rc1, the of_node symlink under tty devices has been
+missing. This breaks any udev rules relying on this information.
+
+Link the of_node information in the serial controller device with the
+parent defined in the device tree. This will also apply to the serial
+device which takes the serial controller as a parent device.
+
+Fixes: b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device")
+Cc: stable@vger.kernel.org
+Signed-off-by: Aidan Stewart <astewart@tektelic.com>
+Link: https://lore.kernel.org/r/20250617164819.13912-1-astewart@tektelic.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/serial_base_bus.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/tty/serial/serial_base_bus.c
++++ b/drivers/tty/serial/serial_base_bus.c
+@@ -72,6 +72,7 @@ static int serial_base_device_init(struc
+ dev->parent = parent_dev;
+ dev->bus = &serial_base_bus_type;
+ dev->release = release;
++ device_set_of_node_from_dev(dev, parent_dev);
+
+ if (!serial_base_initialized) {
+ dev_dbg(port->dev, "uart_add_one_port() called before arch_initcall()?\n");
--- /dev/null
+From f23c52aafb1675ab1d1f46914556d8e29cbbf7b3 Mon Sep 17 00:00:00 2001
+From: Fabio Estevam <festevam@gmail.com>
+Date: Thu, 19 Jun 2025 08:46:17 -0300
+Subject: serial: imx: Restore original RXTL for console to fix data loss
+
+From: Fabio Estevam <festevam@gmail.com>
+
+commit f23c52aafb1675ab1d1f46914556d8e29cbbf7b3 upstream.
+
+Commit 7a637784d517 ("serial: imx: reduce RX interrupt frequency")
+introduced a regression on the i.MX6UL EVK board. The issue can be
+reproduced with the following steps:
+
+- Open vi on the board.
+- Paste a text file (~150 characters).
+- Save the file, then repeat the process.
+- Compare the sha256sum of the saved files.
+
+The checksums do not match due to missing characters or entire lines.
+
+Fix this by restoring the RXTL value to 1 when the UART is used as a
+console.
+
+This ensures timely RX interrupts and reliable data reception in console
+mode.
+
+With this change, pasted content is saved correctly, and checksums are
+always consistent.
+
+Cc: stable <stable@kernel.org>
+Fixes: 7a637784d517 ("serial: imx: reduce RX interrupt frequency")
+Signed-off-by: Fabio Estevam <festevam@gmail.com>
+Reviewed-by: Stefan Wahren <wahrenst@gmx.net>
+Link: https://lore.kernel.org/r/20250619114617.2791939-1-festevam@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/imx.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/tty/serial/imx.c
++++ b/drivers/tty/serial/imx.c
+@@ -235,6 +235,7 @@ struct imx_port {
+ enum imx_tx_state tx_state;
+ struct hrtimer trigger_start_tx;
+ struct hrtimer trigger_stop_tx;
++ unsigned int rxtl;
+ };
+
+ struct imx_port_ucrs {
+@@ -1339,6 +1340,7 @@ static void imx_uart_clear_rx_errors(str
+
+ #define TXTL_DEFAULT 8
+ #define RXTL_DEFAULT 8 /* 8 characters or aging timer */
++#define RXTL_CONSOLE_DEFAULT 1
+ #define TXTL_DMA 8 /* DMA burst setting */
+ #define RXTL_DMA 9 /* DMA burst setting */
+
+@@ -1457,7 +1459,7 @@ static void imx_uart_disable_dma(struct
+ ucr1 &= ~(UCR1_RXDMAEN | UCR1_TXDMAEN | UCR1_ATDMAEN);
+ imx_uart_writel(sport, ucr1, UCR1);
+
+- imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT);
++ imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl);
+
+ sport->dma_is_enabled = 0;
+ }
+@@ -1482,7 +1484,12 @@ static int imx_uart_startup(struct uart_
+ return retval;
+ }
+
+- imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT);
++ if (uart_console(&sport->port))
++ sport->rxtl = RXTL_CONSOLE_DEFAULT;
++ else
++ sport->rxtl = RXTL_DEFAULT;
++
++ imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl);
+
+ /* disable the DREN bit (Data Ready interrupt enable) before
+ * requesting IRQs
+@@ -1948,7 +1955,7 @@ static int imx_uart_poll_init(struct uar
+ if (retval)
+ clk_disable_unprepare(sport->clk_ipg);
+
+- imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT);
++ imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl);
+
+ uart_port_lock_irqsave(&sport->port, &flags);
+
+@@ -2040,7 +2047,7 @@ static int imx_uart_rs485_config(struct
+ /* If the receiver trigger is 0, set it to a default value */
+ ufcr = imx_uart_readl(sport, UFCR);
+ if ((ufcr & UFCR_RXTL_MASK) == 0)
+- imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT);
++ imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl);
+ imx_uart_start_rx(port);
+ }
+
+@@ -2302,7 +2309,7 @@ imx_uart_console_setup(struct console *c
+ else
+ imx_uart_console_get_options(sport, &baud, &parity, &bits);
+
+- imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT);
++ imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl);
+
+ retval = uart_set_options(&sport->port, co, baud, parity, bits, flow);
+
drm-xe-guc-explicitly-exit-ct-safe-mode-on-unwind.patch
drm-xe-process-deferred-ggtt-node-removals-on-device.patch
smb-client-fix-potential-deadlock-when-reconnecting-.patch
+edac-amd64-fix-size-calculation-for-non-power-of-two-dimms.patch
+x86-traps-initialize-dr6-by-writing-its-architectural-reset-value.patch
+staging-rtl8723bs-avoid-memset-in-aes_cipher-and-aes_decipher.patch
+dt-bindings-serial-8250-make-clocks-and-clock-frequency-exclusive.patch
+serial-core-restore-of_node-information-in-sysfs.patch
+serial-imx-restore-original-rxtl-for-console-to-fix-data-loss.patch
+bluetooth-l2cap-fix-l2cap-mtu-negotiation.patch
+dm-raid-fix-variable-in-journal-device-check.patch
+bcache-remove-unnecessary-select-min_heap.patch
+btrfs-fix-a-race-between-renames-and-directory-logging.patch
+btrfs-update-superblock-s-device-bytes_used-when-dropping-chunk.patch
+btrfs-fix-invalid-inode-pointer-dereferences-during-log-replay.patch
+revert-bcache-update-min_heap_callbacks-to-use-default-builtin-swap.patch
+revert-bcache-remove-heap-related-macros-and-switch-to-generic-min_heap.patch
+selinux-change-security_compute_sid-to-return-the-ssid-or-tsid-on-match.patch
+spi-spi-cadence-quadspi-fix-pm-runtime-unbalance.patch
+net-libwx-fix-the-creation-of-page_pool.patch
+maple_tree-fix-ma_state_prealloc-flag-in-mas_preallocate.patch
+mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap-cache.patch
+mm-shmem-swap-fix-softlockup-with-mthp-swapin.patch
+mm-gup-revert-mm-gup-fix-infinite-loop-within-__get_longterm_locked.patch
+f2fs-fix-to-zero-post-eof-page.patch
+hid-appletb-kbd-fix-appletb_backlight-backlight-device-reference-counting.patch
+hid-lenovo-restrict-f7-9-11-mode-to-compact-keyboards-only.patch
+hid-wacom-fix-memory-leak-on-kobject-creation-failure.patch
+hid-wacom-fix-memory-leak-on-sysfs-attribute-creation-failure.patch
+hid-wacom-fix-kobject-reference-count-leak.patch
+scsi-megaraid_sas-fix-invalid-node-index.patch
+scsi-ufs-core-fix-clk-scaling-to-be-conditional-in-reset-and-restore.patch
+scsi-fnic-fix-crash-in-fnic_wq_cmpl_handler-when-fdmi-times-out.patch
+scsi-fnic-turn-off-fdmi-active-flags-on-link-down.patch
+drm-ast-fix-comment-on-modeset-lock.patch
+drm-cirrus-qemu-fix-pitch-programming.patch
+drm-etnaviv-protect-the-scheduler-s-pending-list-with-its-lock.patch
+drm-panel-simple-tianma-tm070jdhg34-00-add-delays.patch
+drm-simpledrm-do-not-upcast-in-release-helpers.patch
+drm-tegra-assign-plane-type-before-registration.patch
+drm-tegra-fix-a-possible-null-pointer-dereference.patch
+drm-udl-unregister-device-before-cleaning-up-on-disconnect.patch
+drm-msm-gpu-fix-crash-when-throttling-gpu-immediately-during-boot.patch
+drm-amdkfd-fix-race-in-gws-queue-scheduling.patch
--- /dev/null
+From b07f349d1864abe29436f45e3047da2bdd476462 Mon Sep 17 00:00:00 2001
+From: Khairul Anuar Romli <khairul.anuar.romli@altera.com>
+Date: Mon, 16 Jun 2025 09:13:53 +0800
+Subject: spi: spi-cadence-quadspi: Fix pm runtime unbalance
+
+From: Khairul Anuar Romli <khairul.anuar.romli@altera.com>
+
+commit b07f349d1864abe29436f45e3047da2bdd476462 upstream.
+
+Having pm_runtime_put_sync() in the remove function causes a PM usage-count
+underflow during the remove operation, because pm_runtime_get_sync() is not
+called anywhere in that path. Ensure that calls to
+pm_runtime_enable()/pm_runtime_disable() and
+pm_runtime_get_sync()/pm_runtime_put_sync() match.
+
+echo 108d2000.spi > /sys/bus/platform/drivers/cadence-qspi/unbind
+[ 49.644256] Deleting MTD partitions on "108d2000.spi.0":
+[ 49.649575] Deleting u-boot MTD partition
+[ 49.684087] Deleting root MTD partition
+[ 49.724188] cadence-qspi 108d2000.spi: Runtime PM usage count underflow!
+
+Continuous bind/unbind will result in an "Unbalanced pm_runtime_enable" error.
+Subsequent unbind attempts will return a "No such device" error, while bind
+attempts will return a "Resource temporarily unavailable" error.
+
+[ 47.592434] cadence-qspi 108d2000.spi: Runtime PM usage count underflow!
+[ 49.592233] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128)
+[ 53.232309] cadence-qspi 108d2000.spi: Runtime PM usage count underflow!
+[ 55.828550] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128)
+[ 57.940627] cadence-qspi 108d2000.spi: Runtime PM usage count underflow!
+[ 59.912490] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128)
+[ 61.876243] cadence-qspi 108d2000.spi: Runtime PM usage count underflow!
+[ 61.883000] platform 108d2000.spi: Unbalanced pm_runtime_enable!
+[ 532.012270] cadence-qspi 108d2000.spi: probe with driver cadence-qspi failed1
+
+Also, change clk_disable_unprepare() to clk_disable() since continuous
+bind and unbind operations will trigger a warning indicating that the clock is
+already unprepared.
+
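+As an editorial illustration only (not part of the upstream patch), the
+pairing rule being restored looks roughly like this, with hypothetical
+example_probe()/example_remove() helpers:
+
+	#include <linux/device.h>
+	#include <linux/pm_runtime.h>
+
+	static int example_probe(struct device *dev)
+	{
+		pm_runtime_enable(dev);		/* paired with pm_runtime_disable() */
+		return 0;
+	}
+
+	static void example_remove(struct device *dev)
+	{
+		pm_runtime_get_sync(dev);	/* hold a usage reference while quiescing */
+		/* ... quiesce the controller ... */
+		pm_runtime_put_sync(dev);	/* paired with the get above */
+		pm_runtime_disable(dev);	/* paired with the enable in probe */
+	}
+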
+Fixes: 4892b374c9b7 ("mtd: spi-nor: cadence-quadspi: Add runtime PM support")
+cc: stable@vger.kernel.org # 6.6+
+Signed-off-by: Khairul Anuar Romli <khairul.anuar.romli@altera.com>
+Reviewed-by: Matthew Gerlach <matthew.gerlach@altera.com>
+Link: https://patch.msgid.link/4e7a4b8aba300e629b45a04f90bddf665fbdb335.1749601877.git.khairul.anuar.romli@altera.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/spi/spi-cadence-quadspi.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/drivers/spi/spi-cadence-quadspi.c
++++ b/drivers/spi/spi-cadence-quadspi.c
+@@ -1958,10 +1958,10 @@ static int cqspi_probe(struct platform_d
+ goto probe_setup_failed;
+ }
+
+- ret = devm_pm_runtime_enable(dev);
+- if (ret) {
+- if (cqspi->rx_chan)
+- dma_release_channel(cqspi->rx_chan);
++ pm_runtime_enable(dev);
++
++ if (cqspi->rx_chan) {
++ dma_release_channel(cqspi->rx_chan);
+ goto probe_setup_failed;
+ }
+
+@@ -1981,6 +1981,7 @@ static int cqspi_probe(struct platform_d
+ return 0;
+ probe_setup_failed:
+ cqspi_controller_enable(cqspi, 0);
++ pm_runtime_disable(dev);
+ probe_reset_failed:
+ if (cqspi->is_jh7110)
+ cqspi_jh7110_disable_clk(pdev, cqspi);
+@@ -1999,7 +2000,8 @@ static void cqspi_remove(struct platform
+ if (cqspi->rx_chan)
+ dma_release_channel(cqspi->rx_chan);
+
+- clk_disable_unprepare(cqspi->clk);
++ if (pm_runtime_get_sync(&pdev->dev) >= 0)
++ clk_disable(cqspi->clk);
+
+ if (cqspi->is_jh7110)
+ cqspi_jh7110_disable_clk(pdev, cqspi);
--- /dev/null
+From a55bc4ffc06d8c965a7d6f0a01ed0ed41380df28 Mon Sep 17 00:00:00 2001
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Mon, 9 Jun 2025 14:13:14 -0700
+Subject: staging: rtl8723bs: Avoid memset() in aes_cipher() and aes_decipher()
+
+From: Nathan Chancellor <nathan@kernel.org>
+
+commit a55bc4ffc06d8c965a7d6f0a01ed0ed41380df28 upstream.
+
+After commit 6f110a5e4f99 ("Disable SLUB_TINY for build testing"), which
+causes CONFIG_KASAN to be enabled in allmodconfig again, arm64
+allmodconfig builds with older versions of clang (15 through 17) show an
+instance of -Wframe-larger-than (which breaks the build with
+CONFIG_WERROR=y):
+
+ drivers/staging/rtl8723bs/core/rtw_security.c:1287:5: error: stack frame size (2208) exceeds limit (2048) in 'rtw_aes_decrypt' [-Werror,-Wframe-larger-than]
+ 1287 | u32 rtw_aes_decrypt(struct adapter *padapter, u8 *precvframe)
+ | ^
+
+This comes from aes_decipher() being inlined in rtw_aes_decrypt().
+Running the same build with CONFIG_FRAME_WARN=128 shows aes_cipher()
+also uses a decent amount of stack, just under the limit of 2048:
+
+ drivers/staging/rtl8723bs/core/rtw_security.c:864:19: warning: stack frame size (1952) exceeds limit (128) in 'aes_cipher' [-Wframe-larger-than]
+ 864 | static signed int aes_cipher(u8 *key, uint hdrlen,
+ | ^
+
+-Rpass-analysis=stack-frame-layout only shows one large structure on the
+stack, which is the ctx variable inlined from aes128k128d(). A good
+number of the other variables come from the additional checks of
+fortified string routines, which are present in memset(), which both
+aes_cipher() and aes_decipher() use to initialize some temporary
+buffers. In this case, since the size is known at compile time, these
+additional checks should not result in any code generation changes but
+allmodconfig has several sanitizers enabled, which may make it harder
+for the compiler to eliminate the compile time checks and the variables
+that come about from them.
+
+The memset() calls are just initializing these buffers to zero, so use
+'= {}' instead, which is used all over the kernel and does the exact
+same thing as memset() without the fortify checks, which drops the stack
+usage of these functions by a few hundred bytes.
+
+ drivers/staging/rtl8723bs/core/rtw_security.c:864:19: warning: stack frame size (1584) exceeds limit (128) in 'aes_cipher' [-Wframe-larger-than]
+ 864 | static signed int aes_cipher(u8 *key, uint hdrlen,
+ | ^
+ drivers/staging/rtl8723bs/core/rtw_security.c:1271:5: warning: stack frame size (1456) exceeds limit (128) in 'rtw_aes_decrypt' [-Wframe-larger-than]
+ 1271 | u32 rtw_aes_decrypt(struct adapter *padapter, u8 *precvframe)
+ | ^
+
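+A minimal standalone sketch of the substitution (editorial addition, not
+from the upstream commit; the u8 typedef stands in for the kernel's own,
+and '= {}' is the GNU/C23 empty-initializer form the kernel uses):
+
+	#include <stdint.h>
+	#include <string.h>
+
+	typedef uint8_t u8;
+
+	u8 zero_buffers(void)
+	{
+		u8 with_memset[16];
+		u8 with_initializer[16] = {};	/* zeroed at its declaration */
+
+		/* in kernel builds this call goes through the fortified memset() */
+		memset(with_memset, 0, sizeof(with_memset));
+
+		return with_memset[0] | with_initializer[0];	/* both are 0 */
+	}
+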
+Cc: stable@vger.kernel.org
+Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver")
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
+Link: https://lore.kernel.org/r/20250609-rtl8723bs-fix-clang-arm64-wflt-v1-1-e2accba43def@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/staging/rtl8723bs/core/rtw_security.c | 44 ++++++++------------------
+ 1 file changed, 14 insertions(+), 30 deletions(-)
+
+--- a/drivers/staging/rtl8723bs/core/rtw_security.c
++++ b/drivers/staging/rtl8723bs/core/rtw_security.c
+@@ -868,29 +868,21 @@ static signed int aes_cipher(u8 *key, ui
+ num_blocks, payload_index;
+
+ u8 pn_vector[6];
+- u8 mic_iv[16];
+- u8 mic_header1[16];
+- u8 mic_header2[16];
+- u8 ctr_preload[16];
++ u8 mic_iv[16] = {};
++ u8 mic_header1[16] = {};
++ u8 mic_header2[16] = {};
++ u8 ctr_preload[16] = {};
+
+ /* Intermediate Buffers */
+- u8 chain_buffer[16];
+- u8 aes_out[16];
+- u8 padded_buffer[16];
++ u8 chain_buffer[16] = {};
++ u8 aes_out[16] = {};
++ u8 padded_buffer[16] = {};
+ u8 mic[8];
+ uint frtype = GetFrameType(pframe);
+ uint frsubtype = GetFrameSubType(pframe);
+
+ frsubtype = frsubtype>>4;
+
+- memset((void *)mic_iv, 0, 16);
+- memset((void *)mic_header1, 0, 16);
+- memset((void *)mic_header2, 0, 16);
+- memset((void *)ctr_preload, 0, 16);
+- memset((void *)chain_buffer, 0, 16);
+- memset((void *)aes_out, 0, 16);
+- memset((void *)padded_buffer, 0, 16);
+-
+ if ((hdrlen == WLAN_HDR_A3_LEN) || (hdrlen == WLAN_HDR_A3_QOS_LEN))
+ a4_exists = 0;
+ else
+@@ -1080,15 +1072,15 @@ static signed int aes_decipher(u8 *key,
+ num_blocks, payload_index;
+ signed int res = _SUCCESS;
+ u8 pn_vector[6];
+- u8 mic_iv[16];
+- u8 mic_header1[16];
+- u8 mic_header2[16];
+- u8 ctr_preload[16];
++ u8 mic_iv[16] = {};
++ u8 mic_header1[16] = {};
++ u8 mic_header2[16] = {};
++ u8 ctr_preload[16] = {};
+
+ /* Intermediate Buffers */
+- u8 chain_buffer[16];
+- u8 aes_out[16];
+- u8 padded_buffer[16];
++ u8 chain_buffer[16] = {};
++ u8 aes_out[16] = {};
++ u8 padded_buffer[16] = {};
+ u8 mic[8];
+
+ uint frtype = GetFrameType(pframe);
+@@ -1096,14 +1088,6 @@ static signed int aes_decipher(u8 *key,
+
+ frsubtype = frsubtype>>4;
+
+- memset((void *)mic_iv, 0, 16);
+- memset((void *)mic_header1, 0, 16);
+- memset((void *)mic_header2, 0, 16);
+- memset((void *)ctr_preload, 0, 16);
+- memset((void *)chain_buffer, 0, 16);
+- memset((void *)aes_out, 0, 16);
+- memset((void *)padded_buffer, 0, 16);
+-
+ /* start to decrypt the payload */
+
+ num_blocks = (plen-8) / 16; /* plen including LLC, payload_length and mic) */
--- /dev/null
+From 5f465c148c61e876b6d6eacd8e8e365f2d47758f Mon Sep 17 00:00:00 2001
+From: "Xin Li (Intel)" <xin@zytor.com>
+Date: Fri, 20 Jun 2025 16:15:03 -0700
+Subject: x86/traps: Initialize DR6 by writing its architectural reset value
+
+From: Xin Li (Intel) <xin@zytor.com>
+
+commit 5f465c148c61e876b6d6eacd8e8e365f2d47758f upstream.
+
+Initialize DR6 by writing its architectural reset value to avoid
+incorrectly zeroing DR6 to clear DR6.BLD at boot time, which leads
+to a false bus lock detected warning.
+
+The Intel SDM says:
+
+ 1) Certain debug exceptions may clear bits 0-3 of DR6.
+
+ 2) BLD induced #DB clears DR6.BLD and any other debug exception
+ doesn't modify DR6.BLD.
+
+ 3) RTM induced #DB clears DR6.RTM and any other debug exception
+ sets DR6.RTM.
+
+ To avoid confusion in identifying debug exceptions, debug handlers
+ should set DR6.BLD and DR6.RTM, and clear other DR6 bits before
+ returning.
+
+The DR6 architectural reset value 0xFFFF0FF0, already defined as
+macro DR6_RESERVED, satisfies these requirements, so just use it to
+reinitialize DR6 whenever needed.
+
+Since clear_all_debug_regs() no longer zeros all debug registers,
+rename it to initialize_debug_regs() to better reflect its current
+behavior.
+
+Since debug_read_clear_dr6() no longer clears DR6, rename it to
+debug_read_reset_dr6() to better reflect its current behavior.
+
+Fixes: ebb1064e7c2e9 ("x86/traps: Handle #DB for bus lock")
+Reported-by: Sohil Mehta <sohil.mehta@intel.com>
+Suggested-by: H. Peter Anvin (Intel) <hpa@zytor.com>
+Signed-off-by: Xin Li (Intel) <xin@zytor.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Reviewed-by: H. Peter Anvin (Intel) <hpa@zytor.com>
+Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Tested-by: Sohil Mehta <sohil.mehta@intel.com>
+Link: https://lore.kernel.org/lkml/06e68373-a92b-472e-8fd9-ba548119770c@intel.com/
+Cc:stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20250620231504.2676902-2-xin%40zytor.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/uapi/asm/debugreg.h | 21 ++++++++++++++++++++-
+ arch/x86/kernel/cpu/common.c | 24 ++++++++++--------------
+ arch/x86/kernel/traps.c | 34 +++++++++++++++++++++-------------
+ 3 files changed, 51 insertions(+), 28 deletions(-)
+
+--- a/arch/x86/include/uapi/asm/debugreg.h
++++ b/arch/x86/include/uapi/asm/debugreg.h
+@@ -15,7 +15,26 @@
+ which debugging register was responsible for the trap. The other bits
+ are either reserved or not of interest to us. */
+
+-/* Define reserved bits in DR6 which are always set to 1 */
++/*
++ * Define bits in DR6 which are set to 1 by default.
++ *
++ * This is also the DR6 architectural value following Power-up, Reset or INIT.
++ *
++ * Note, with the introduction of Bus Lock Detection (BLD) and Restricted
++ * Transactional Memory (RTM), the DR6 register has been modified:
++ *
++ * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports
++ * Bus Lock Detection. The assertion of a bus lock could clear it.
++ *
++ * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports
++ * restricted transactional memory. #DB occurred inside an RTM region
++ * could clear it.
++ *
++ * Apparently, DR6.BLD and DR6.RTM are active low bits.
++ *
++ * As a result, DR6_RESERVED is an incorrect name now, but it is kept for
++ * compatibility.
++ */
+ #define DR6_RESERVED (0xFFFF0FF0)
+
+ #define DR_TRAP0 (0x1) /* db0 */
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -2205,20 +2205,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard)
+ #endif
+ #endif
+
+-/*
+- * Clear all 6 debug registers:
+- */
+-static void clear_all_debug_regs(void)
++static void initialize_debug_regs(void)
+ {
+- int i;
+-
+- for (i = 0; i < 8; i++) {
+- /* Ignore db4, db5 */
+- if ((i == 4) || (i == 5))
+- continue;
+-
+- set_debugreg(0, i);
+- }
++ /* Control register first -- to make sure everything is disabled. */
++ set_debugreg(0, 7);
++ set_debugreg(DR6_RESERVED, 6);
++ /* dr5 and dr4 don't exist */
++ set_debugreg(0, 3);
++ set_debugreg(0, 2);
++ set_debugreg(0, 1);
++ set_debugreg(0, 0);
+ }
+
+ #ifdef CONFIG_KGDB
+@@ -2379,7 +2375,7 @@ void cpu_init(void)
+
+ load_mm_ldt(&init_mm);
+
+- clear_all_debug_regs();
++ initialize_debug_regs();
+ dbg_restore_debug_regs();
+
+ doublefault_init_cpu_tss();
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -1021,24 +1021,32 @@ static bool is_sysenter_singlestep(struc
+ #endif
+ }
+
+-static __always_inline unsigned long debug_read_clear_dr6(void)
++static __always_inline unsigned long debug_read_reset_dr6(void)
+ {
+ unsigned long dr6;
+
++ get_debugreg(dr6, 6);
++ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
++
+ /*
+ * The Intel SDM says:
+ *
+- * Certain debug exceptions may clear bits 0-3. The remaining
+- * contents of the DR6 register are never cleared by the
+- * processor. To avoid confusion in identifying debug
+- * exceptions, debug handlers should clear the register before
+- * returning to the interrupted task.
++ * Certain debug exceptions may clear bits 0-3 of DR6.
++ *
++ * BLD induced #DB clears DR6.BLD and any other debug
++ * exception doesn't modify DR6.BLD.
+ *
+- * Keep it simple: clear DR6 immediately.
++ * RTM induced #DB clears DR6.RTM and any other debug
++ * exception sets DR6.RTM.
++ *
++ * To avoid confusion in identifying debug exceptions,
++ * debug handlers should set DR6.BLD and DR6.RTM, and
++ * clear other DR6 bits before returning.
++ *
++ * Keep it simple: write DR6 with its architectural reset
++ * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately.
+ */
+- get_debugreg(dr6, 6);
+ set_debugreg(DR6_RESERVED, 6);
+- dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
+
+ return dr6;
+ }
+@@ -1238,13 +1246,13 @@ out:
+ /* IST stack entry */
+ DEFINE_IDTENTRY_DEBUG(exc_debug)
+ {
+- exc_debug_kernel(regs, debug_read_clear_dr6());
++ exc_debug_kernel(regs, debug_read_reset_dr6());
+ }
+
+ /* User entry, runs on regular task stack */
+ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
+ {
+- exc_debug_user(regs, debug_read_clear_dr6());
++ exc_debug_user(regs, debug_read_reset_dr6());
+ }
+
+ #ifdef CONFIG_X86_FRED
+@@ -1263,7 +1271,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
+ {
+ /*
+ * FRED #DB stores DR6 on the stack in the format which
+- * debug_read_clear_dr6() returns for the IDT entry points.
++ * debug_read_reset_dr6() returns for the IDT entry points.
+ */
+ unsigned long dr6 = fred_event_data(regs);
+
+@@ -1278,7 +1286,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
+ /* 32 bit does not have separate entry points. */
+ DEFINE_IDTENTRY_RAW(exc_debug)
+ {
+- unsigned long dr6 = debug_read_clear_dr6();
++ unsigned long dr6 = debug_read_reset_dr6();
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);