--- /dev/null
+From 5471834a96fb697874be2ca0b052e74bcf3c23d1 Mon Sep 17 00:00:00 2001
+From: Cen Zhang <zzzccc427@gmail.com>
+Date: Wed, 18 Mar 2026 15:32:53 +0800
+Subject: f2fs: add READ_ONCE() for i_blocks in f2fs_update_inode()
+
+From: Cen Zhang <zzzccc427@gmail.com>
+
+commit 5471834a96fb697874be2ca0b052e74bcf3c23d1 upstream.
+
+f2fs_update_inode() reads inode->i_blocks without holding i_lock to
+serialize it to the on-disk inode, while concurrent truncate or
+allocation paths may modify i_blocks under i_lock. Since blkcnt_t is
+u64, this risks torn reads on 32-bit architectures.
+
+Following the approach in ext4_inode_blocks_set(), add READ_ONCE() to prevent
+potential compiler-induced tearing.
+
+Fixes: 19f99cee206c ("f2fs: add core inode operations")
+Cc: stable@vger.kernel.org
+Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/f2fs/inode.c
++++ b/fs/f2fs/inode.c
+@@ -677,7 +677,7 @@ void f2fs_update_inode(struct inode *ino
+ ri->i_uid = cpu_to_le32(i_uid_read(inode));
+ ri->i_gid = cpu_to_le32(i_gid_read(inode));
+ ri->i_links = cpu_to_le32(inode->i_nlink);
+- ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
++ ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(READ_ONCE(inode->i_blocks)) + 1);
+
+ if (!f2fs_is_atomic_file(inode) ||
+ is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
--- /dev/null
+From 95e159ad3e52f7478cfd22e44ec37c9f334f8993 Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Mon, 23 Mar 2026 20:06:24 +0800
+Subject: f2fs: fix fiemap boundary handling when read extent cache is incomplete
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit 95e159ad3e52f7478cfd22e44ec37c9f334f8993 upstream.
+
+f2fs_fiemap() calls f2fs_map_blocks() to obtain the block mapping of a
+file, and then merges contiguous mappings into extents. If the mapping
+is found in the read extent cache, node blocks do not need to be read.
+However, in the following scenario, a contiguous extent can be split
+into two extents:
+
+$ dd if=/dev/zero of=data.128M bs=1M count=128
+$ losetup -f data.128M
+$ mkfs.f2fs /dev/loop0 -f
+$ mount -o mode=lfs /dev/loop0 /mnt/f2fs/
+$ cd /mnt/f2fs/
+$ dd if=/dev/zero of=data.72M bs=1M count=72 && sync
+$ dd if=/dev/zero of=data.4M bs=1M count=4 && sync
+$ dd if=/dev/zero of=data.4M bs=1M count=2 seek=2 conv=notrunc && sync
+$ echo 3 > /proc/sys/vm/drop_caches
+$ dd if=/dev/zero of=data.4M bs=1M count=2 seek=0 conv=notrunc && sync
+$ dd if=/dev/zero of=data.4M bs=1M count=2 seek=0 conv=notrunc && sync
+$ f2fs_io fiemap 0 1024 data.4M
+Fiemap: offset = 0 len = 1024
+logical addr. physical addr. length flags
+0 0000000000000000 0000000006400000 0000000000200000 00001000
+1 0000000000200000 0000000006600000 0000000000200000 00001001
+
+Although the physical addresses of the ranges 0~2MB and 2M~4MB are
+contiguous, the mapping for the 2M~4MB range is not present in memory.
+When the physical addresses for the 0~2MB range are updated, no merge
+happens because the adjacent mapping is missing from the in-memory
+cache. As a result, fiemap reports two separate extents instead of a
+single contiguous one.
+
+The root cause is that the read extent cache does not guarantee that all
+blocks of an extent are present in memory. Therefore, when the extent
+length returned by f2fs_map_blocks_cached() is smaller than maxblocks,
+the remaining mappings are retrieved via f2fs_get_dnode_of_data() to
+ensure correct fiemap extent boundary handling.
+
+Cc: stable@kernel.org
+Fixes: cd8fc5226bef ("f2fs: remove the create argument to f2fs_map_blocks")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/data.c | 25 ++++++++++++++++++++++---
+ 1 file changed, 22 insertions(+), 3 deletions(-)
+
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -1567,8 +1567,26 @@ int f2fs_map_blocks(struct inode *inode,
+ lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ map->m_may_create);
+
+- if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag))
+- goto out;
++ if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) {
++ struct extent_info ei;
++
++ /*
++ * 1. If map->m_multidev_dio is true, map->m_pblk cannot be
++ * waitted by f2fs_wait_on_block_writeback_range() and are not
++ * mergeable.
++ * 2. If pgofs hits the read extent cache, it means the mapping
++ * is already cached in the extent cache, but it is not
++ * mergeable, and there is no need to query the mapping again
++ * via f2fs_get_dnode_of_data().
++ */
++ pgofs = (pgoff_t)map->m_lblk + map->m_len;
++ if (map->m_len == maxblocks ||
++ map->m_multidev_dio ||
++ f2fs_lookup_read_extent_cache(inode, pgofs, &ei))
++ goto out;
++ ofs = map->m_len;
++ goto map_more;
++ }
+
+ map->m_bdev = inode->i_sb->s_bdev;
+ map->m_multidev_dio =
+@@ -1579,7 +1597,8 @@ int f2fs_map_blocks(struct inode *inode,
+
+ /* it only supports block size == page size */
+ pgofs = (pgoff_t)map->m_lblk;
+- end = pgofs + maxblocks;
++map_more:
++ end = (pgoff_t)map->m_lblk + maxblocks;
+
+ if (flag == F2FS_GET_BLOCK_PRECACHE)
+ mode = LOOKUP_NODE_RA;
--- /dev/null
+From c3e238bd1f56993f205ef83889d406dfeaf717a8 Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Wed, 18 Mar 2026 16:45:34 +0800
+Subject: f2fs: fix fsck inconsistency caused by FGGC of node block
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit c3e238bd1f56993f205ef83889d406dfeaf717a8 upstream.
+
+During FGGC node block migration, fsck may incorrectly treat the
+migrated node block as fsync-written data.
+
+The reproduction scenario:
+root@vm:/mnt/f2fs# seq 1 2048 | xargs -n 1 ./test_sync // write inline inode and sync
+root@vm:/mnt/f2fs# rm -f 1
+root@vm:/mnt/f2fs# sync
+root@vm:/mnt/f2fs# f2fs_io gc_range // move data block in sync mode and not write CP
+ SPO, "fsck --dry-run" find inode has already checkpointed but still
+ with DENT_BIT_SHIFT set
+
+The root cause is that GC does not clear the dentry mark and fsync mark
+during node block migration, leading fsck to misinterpret them as
+user-issued fsync writes.
+
+In BGGC mode, node block migration is handled by f2fs_sync_node_pages(),
+which guarantees the dentry and fsync marks are cleared before writing.
+
+This patch moves the set/clear of the fsync|dentry marks into
+__write_node_folio to make the logic clearer, and ensures the
+fsync|dentry mark is cleared in FGGC.
+
+Cc: stable@kernel.org
+Fixes: da011cc0da8c ("f2fs: move node pages only in victim section during GC")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/node.c | 27 +++++++++++++--------------
+ 1 file changed, 13 insertions(+), 14 deletions(-)
+
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1709,9 +1709,10 @@ continue_unlock:
+ return last_folio;
+ }
+
+-static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted,
+- struct writeback_control *wbc, bool do_balance,
+- enum iostat_type io_type, unsigned int *seq_id)
++static bool __write_node_folio(struct folio *folio, bool atomic, bool do_fsync,
++ bool *submitted, struct writeback_control *wbc,
++ bool do_balance, enum iostat_type io_type,
++ unsigned int *seq_id)
+ {
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
+ nid_t nid;
+@@ -1783,6 +1784,8 @@ static bool __write_node_folio(struct fo
+ if (atomic && !test_opt(sbi, NOBARRIER))
+ fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+
++ set_dentry_mark(folio, false);
++ set_fsync_mark(folio, do_fsync);
+ if (IS_INODE(folio) && (atomic || is_fsync_dnode(folio)))
+ set_dentry_mark(folio,
+ f2fs_need_dentry_mark(sbi, ino_of_node(folio)));
+@@ -1849,7 +1852,7 @@ int f2fs_write_single_node_folio(struct
+ goto out_folio;
+ }
+
+- if (!__write_node_folio(node_folio, false, NULL,
++ if (!__write_node_folio(node_folio, false, false, NULL,
+ &wbc, false, FS_GC_NODE_IO, NULL))
+ err = -EAGAIN;
+ goto release_folio;
+@@ -1896,6 +1899,7 @@ retry:
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+ bool submitted = false;
++ bool do_fsync = false;
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_folio_put(last_folio, false);
+@@ -1926,11 +1930,8 @@ continue_unlock:
+
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
+
+- set_fsync_mark(folio, 0);
+- set_dentry_mark(folio, 0);
+-
+ if (!atomic || folio == last_folio) {
+- set_fsync_mark(folio, 1);
++ do_fsync = true;
+ percpu_counter_inc(&sbi->rf_node_block_count);
+ if (IS_INODE(folio)) {
+ if (is_inode_flag_set(inode,
+@@ -1947,8 +1948,9 @@ continue_unlock:
+
+ if (!__write_node_folio(folio, atomic &&
+ folio == last_folio,
+- &submitted, wbc, true,
+- FS_NODE_IO, seq_id)) {
++ do_fsync, &submitted,
++ wbc, true, FS_NODE_IO,
++ seq_id)) {
+ f2fs_folio_put(last_folio, false);
+ folio_batch_release(&fbatch);
+ ret = -EIO;
+@@ -2148,10 +2150,7 @@ write_node:
+ if (!folio_clear_dirty_for_io(folio))
+ goto continue_unlock;
+
+- set_fsync_mark(folio, 0);
+- set_dentry_mark(folio, 0);
+-
+- if (!__write_node_folio(folio, false, &submitted,
++ if (!__write_node_folio(folio, false, false, &submitted,
+ wbc, do_balance, io_type, NULL)) {
+ folio_batch_release(&fbatch);
+ ret = -EIO;
--- /dev/null
+From 019f9dda7f66e55eb94cd32e1d3fff5835f73fbc Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Tue, 10 Mar 2026 17:36:12 +0800
+Subject: f2fs: fix fsck inconsistency caused by incorrect nat_entry flag usage
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit 019f9dda7f66e55eb94cd32e1d3fff5835f73fbc upstream.
+
+f2fs_need_dentry_mark() reads nat_entry flags without mutual exclusion
+with the checkpoint path, which can result in an incorrect inode block
+marking state. The scenario is as follows:
+
+create & write & fsync 'file A' write checkpoint
+- f2fs_do_sync_file // inline inode
+ - f2fs_write_inode // inode folio is dirty
+ - f2fs_write_checkpoint
+ - f2fs_flush_merged_writes
+ - f2fs_sync_node_pages
+ - f2fs_fsync_node_pages // no dirty node
+ - f2fs_need_inode_block_update // return true
+ - f2fs_fsync_node_pages // inode dirtied
+ - f2fs_need_dentry_mark //return true
+ - f2fs_flush_nat_entries
+ - f2fs_write_checkpoint end
+ - __write_node_folio // inode with DENT_BIT_SHIFT set
+ SPO, "fsck --dry-run" find inode has already checkpointed but still
+ with DENT_BIT_SHIFT set
+
+The state observed by f2fs_need_dentry_mark() can differ from the state
+observed in __write_node_folio() after acquiring sbi->node_write. The
+root cause is that the semantics of IS_CHECKPOINTED and
+HAS_FSYNCED_INODE are only guaranteed after the checkpoint write has
+fully completed.
+
+This patch moves set_dentry_mark() into __write_node_folio() and
+protects it with the sbi->node_write lock.
+
+Cc: stable@kernel.org
+Fixes: 88bd02c9472a ("f2fs: fix conditions to remain recovery information in f2fs_sync_file")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/node.c | 14 +++++---------
+ 1 file changed, 5 insertions(+), 9 deletions(-)
+
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1780,13 +1780,12 @@ static bool __write_node_folio(struct fo
+ goto redirty_out;
+ }
+
+- if (atomic) {
+- if (!test_opt(sbi, NOBARRIER))
+- fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+- if (IS_INODE(folio))
+- set_dentry_mark(folio,
++ if (atomic && !test_opt(sbi, NOBARRIER))
++ fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
++
++ if (IS_INODE(folio) && (atomic || is_fsync_dnode(folio)))
++ set_dentry_mark(folio,
+ f2fs_need_dentry_mark(sbi, ino_of_node(folio)));
+- }
+
+ /* should add to global list before clearing PAGECACHE status */
+ if (f2fs_in_warm_node_list(sbi, folio)) {
+@@ -1927,9 +1926,6 @@ continue_unlock:
+ if (is_inode_flag_set(inode,
+ FI_DIRTY_INODE))
+ f2fs_update_inode(inode, folio);
+- if (!atomic)
+- set_dentry_mark(folio,
+- f2fs_need_dentry_mark(sbi, ino));
+ }
+ /* may be written by other thread */
+ if (!folio_test_dirty(folio))
--- /dev/null
+From 68a0178981a0f493295afa29f8880246e561494c Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Tue, 3 Feb 2026 21:36:35 +0800
+Subject: f2fs: fix incorrect file address mapping when inline inode is unwritten
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit 68a0178981a0f493295afa29f8880246e561494c upstream.
+
+When `fileinfo->fi_flags` does not have the `FIEMAP_FLAG_SYNC` bit set
+and inline data has not been persisted yet, the physical address of the
+extent is calculated incorrectly for unwritten inline inodes.
+
+root@vm:/mnt/f2fs# dd if=/dev/zero of=data.3k bs=3k count=1
+root@vm:/mnt/f2fs# f2fs_io fiemap 0 100 data.3k
+Fiemap: offset = 0 len = 100
+ logical addr. physical addr. length flags
+0 0000000000000000 00000ffffffff16c 0000000000000c00 00000301
+
+This patch fixes the issue by checking if the inode's address is valid.
+If the inline inode is unwritten, set the physical address to 0 and
+mark the extent with `FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_DELALLOC`
+flags.
+
+Cc: stable@kernel.org
+Fixes: 67f8cf3cee6f ("f2fs: support fiemap for inline_data")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/inline.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/fs/f2fs/inline.c
++++ b/fs/f2fs/inline.c
+@@ -790,7 +790,7 @@ int f2fs_read_inline_dir(struct file *fi
+ int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+ {
+- __u64 byteaddr, ilen;
++ __u64 byteaddr = 0, ilen;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
+ struct node_info ni;
+@@ -823,9 +823,14 @@ int f2fs_inline_data_fiemap(struct inode
+ if (err)
+ goto out;
+
+- byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+- byteaddr += (char *)inline_data_addr(inode, ifolio) -
+- (char *)F2FS_INODE(ifolio);
++ if (__is_valid_data_blkaddr(ni.blk_addr)) {
++ byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
++ byteaddr += (char *)inline_data_addr(inode, ifolio) -
++ (char *)F2FS_INODE(ifolio);
++ } else {
++ f2fs_bug_on(F2FS_I_SB(inode), ni.blk_addr != NEW_ADDR);
++ flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
++ }
+ err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+ trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
+ out:
--- /dev/null
+From eb2ca3ca983551a80e16a4a25df5a4ce59df8484 Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Mon, 23 Mar 2026 20:06:22 +0800
+Subject: f2fs: fix incorrect multidevice info in trace_f2fs_map_blocks()
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit eb2ca3ca983551a80e16a4a25df5a4ce59df8484 upstream.
+
+When f2fs_map_blocks()->f2fs_map_blocks_cached() hits the read extent
+cache, map->m_multidev_dio is not updated, which leads to incorrect
+multidevice information being reported by trace_f2fs_map_blocks().
+
+This patch updates map->m_multidev_dio in f2fs_map_blocks_cached() when
+the read extent cache is hit.
+
+Cc: stable@kernel.org
+Fixes: 0094e98bd147 ("f2fs: factor a f2fs_map_blocks_cached helper")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/data.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -1508,7 +1508,8 @@ static bool f2fs_map_blocks_cached(struc
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
+
+- if (f2fs_allow_multi_device_dio(sbi, flag)) {
++ map->m_multidev_dio = f2fs_allow_multi_device_dio(sbi, flag);
++ if (map->m_multidev_dio) {
+ int bidx = f2fs_target_device_index(sbi, map->m_pblk);
+ struct f2fs_dev_info *dev = &sbi->devs[bidx];
+
--- /dev/null
+From fe9b8b30b97102859a9102be7bd2a09803bd90bd Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Wed, 18 Mar 2026 16:46:35 +0800
+Subject: f2fs: fix inline data not being written to disk in writeback path
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit fe9b8b30b97102859a9102be7bd2a09803bd90bd upstream.
+
+When f2fs_fiemap() is called with `fileinfo->fi_flags` containing the
+FIEMAP_FLAG_SYNC flag, it attempts to write data to disk before
+retrieving file mappings via filemap_write_and_wait(). However, there is
+an issue where the file does not get mapped as expected. The following
+scenario can occur:
+
+root@vm:/mnt/f2fs# dd if=/dev/zero of=data.3k bs=3k count=1
+root@vm:/mnt/f2fs# xfs_io data.3k -c "fiemap -v 0 4096"
+data.3k:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..5]: 0..5 6 0x307
+
+The root cause of this issue is that f2fs_write_single_data_page() only
+calls f2fs_write_inline_data() to copy data from the data folio to the
+inode folio, and it clears the dirty flag on the data folio. However, it
+does not mark the data folio as writeback. When
+__filemap_fdatawait_range() checks for folios with the writeback flag,
+it returns early, causing f2fs_fiemap() to report that the file has no
+mapping.
+
+To fix this issue, the solution is to call
+f2fs_write_single_node_folio() in f2fs_inline_data_fiemap() when
+getting fiemap with FIEMAP_FLAG_SYNC flags. This patch ensures that the
+inode folio is written back and the writeback process completes before
+proceeding.
+
+Cc: stable@kernel.org
+Fixes: 9ffe0fb5f3bb ("f2fs: handle inline data operations")
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/f2fs.h | 2 ++
+ fs/f2fs/inline.c | 9 +++++++++
+ fs/f2fs/node.c | 2 +-
+ 3 files changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -3888,6 +3888,8 @@ int f2fs_sanity_check_node_footer(struct
+ enum node_type ntype, bool in_irq);
+ struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
+ struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid);
++int f2fs_write_single_node_folio(struct folio *node_folio, int sync_mode,
++ bool mark_dirty, enum iostat_type io_type);
+ int f2fs_move_node_folio(struct folio *node_folio, int gc_type);
+ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
+ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+--- a/fs/f2fs/inline.c
++++ b/fs/f2fs/inline.c
+@@ -812,6 +812,15 @@ int f2fs_inline_data_fiemap(struct inode
+ goto out;
+ }
+
++ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
++ err = f2fs_write_single_node_folio(ifolio, true, false, FS_NODE_IO);
++ if (err)
++ return err;
++ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
++ if (IS_ERR(ifolio))
++ return PTR_ERR(ifolio);
++ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
++ }
+ ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode));
+ if (start >= ilen)
+ goto out;
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1821,7 +1821,7 @@ redirty_out:
+ return false;
+ }
+
+-static int f2fs_write_single_node_folio(struct folio *node_folio, int sync_mode,
++int f2fs_write_single_node_folio(struct folio *node_folio, int sync_mode,
+ bool mark_dirty, enum iostat_type io_type)
+ {
+ int err = 0;
--- /dev/null
+From ed78aeebef05212ef7dca93bd931e4eff67c113f Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Fri, 3 Apr 2026 22:40:17 +0800
+Subject: f2fs: fix node_cnt race between extent node destroy and writeback
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit ed78aeebef05212ef7dca93bd931e4eff67c113f upstream.
+
+f2fs_destroy_extent_node() does not set FI_NO_EXTENT before clearing
+extent nodes. When called from f2fs_drop_inode() with I_SYNC set,
+concurrent kworker writeback can insert new extent nodes into the same
+extent tree, racing with the destroy and triggering f2fs_bug_on() in
+__destroy_extent_node(). The scenario is as follows:
+
+drop inode writeback
+ - iput
+ - f2fs_drop_inode // I_SYNC set
+ - f2fs_destroy_extent_node
+ - __destroy_extent_node
+ - while (node_cnt) {
+ write_lock(&et->lock)
+ __free_extent_tree
+ write_unlock(&et->lock)
+ - __writeback_single_inode
+ - f2fs_outplace_write_data
+ - f2fs_update_read_extent_cache
+ - __update_extent_tree_range
+ // FI_NO_EXTENT not set,
+ // insert new extent node
+ } // node_cnt == 0, exit while
+ - f2fs_bug_on(node_cnt) // node_cnt > 0
+
+Additionally, __update_extent_tree_range() only checks FI_NO_EXTENT for
+EX_READ type, leaving EX_BLOCK_AGE updates completely unprotected.
+
+This patch sets FI_NO_EXTENT under et->lock in __destroy_extent_node(),
+consistent with other callers (__update_extent_tree_range and
+__drop_extent_tree) and checks FI_NO_EXTENT for both EX_READ and
+EX_BLOCK_AGE tree.
+
+Fixes: 3fc5d5a182f6 ("f2fs: fix to shrink read extent node in batches")
+Cc: stable@vger.kernel.org
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/extent_cache.c | 17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+--- a/fs/f2fs/extent_cache.c
++++ b/fs/f2fs/extent_cache.c
+@@ -119,9 +119,10 @@ static bool __may_extent_tree(struct ino
+ if (!__init_may_extent_tree(inode, type))
+ return false;
+
++ if (is_inode_flag_set(inode, FI_NO_EXTENT))
++ return false;
++
+ if (type == EX_READ) {
+- if (is_inode_flag_set(inode, FI_NO_EXTENT))
+- return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(F2FS_I_SB(inode)))
+ return false;
+@@ -644,6 +645,8 @@ static unsigned int __destroy_extent_nod
+
+ while (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
++ if (!is_inode_flag_set(inode, FI_NO_EXTENT))
++ set_inode_flag(inode, FI_NO_EXTENT);
+ node_cnt += __free_extent_tree(sbi, et, nr_shrink);
+ write_unlock(&et->lock);
+ }
+@@ -688,12 +691,12 @@ static void __update_extent_tree_range(s
+
+ write_lock(&et->lock);
+
+- if (type == EX_READ) {
+- if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
+- write_unlock(&et->lock);
+- return;
+- }
++ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
++ write_unlock(&et->lock);
++ return;
++ }
+
++ if (type == EX_READ) {
+ prev = et->largest;
+ dei.len = 0;
+
--- /dev/null
+From b635f2ecdb5ad34f9c967cabb704d6bed9382fd0 Mon Sep 17 00:00:00 2001
+From: Guangshuo Li <lgs201920130244@gmail.com>
+Date: Fri, 10 Apr 2026 20:47:26 +0800
+Subject: f2fs: fix uninitialized kobject put in f2fs_init_sysfs()
+
+From: Guangshuo Li <lgs201920130244@gmail.com>
+
+commit b635f2ecdb5ad34f9c967cabb704d6bed9382fd0 upstream.
+
+In f2fs_init_sysfs(), all failure paths after kset_register() jump to
+put_kobject, which unconditionally releases both f2fs_tune and
+f2fs_feat.
+
+If kobject_init_and_add(&f2fs_feat, ...) fails, f2fs_tune has not been
+initialized yet, so calling kobject_put(&f2fs_tune) is invalid.
+
+Fix this by splitting the unwind path so each error path only releases
+objects that were successfully initialized.
+
+Fixes: a907f3a68ee26ba4 ("f2fs: add a sysfs entry to reclaim POSIX_FADV_NOREUSE pages")
+Cc: stable@vger.kernel.org
+Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/sysfs.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/f2fs/sysfs.c
++++ b/fs/f2fs/sysfs.c
+@@ -1935,24 +1935,26 @@ int __init f2fs_init_sysfs(void)
+ ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
+ NULL, "features");
+ if (ret)
+- goto put_kobject;
++ goto unregister_kset;
+
+ ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype,
+ NULL, "tuning");
+ if (ret)
+- goto put_kobject;
++ goto put_feat;
+
+ f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ if (!f2fs_proc_root) {
+ ret = -ENOMEM;
+- goto put_kobject;
++ goto put_tune;
+ }
+
+ return 0;
+
+-put_kobject:
++put_tune:
+ kobject_put(&f2fs_tune);
++put_feat:
+ kobject_put(&f2fs_feat);
++unregister_kset:
+ kset_unregister(&f2fs_kset);
+ return ret;
+ }
--- /dev/null
+From 92c20989366e023b74fa0c1028af9436c1917dbf Mon Sep 17 00:00:00 2001
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Date: Wed, 18 Mar 2026 16:45:32 +0800
+Subject: f2fs: refactor f2fs_move_node_folio function
+
+From: Yongpeng Yang <yangyongpeng@xiaomi.com>
+
+commit 92c20989366e023b74fa0c1028af9436c1917dbf upstream.
+
+This patch refactors the f2fs_move_node_folio() function. No logical
+changes.
+
+Cc: stable@kernel.org
+Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/f2fs/node.c | 54 ++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 32 insertions(+), 22 deletions(-)
+
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1821,41 +1821,51 @@ redirty_out:
+ return false;
+ }
+
+-int f2fs_move_node_folio(struct folio *node_folio, int gc_type)
++static int f2fs_write_single_node_folio(struct folio *node_folio, int sync_mode,
++ bool mark_dirty, enum iostat_type io_type)
+ {
+ int err = 0;
++ struct writeback_control wbc = {
++ .sync_mode = WB_SYNC_ALL,
++ .nr_to_write = 1,
++ };
+
+- if (gc_type == FG_GC) {
+- struct writeback_control wbc = {
+- .sync_mode = WB_SYNC_ALL,
+- .nr_to_write = 1,
+- };
++ if (!sync_mode) {
++ /* set page dirty and write it */
++ if (!folio_test_writeback(node_folio))
++ folio_mark_dirty(node_folio);
++ goto out_folio;
++ }
+
+- f2fs_folio_wait_writeback(node_folio, NODE, true, true);
++ f2fs_folio_wait_writeback(node_folio, NODE, true, true);
+
++ if (mark_dirty)
+ folio_mark_dirty(node_folio);
++ else if (!folio_test_dirty(node_folio))
++ goto out_folio;
+
+- if (!folio_clear_dirty_for_io(node_folio)) {
+- err = -EAGAIN;
+- goto out_page;
+- }
+-
+- if (!__write_node_folio(node_folio, false, NULL,
+- &wbc, false, FS_GC_NODE_IO, NULL))
+- err = -EAGAIN;
+- goto release_page;
+- } else {
+- /* set page dirty and write it */
+- if (!folio_test_writeback(node_folio))
+- folio_mark_dirty(node_folio);
++ if (!folio_clear_dirty_for_io(node_folio)) {
++ err = -EAGAIN;
++ goto out_folio;
+ }
+-out_page:
++
++ if (!__write_node_folio(node_folio, false, NULL,
++ &wbc, false, FS_GC_NODE_IO, NULL))
++ err = -EAGAIN;
++ goto release_folio;
++out_folio:
+ folio_unlock(node_folio);
+-release_page:
++release_folio:
+ f2fs_folio_put(node_folio, false);
+ return err;
+ }
+
++int f2fs_move_node_folio(struct folio *node_folio, int gc_type)
++{
++ return f2fs_write_single_node_folio(node_folio, gc_type == FG_GC,
++ true, FS_GC_NODE_IO);
++}
++
+ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic,
+ unsigned int *seq_id)
--- /dev/null
+From 7fe2cd4e1a3ad230d8fcc00cc99c4bcce4412a75 Mon Sep 17 00:00:00 2001
+From: Fuad Tabba <tabba@google.com>
+Date: Fri, 24 Apr 2026 09:49:03 +0100
+Subject: KVM: arm64: Fix FEAT_Debugv8p9 to check DebugVer, not PMUVer
+
+From: Fuad Tabba <tabba@google.com>
+
+commit 7fe2cd4e1a3ad230d8fcc00cc99c4bcce4412a75 upstream.
+
+FEAT_Debugv8p9 is incorrectly defined against ID_AA64DFR0_EL1.PMUVer
+instead of ID_AA64DFR0_EL1.DebugVer. All three consumers of the macro
+gate features that are architecturally tied to FEAT_Debugv8p9
+(DebugVer = 0b1011, DDI0487 M.b A2.2.10):
+
+ - HDFGRTR2_EL2.nMDSELR_EL1, HDFGWTR2_EL2.nMDSELR_EL1: MDSELR_EL1
+ is present only when FEAT_Debugv8p9 is implemented (D24.3.21).
+
+ - MDCR_EL2.EBWE: the Extended Breakpoint and Watchpoint Enable bit
+ is RES0 unless FEAT_Debugv8p9 is implemented (D24.3.17).
+
+Neither register has any dependency on PMUVer.
+
+FEAT_Debugv8p9 and FEAT_PMUv3p9 are independent. Per DDI0487 M.b
+A2.2.10, FEAT_Debugv8p9 is unconditionally mandatory from Armv8.9,
+whereas FEAT_PMUv3p9 is mandatory only when FEAT_PMUv3 is implemented.
+An Armv8.9 CPU without a PMU has DebugVer = 0b1011 but PMUVer = 0b0000,
+so the wrong field check would cause KVM to incorrectly treat EBWE and
+MDSELR_EL1 as RES0 on such hardware.
+
+Fixes: 4bc0fe089840 ("KVM: arm64: Add sanitisation for FEAT_FGT2 registers")
+Signed-off-by: Fuad Tabba <tabba@google.com>
+Link: https://patch.msgid.link/20260424084908.370776-2-tabba@google.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/config.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/kvm/config.c
++++ b/arch/arm64/kvm/config.c
+@@ -187,7 +187,7 @@ struct reg_feat_map_desc {
+ #define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP
+ #define FEAT_PoPS ID_AA64MMFR4_EL1, PoPS, IMP
+ #define FEAT_PFAR ID_AA64PFR1_EL1, PFAR, IMP
+-#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, PMUVer, V3P9
++#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, DebugVer, V8P9
+ #define FEAT_PMUv3_SS ID_AA64DFR0_EL1, PMSS, IMP
+ #define FEAT_SEBEP ID_AA64DFR0_EL1, SEBEP, IMP
+ #define FEAT_EBEP ID_AA64DFR1_EL1, EBEP, IMP
--- /dev/null
+From 08d715338287a1affb4c7ad5733decef4558a5c8 Mon Sep 17 00:00:00 2001
+From: Fuad Tabba <tabba@google.com>
+Date: Fri, 24 Apr 2026 09:49:05 +0100
+Subject: KVM: arm64: Fix FEAT_SPE_FnE to use PMSIDR_EL1.FnE, not PMSVer
+
+From: Fuad Tabba <tabba@google.com>
+
+commit 08d715338287a1affb4c7ad5733decef4558a5c8 upstream.
+
+FEAT_SPE_FnE is architecturally detected via PMSIDR_EL1.FnE [6], not
+ID_AA64DFR0_EL1.PMSVer. The FEAT_X macro form (register, field, value)
+cannot encode a PMSIDR_EL1-based feature, so FEAT_SPE_FnE was defined
+identically to FEAT_SPEv1p2 (ID_AA64DFR0_EL1, PMSVer, V1P2), producing
+a duplicate that used PMSVer >= V1P2 as a proxy.
+
+Replace the macro with feat_spe_fne(), following the same pattern as
+the sibling feat_spe_fds(): guard on FEAT_SPEv1p2 and read
+PMSIDR_EL1.FnE [6] directly. Wire the two NEEDS_FEAT consumers to use
+the new function.
+
+Remove the now-unused FEAT_SPE_FnE macro.
+
+Fixes: 63d423a7635b ("KVM: arm64: Switch to table-driven FGU configuration")
+Signed-off-by: Fuad Tabba <tabba@google.com>
+Link: https://patch.msgid.link/20260424084908.370776-4-tabba@google.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/config.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/kvm/config.c
++++ b/arch/arm64/kvm/config.c
+@@ -127,7 +127,6 @@ struct reg_feat_map_desc {
+ }
+
+ #define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP
+-#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2
+ #define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP
+ #define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP
+ #define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP
+@@ -294,6 +293,16 @@ static bool feat_spe_fds(struct kvm *kvm
+ (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS));
+ }
+
++static bool feat_spe_fne(struct kvm *kvm)
++{
++ /*
++ * Revisit this if KVM ever supports SPE -- this really should
++ * look at the guest's view of PMSIDR_EL1.
++ */
++ return (kvm_has_feat(kvm, FEAT_SPEv1p2) &&
++ (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FnE));
++}
++
+ static bool feat_trbe_mpam(struct kvm *kvm)
+ {
+ /*
+@@ -547,7 +556,7 @@ static const struct reg_bits_to_feat_map
+ HDFGRTR_EL2_PMBPTR_EL1 |
+ HDFGRTR_EL2_PMBLIMITR_EL1,
+ FEAT_SPE),
+- NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
++ NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, feat_spe_fne),
+ NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA |
+ HDFGRTR_EL2_nBRBCTL |
+ HDFGRTR_EL2_nBRBIDR,
+@@ -615,7 +624,7 @@ static const struct reg_bits_to_feat_map
+ HDFGWTR_EL2_PMBPTR_EL1 |
+ HDFGWTR_EL2_PMBLIMITR_EL1,
+ FEAT_SPE),
+- NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
++ NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, feat_spe_fne),
+ NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA |
+ HDFGWTR_EL2_nBRBCTL,
+ FEAT_BRBE),
--- /dev/null
+From 5bb0aed57ba944f8c201e4e82ec066e0187e0f85 Mon Sep 17 00:00:00 2001
+From: Quentin Perret <qperret@google.com>
+Date: Fri, 24 Apr 2026 09:49:08 +0100
+Subject: KVM: arm64: Fix initialisation order in __pkvm_init_finalise()
+
+From: Quentin Perret <qperret@google.com>
+
+commit 5bb0aed57ba944f8c201e4e82ec066e0187e0f85 upstream.
+
+fix_host_ownership() walks the hypervisor's stage-1 page-table to
+adjust the host's stage-2 accordingly. Any such adjustment that
+requires cache maintenance operations depends on the per-CPU hyp
+fixmap being present. However, fix_host_ownership() is currently
+called before fix_hyp_pgtable_refcnt() and hyp_create_fixmap(), so
+the fixmap does not yet exist when it runs.
+
+This is benign today because the host stage-2 starts empty and no
+CMOs are needed, but it becomes a latent crash as soon as
+fix_host_ownership() is extended to operate on a non-empty
+page-table.
+
+Reorder the calls so that fix_hyp_pgtable_refcnt() and
+hyp_create_fixmap() complete before fix_host_ownership() is invoked.
+
+Fixes: 0d16d12eb26e ("KVM: arm64: Fix-up hyp stage-1 refcounts for all pages mapped at EL2")
+Signed-off-by: Quentin Perret <qperret@google.com>
+Signed-off-by: Fuad Tabba <tabba@google.com>
+Link: https://patch.msgid.link/20260424084908.370776-7-tabba@google.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/nvhe/setup.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/kvm/hyp/nvhe/setup.c
++++ b/arch/arm64/kvm/hyp/nvhe/setup.c
+@@ -312,15 +312,15 @@ void __noreturn __pkvm_init_finalise(voi
+ };
+ pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+- ret = fix_host_ownership();
++ ret = fix_hyp_pgtable_refcnt();
+ if (ret)
+ goto out;
+
+- ret = fix_hyp_pgtable_refcnt();
++ ret = hyp_create_fixmap();
+ if (ret)
+ goto out;
+
+- ret = hyp_create_fixmap();
++ ret = fix_host_ownership();
+ if (ret)
+ goto out;
+
--- /dev/null
+From 73b9c1e5da84cd69b1a86e374e450817cd051371 Mon Sep 17 00:00:00 2001
+From: Fuad Tabba <tabba@google.com>
+Date: Fri, 24 Apr 2026 09:49:07 +0100
+Subject: KVM: arm64: Fix pin leak and publication ordering in __pkvm_init_vcpu()
+
+From: Fuad Tabba <tabba@google.com>
+
+commit 73b9c1e5da84cd69b1a86e374e450817cd051371 upstream.
+
+Two bugs exist in the vCPU initialisation path:
+
+1. If a check fails after hyp_pin_shared_mem() succeeds, the cleanup
+ path jumps to 'unlock' without calling unpin_host_vcpu() or
+ unpin_host_sve_state(), permanently leaking pin references on the
+ host vCPU and SVE state pages.
+
+ Extract a register_hyp_vcpu() helper that performs the checks and
+ the store. When register_hyp_vcpu() returns an error, call
+ unpin_host_vcpu() and unpin_host_sve_state() inline before falling
+ through to the existing 'unlock' label.
+
+2. register_hyp_vcpu() publishes the new vCPU pointer into
+ 'hyp_vm->vcpus[]' with a bare store, allowing a concurrent caller
+ of pkvm_load_hyp_vcpu() to observe a partially initialised vCPU
+ object.
+
+ Ensure the store uses smp_store_release() and the load uses
+ smp_load_acquire(). While 'vm_table_lock' currently serialises the
+ store and the load, these barriers ensure the reader sees the fully
+ initialised 'hyp_vcpu' object even if there were a lockless path or
+ if the lock's own ordering guarantees were insufficient for nested
+ object initialization.
+
+Fixes: 49af6ddb8e5c ("KVM: arm64: Add infrastructure to create and track pKVM instances at EL2")
+Reported-by: Ben Simner <ben.simner@cl.cam.ac.uk>
+Co-developed-by: Will Deacon <willdeacon@google.com>
+Signed-off-by: Will Deacon <willdeacon@google.com>
+Signed-off-by: Fuad Tabba <tabba@google.com>
+Link: https://patch.msgid.link/20260424084908.370776-6-tabba@google.com
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/nvhe/pkvm.c | 38 +++++++++++++++++++++++++-------------
+ 1 file changed, 25 insertions(+), 13 deletions(-)
+
+--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
++++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
+@@ -259,7 +259,8 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu
+ if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
+ goto unlock;
+
+- hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
++ /* Pairs with smp_store_release() in register_hyp_vcpu(). */
++ hyp_vcpu = smp_load_acquire(&hyp_vm->vcpus[vcpu_idx]);
+ if (!hyp_vcpu)
+ goto unlock;
+
+@@ -801,12 +802,30 @@ err_unpin_kvm:
+ * the page-aligned size of 'struct pkvm_hyp_vcpu'.
+ * Return 0 on success, negative error code on failure.
+ */
++static int register_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm,
++ struct pkvm_hyp_vcpu *hyp_vcpu)
++{
++ unsigned int idx = hyp_vcpu->vcpu.vcpu_idx;
++
++ if (idx >= hyp_vm->kvm.created_vcpus)
++ return -EINVAL;
++
++ if (hyp_vm->vcpus[idx])
++ return -EINVAL;
++
++ /*
++ * Ensure the hyp_vcpu is initialised before publishing it to
++ * the vCPU-load path via 'hyp_vm->vcpus[]'.
++ */
++ smp_store_release(&hyp_vm->vcpus[idx], hyp_vcpu);
++ return 0;
++}
++
+ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+ unsigned long vcpu_hva)
+ {
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vm *hyp_vm;
+- unsigned int idx;
+ int ret;
+
+ hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
+@@ -825,18 +844,11 @@ int __pkvm_init_vcpu(pkvm_handle_t handl
+ if (ret)
+ goto unlock;
+
+- idx = hyp_vcpu->vcpu.vcpu_idx;
+- if (idx >= hyp_vm->kvm.created_vcpus) {
+- ret = -EINVAL;
+- goto unlock;
+- }
+-
+- if (hyp_vm->vcpus[idx]) {
+- ret = -EINVAL;
+- goto unlock;
++ ret = register_hyp_vcpu(hyp_vm, hyp_vcpu);
++ if (ret) {
++ unpin_host_vcpu(host_vcpu);
++ unpin_host_sve_state(hyp_vcpu);
+ }
+-
+- hyp_vm->vcpus[idx] = hyp_vcpu;
+ unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
--- /dev/null
+From a0e6ae45af17e8b27958830595799c702ffbab8d Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Tue, 7 Apr 2026 21:27:02 +0100
+Subject: KVM: arm64: vgic: Fix IIDR revision field extracted from wrong value
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: David Woodhouse <dwmw@amazon.co.uk>
+
+commit a0e6ae45af17e8b27958830595799c702ffbab8d upstream.
+
+The uaccess write handlers for GICD_IIDR in both GICv2 and GICv3
+extract the revision field from 'reg' (the current IIDR value read back
+from the emulated distributor) instead of 'val' (the value userspace is
+trying to write). This means userspace can never actually change the
+implementation revision — the extracted value is always the current one.
+
+Fix the FIELD_GET to use 'val' so that userspace can select a different
+revision for migration compatibility.
+
+Fixes: 49a1a2c70a7f ("KVM: arm64: vgic-v3: Advertise GICR_CTLR.{IR, CES} as a new GICD_IIDR revision")
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Link: https://patch.msgid.link/20260407210949.2076251-2-dwmw2@infradead.org
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/vgic/vgic-mmio-v2.c | 2 +-
+ arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
++++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+@@ -91,7 +91,7 @@ static int vgic_mmio_uaccess_write_v2_mi
+ * migration from old kernels to new kernels with legacy
+ * userspace.
+ */
+- reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
++ reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
+ switch (reg) {
+ case KVM_VGIC_IMP_REV_2:
+ case KVM_VGIC_IMP_REV_3:
+--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
++++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+@@ -194,7 +194,7 @@ static int vgic_mmio_uaccess_write_v3_mi
+ if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK)
+ return -EINVAL;
+
+- reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
++ reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
+ switch (reg) {
+ case KVM_VGIC_IMP_REV_2:
+ case KVM_VGIC_IMP_REV_3:
--- /dev/null
+From 4ce98bf0865c349e7026ad9c14f48da264920953 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Thu, 23 Apr 2026 17:36:07 +0100
+Subject: KVM: arm64: Wake-up from WFI when irqchip is in userspace
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 4ce98bf0865c349e7026ad9c14f48da264920953 upstream.
+
+It appears that there is nothing in the wake-up path that
+evaluates whether the in-kernel interrupts are pending unless
+we have a vgic.
+
+This means that the userspace irqchip support has been broken for
+about four years, and nobody noticed. It was also broken before
+as we wouldn't wake-up on a PMU interrupt, but hey, who cares...
+
+It is probably time to remove the feature altogether, because it
+was a terrible idea 10 years ago, and it still is.
+
+Fixes: b57de4ffd7c6d ("KVM: arm64: Simplify kvm_cpu_has_pending_timer()")
+Link: https://patch.msgid.link/20260423163607.486345-1-maz@kernel.org
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/arm.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/arm64/kvm/arm.c
++++ b/arch/arm64/kvm/arm.c
+@@ -755,6 +755,10 @@ int kvm_arch_vcpu_runnable(struct kvm_vc
+ {
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE);
+
++ irq_lines |= (!irqchip_in_kernel(v->kvm) &&
++ (kvm_timer_should_notify_user(v) ||
++ kvm_pmu_should_notify_user(v)));
++
+ return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
+ && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
+ }
--- /dev/null
+From 8dfa2f8780e486d05b9a0ffce70b8f5fbd62053e Mon Sep 17 00:00:00 2001
+From: Wentao Guan <guanwentao@uniontech.com>
+Date: Mon, 4 May 2026 09:00:20 +0800
+Subject: LoongArch: Fix potential ADE in loongson_gpu_fixup_dma_hang()
+
+From: Wentao Guan <guanwentao@uniontech.com>
+
+commit 8dfa2f8780e486d05b9a0ffce70b8f5fbd62053e upstream.
+
+The switch case in loongson_gpu_fixup_dma_hang() may match neither DC2 nor
+DC3, and readl(crtc_reg) will then access a random address, because the
+"device" is from "base+PCI_DEVICE_ID" and "base" is from "pdev->devfn+1".
+This is wrong
+when my platform inserts a discrete GPU:
+
+lspci -tv
+-[0000:00]-+-00.0 Loongson Technology LLC Hyper Transport Bridge Controller
+...
+ +-06.0 Loongson Technology LLC LG100 GPU
+ +-06.2 Loongson Technology LLC Device 7a37
+...
+
+Add a default switch case to fix the panic as below:
+
+ Kernel ade access[#1]:
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.6.136-loong64-desktop-hwe+ #4
+ pc 90000000017e5534 ra 90000000017e54c0 tp 90000001002f8000 sp 90000001002fb6c0
+ a0 80000efe00003100 a1 0000000000003100 a2 0000000000000000 a3 0000000000000002
+ a4 90000001002fb6b4 a5 900000087cdb58fd a6 90000000027af000 a7 0000000000000001
+ t0 00000000000085b9 t1 000000000000ffff t2 0000000000000000 t3 0000000000000000
+ t4 fffffffffffffffd t5 00000000fffb6d9c t6 0000000000083b00 t7 00000000000070c0
+ t8 900000087cdb4d94 u0 900000087cdb58fd s9 90000001002fb826 s0 90000000031c12c8
+ s1 7fffffffffffff00 s2 90000000031c12d0 s3 0000000000002710 s4 0000000000000000
+ s5 0000000000000000 s6 9000000100053000 s7 7fffffffffffff00 s8 90000000030d4000
+ ra: 90000000017e54c0 loongson_gpu_fixup_dma_hang+0x40/0x210
+ ERA: 90000000017e5534 loongson_gpu_fixup_dma_hang+0xb4/0x210
+ CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE)
+ PRMD: 00000004 (PPLV0 +PIE -PWE)
+ EUEN: 00000000 (-FPE -SXE -ASXE -BTE)
+ ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7)
+ ESTAT: 00480000 [ADEM] (IS= ECode=8 EsubCode=1)
+ BADV: 7fffffffffffff00
+ PRID: 0014d000 (Loongson-64bit, Loongson-3A6000-HV)
+ Modules linked in:
+ Process swapper/0 (pid: 1, threadinfo=(____ptrval____), task=(____ptrval____))
+ Stack : 0000000000000006 90000001002fb778 90000001002fb704 0000000000000007
+ 0000000016a65700 90000000017e5690 000000000000ffff ffffffffffffffff
+ 900000000209f7c0 9000000100053000 900000000209f7a8 9000000000eebc08
+ 0000000000000000 0000000000000000 0000000000000006 90000001002fb778
+ 90000001000530b8 90000000027af000 0000000000000000 9000000100054000
+ 9000000100053000 9000000000ebb70c 9000000100004c00 9000000004000001
+ 90000001002fb7e4 bae765461f31cb12 0000000000000000 0000000000000000
+ 0000000000000006 90000000027af000 0000000000000030 90000000027af000
+ 900000087cd6f800 9000000100053000 0000000000000000 9000000000ebc560
+ 7a2500147cdaf720 bae765461f31cb12 0000000000000001 0000000000000030
+ ...
+ Call Trace:
+ [<90000000017e5534>] loongson_gpu_fixup_dma_hang+0xb4/0x210
+ [<9000000000eebc08>] pci_fixup_device+0x108/0x280
+ [<9000000000ebb70c>] pci_setup_device+0x24c/0x690
+ [<9000000000ebc560>] pci_scan_single_device+0xe0/0x140
+ [<9000000000ebc684>] pci_scan_slot+0xc4/0x280
+ [<9000000000ebdd00>] pci_scan_child_bus_extend+0x60/0x3f0
+ [<9000000000f5bc94>] acpi_pci_root_create+0x2b4/0x420
+ [<90000000017e5e74>] pci_acpi_scan_root+0x2d4/0x440
+ [<9000000000f5b02c>] acpi_pci_root_add+0x21c/0x3a0
+ [<9000000000f4ee54>] acpi_bus_attach+0x1a4/0x3c0
+ [<90000000010e200c>] device_for_each_child+0x6c/0xe0
+ [<9000000000f4bbf4>] acpi_dev_for_each_child+0x44/0x70
+ [<9000000000f4ef40>] acpi_bus_attach+0x290/0x3c0
+ [<90000000010e200c>] device_for_each_child+0x6c/0xe0
+ [<9000000000f4bbf4>] acpi_dev_for_each_child+0x44/0x70
+ [<9000000000f4ef40>] acpi_bus_attach+0x290/0x3c0
+ [<9000000000f5211c>] acpi_bus_scan+0x6c/0x280
+ [<900000000189c028>] acpi_scan_init+0x194/0x310
+ [<900000000189bc6c>] acpi_init+0xcc/0x140
+ [<9000000000220cdc>] do_one_initcall+0x4c/0x310
+ [<90000000018618fc>] kernel_init_freeable+0x258/0x2d4
+ [<900000000184326c>] kernel_init+0x28/0x13c
+ [<9000000000222008>] ret_from_kernel_thread+0xc/0xa4
+
+Cc: stable@vger.kernel.org
+Fixes: 95db0c9f526d ("LoongArch: Workaround LS2K/LS7A GPU DMA hang bug")
+Link: https://gist.github.com/opsiff/ebf2dac51b4013d22462f2124c55f807
+Link: https://gist.github.com/opsiff/a62f2a73db0492b3c49bf223a339b133
+Signed-off-by: Wentao Guan <guanwentao@uniontech.com>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/pci/pci.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/loongarch/pci/pci.c
++++ b/arch/loongarch/pci/pci.c
+@@ -132,6 +132,9 @@ static void loongson_gpu_fixup_dma_hang(
+ crtc_reg = regbase;
+ crtc_offset = 0x400;
+ break;
++ default:
++ iounmap(regbase);
++ return;
+ }
+
+ for (i = 0; i < CRTC_NUM_MAX; i++, crtc_reg += crtc_offset) {
--- /dev/null
+From b3e31a6650d4cab63f0814c37c0b360372c6ee9e Mon Sep 17 00:00:00 2001
+From: Qiang Ma <maqianga@uniontech.com>
+Date: Mon, 4 May 2026 09:00:37 +0800
+Subject: LoongArch: KVM: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS
+
+From: Qiang Ma <maqianga@uniontech.com>
+
+commit b3e31a6650d4cab63f0814c37c0b360372c6ee9e upstream.
+
+It doesn't make sense to return the recommended maximum number of vCPUs
+which exceeds the maximum possible number of vCPUs.
+
+Other architectures have already done this, such as commit 57a2e13ebdda
+("KVM: MIPS: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS")
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Bibo Mao <maobibo@loongson.cn>
+Signed-off-by: Qiang Ma <maqianga@uniontech.com>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/kvm/vm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/loongarch/kvm/vm.c
++++ b/arch/loongarch/kvm/vm.c
+@@ -94,7 +94,7 @@ int kvm_vm_ioctl_check_extension(struct
+ r = 1;
+ break;
+ case KVM_CAP_NR_VCPUS:
+- r = num_online_cpus();
++ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
--- /dev/null
+From 2433f3f5724b3af569d9fb411ba728629524738b Mon Sep 17 00:00:00 2001
+From: Bibo Mao <maobibo@loongson.cn>
+Date: Mon, 4 May 2026 09:00:48 +0800
+Subject: LoongArch: KVM: Fix HW timer interrupt lost when inject interrupt by software
+
+From: Bibo Mao <maobibo@loongson.cn>
+
+commit 2433f3f5724b3af569d9fb411ba728629524738b upstream.
+
+With passthrough HW timer, timer interrupt is injected by HW. When
+inject emulated CPU interrupt by software such SIP0/SIP1/IPI, HW timer
+interrupt may be lost.
+
+Here check whether the timer tick value inverts between before and after
+injecting an emulated CPU interrupt by software; checking whether the
+timer is enabled by reading the timer cfg register is skipped. If the
+timer tick value is detected to have changed, the timer must be enabled,
+so inject a timer interrupt by software in that case.
+
+Cc: <stable@vger.kernel.org>
+Fixes: f45ad5b8aa93 ("LoongArch: KVM: Implement vcpu interrupt operations").
+Signed-off-by: Bibo Mao <maobibo@loongson.cn>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/kvm/interrupt.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/arch/loongarch/kvm/interrupt.c
++++ b/arch/loongarch/kvm/interrupt.c
+@@ -26,6 +26,7 @@ static unsigned int priority_to_irq[EXCC
+ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+ {
+ unsigned int irq = 0;
++ unsigned long old, new;
+
+ clear_bit(priority, &vcpu->arch.irq_pending);
+ if (priority < EXCCODE_INT_NUM)
+@@ -36,7 +37,13 @@ static int kvm_irq_deliver(struct kvm_vc
+ case INT_IPI:
+ case INT_SWI0:
+ case INT_SWI1:
++ old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL);
+ set_gcsr_estat(irq);
++ new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL);
++
++ /* Inject TI if TVAL inverted */
++ if (new > old)
++ set_gcsr_estat(CPU_TIMER);
+ break;
+
+ case INT_HWI0 ... INT_HWI7:
+@@ -53,6 +60,7 @@ static int kvm_irq_deliver(struct kvm_vc
+ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority)
+ {
+ unsigned int irq = 0;
++ unsigned long old, new;
+
+ clear_bit(priority, &vcpu->arch.irq_clear);
+ if (priority < EXCCODE_INT_NUM)
+@@ -63,7 +71,13 @@ static int kvm_irq_clear(struct kvm_vcpu
+ case INT_IPI:
+ case INT_SWI0:
+ case INT_SWI1:
++ old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL);
+ clear_gcsr_estat(irq);
++ new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL);
++
++ /* Inject TI if TVAL inverted */
++ if (new > old)
++ set_gcsr_estat(CPU_TIMER);
+ break;
+
+ case INT_HWI0 ... INT_HWI7:
--- /dev/null
+From b323a441da602dfdfc24f30d3190cac786ffebf2 Mon Sep 17 00:00:00 2001
+From: Xianglai Li <lixianglai@loongson.cn>
+Date: Mon, 4 May 2026 09:00:37 +0800
+Subject: LoongArch: KVM: Fix "unreliable stack" for kvm_exc_entry
+
+From: Xianglai Li <lixianglai@loongson.cn>
+
+commit b323a441da602dfdfc24f30d3190cac786ffebf2 upstream.
+
+Insert the appropriate UNWIND hint into the kvm_exc_entry assembly
+function to guide the generation of correct ORC table entries, thereby
+solving the timeout problem ("unreliable stack") while loading the
+livepatch-sample module on a physical machine running virtual machines
+with multiple vcpus.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Xianglai Li <lixianglai@loongson.cn>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/kvm/switch.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/loongarch/kvm/switch.S
++++ b/arch/loongarch/kvm/switch.S
+@@ -111,7 +111,7 @@
+ .p2align PAGE_SHIFT
+ .cfi_sections .debug_frame
+ SYM_CODE_START(kvm_exc_entry)
+- UNWIND_HINT_UNDEFINED
++ UNWIND_HINT_END_OF_STACK
+ csrwr a2, KVM_TEMP_KS
+ csrrd a2, KVM_VCPU_KS
+ addi.d a2, a2, KVM_VCPU_ARCH
--- /dev/null
+From 5a873d77ba792410a796595a917be6a440f9b7d2 Mon Sep 17 00:00:00 2001
+From: Bibo Mao <maobibo@loongson.cn>
+Date: Mon, 4 May 2026 09:00:48 +0800
+Subject: LoongArch: KVM: Move unconditional delay into timer clear scenery
+
+From: Bibo Mao <maobibo@loongson.cn>
+
+commit 5a873d77ba792410a796595a917be6a440f9b7d2 upstream.
+
+When timer interrupt arrives in guest kernel, guest kernel clears the
+timer interrupt and program timer with the next incoming event.
+
+During this stage, timer tick is -1 and timer interrupt status is
+disabled in ESTAT register. KVM hypervisor need write zero with timer
+tick register and wait timer interrupt injection from HW side, and
+then clear timer interrupt.
+
+So there is a 2-cycle delay in the KVM hypervisor to emulate such a
+scenario, and the delay is unnecessary if there is no need to clear the
+timer interrupt.
+
+Here move the 2-cycle delay into the timer clear scenario, add a timer
+ESTAT check after the delay, and set the max timer expire value if the
+timer interrupt still has not arrived.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Bibo Mao <maobibo@loongson.cn>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/kvm/timer.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/arch/loongarch/kvm/timer.c
++++ b/arch/loongarch/kvm/timer.c
+@@ -96,15 +96,21 @@ void kvm_restore_timer(struct kvm_vcpu *
+ * and set CSR TVAL with -1
+ */
+ write_gcsr_timertick(0);
+- __delay(2); /* Wait cycles until timer interrupt injected */
+
+ /*
+ * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear
+ * timer interrupt, and CSR TVAL keeps unchanged with -1, it
+ * avoids spurious timer interrupt
+ */
+- if (!(estat & CPU_TIMER))
++ if (!(estat & CPU_TIMER)) {
++ __delay(2); /* Wait cycles until timer interrupt injected */
++
++ /* Write TVAL with max value if no TI shot */
++ estat = kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT);
++ if (!(estat & CPU_TIMER))
++ write_gcsr_timertick(CSR_TCFG_VAL);
+ gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR);
++ }
+ return;
+ }
+
--- /dev/null
+From 81e18777d61440511451866c7c80b34a8bdd6b33 Mon Sep 17 00:00:00 2001
+From: Tao Cui <cuitao@kylinos.cn>
+Date: Mon, 4 May 2026 09:00:38 +0800
+Subject: LoongArch: KVM: Use kvm_set_pte() in kvm_flush_pte()
+
+From: Tao Cui <cuitao@kylinos.cn>
+
+commit 81e18777d61440511451866c7c80b34a8bdd6b33 upstream.
+
+kvm_flush_pte() is the only caller that directly assigns *pte instead
+of using the kvm_set_pte() wrapper. Use the wrapper for consistency with
+the rest of the file.
+
+No functional change intended.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Bibo Mao <maobibo@loongson.cn>
+Signed-off-by: Tao Cui <cuitao@kylinos.cn>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/kvm/mmu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/loongarch/kvm/mmu.c
++++ b/arch/loongarch/kvm/mmu.c
+@@ -95,7 +95,7 @@ static int kvm_flush_pte(kvm_pte_t *pte,
+ else
+ kvm->stat.pages--;
+
+- *pte = ctx->invalid_entry;
++ kvm_set_pte(pte, ctx->invalid_entry);
+
+ return 1;
+ }
--- /dev/null
+From 49f33840dcc907d21313d369e34872880846b61c Mon Sep 17 00:00:00 2001
+From: Huacai Chen <chenhuacai@loongson.cn>
+Date: Mon, 4 May 2026 09:00:20 +0800
+Subject: LoongArch: Use per-root-bridge PCIH flag to skip mem resource fixup
+
+From: Huacai Chen <chenhuacai@loongson.cn>
+
+commit 49f33840dcc907d21313d369e34872880846b61c upstream.
+
+When firmware enables 64-bit PCI host bridge support, some root bridges
+already provide valid 64-bit mem resource windows through ACPI.
+
+In this case, the LoongArch-specific mem resource high-bits fixup in
+acpi_prepare_root_resources() should not be applied unconditionally.
+Otherwise, the kernel may override the native resource layout derived
+from firmware, and later BAR assignment can fail to place device BARs
+into the intended 64-bit address space correctly.
+
+Add a per-root-bridge ACPI flag, PCIH, and evaluate it from the current
+root bridge device scope. When PCIH is set, skip the mem resource high-
+bits fixup path and let the kernel use the firmware-provided resource
+description directly. When PCIH is absent or cleared, keep the existing
+behavior and continue filling the high address bits from the host bridge
+address.
+
+This makes the behavior per-root-bridge configurable and avoids breaking
+valid 64-bit BAR space allocation on bridges whose 64-bit windows have
+already been fully described by firmware.
+
+Cc: stable@vger.kernel.org
+Suggested-by: Chao Li <lichao@loongson.cn>
+Tested-by: Dongyan Qian <qiandongyan@loongson.cn>
+Signed-off-by: Dongyan Qian <qiandongyan@loongson.cn>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/loongarch/pci/acpi.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/loongarch/pci/acpi.c
++++ b/arch/loongarch/pci/acpi.c
+@@ -61,11 +61,16 @@ static void acpi_release_root_info(struc
+ static int acpi_prepare_root_resources(struct acpi_pci_root_info *ci)
+ {
+ int status;
++ unsigned long long pci_h = 0;
+ struct resource_entry *entry, *tmp;
+ struct acpi_device *device = ci->bridge;
+
+ status = acpi_pci_probe_root_resources(ci);
+ if (status > 0) {
++ acpi_evaluate_integer(device->handle, "PCIH", NULL, &pci_h);
++ if (pci_h)
++ return status;
++
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
+ if (entry->res->flags & IORESOURCE_MEM) {
+ entry->offset = ci->root->mcfg_addr & GENMASK_ULL(63, 40);
--- /dev/null
+From f14d6e9c3678a067f304abba561e0c5446c7e845 Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Mon, 27 Apr 2026 21:54:35 +0200
+Subject: mptcp: fastclose msk when linger time is 0
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit f14d6e9c3678a067f304abba561e0c5446c7e845 upstream.
+
+The SO_LINGER socket option has been supported for a while with MPTCP
+sockets [1], but it didn't cause the equivalent of a TCP reset as
+expected when enabled and its time was set to 0. This was causing some
+behavioural differences with TCP where some connections were not
+promptly stopped as expected.
+
+To fix that, an extra condition is checked at close() time before
+sending an MP_FASTCLOSE, the MPTCP equivalent of a TCP reset.
+
+Note that backporting up to [1] will be difficult as more changes are
+needed to be able to send MP_FASTCLOSE. It seems better to stop at [2],
+which was supposed to already imitate TCP.
+
+Validated with MPTCP packetdrill tests [3].
+
+Fixes: 268b12387460 ("mptcp: setsockopt: support SO_LINGER") [1]
+Fixes: d21f83485518 ("mptcp: use fastclose on more edge scenarios") [2]
+Cc: stable@vger.kernel.org
+Reported-by: Lance Tuller <lance@lance0.com>
+Closes: https://github.com/lance0/xfr/pull/67
+Link: https://github.com/multipath-tcp/packetdrill/pull/196 [3]
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-3-7432b7f279fa@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -3162,7 +3162,8 @@ bool __mptcp_close(struct sock *sk, long
+ goto cleanup;
+ }
+
+- if (mptcp_data_avail(msk) || timeout < 0) {
++ if (mptcp_data_avail(msk) || timeout < 0 ||
++ (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
+ /* If the msk has read data, or the caller explicitly ask it,
+ * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
+ */
--- /dev/null
+From 6254a16d6f0c672e3809ca5d7c9a28a55d71f764 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Fri, 1 May 2026 21:35:36 +0200
+Subject: mptcp: fix rx timestamp corruption on fastopen
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 6254a16d6f0c672e3809ca5d7c9a28a55d71f764 upstream.
+
+The skb cb offset containing the timestamp presence flag is cleared
+before loading such information. Cache such value before MPTCP CB
+initialization.
+
+Fixes: 36b122baf6a8 ("mptcp: add subflow_v(4,6)_send_synack()")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-3-b70118df778e@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/fastopen.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/mptcp/fastopen.c
++++ b/net/mptcp/fastopen.c
+@@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_p
+ struct sock *sk, *ssk;
+ struct sk_buff *skb;
+ struct tcp_sock *tp;
++ bool has_rxtstamp;
+
+ /* on early fallback the subflow context is deleted by
+ * subflow_syn_recv_sock()
+@@ -39,12 +40,13 @@ void mptcp_fastopen_subflow_synack_set_p
+ */
+ tp->copied_seq += skb->len;
+ subflow->ssn_offset += skb->len;
++ has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+
+ /* Only the sequence delta is relevant */
+ MPTCP_SKB_CB(skb)->map_seq = -skb->len;
+ MPTCP_SKB_CB(skb)->end_seq = 0;
+ MPTCP_SKB_CB(skb)->offset = 0;
+- MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
++ MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
+ MPTCP_SKB_CB(skb)->cant_coalesce = 1;
+
+ mptcp_data_lock(sk);
--- /dev/null
+From b5c52908d52c6c8eb8933264aa6087a0600fd892 Mon Sep 17 00:00:00 2001
+From: Gang Yan <yangang@kylinos.cn>
+Date: Mon, 27 Apr 2026 21:54:34 +0200
+Subject: mptcp: fix scheduling with atomic in timestamp sockopt
+
+From: Gang Yan <yangang@kylinos.cn>
+
+commit b5c52908d52c6c8eb8933264aa6087a0600fd892 upstream.
+
+Using lock_sock_fast() (atomic context) around sock_set_timestamp()
+and sock_set_timestamping() is unsafe, as both helpers can sleep.
+
+Replace lock_sock_fast() with sleepable lock_sock()/release_sock()
+to avoid scheduling while atomic panic.
+
+Fixes: 9061f24bf82e ("mptcp: sockopt: propagate timestamp request to subflows")
+Cc: stable@vger.kernel.org
+Reported-by: Sashiko <sashiko-bot@kernel.org>
+Closes: https://sashiko.dev/#/patchset/20260420093343.16443-1-gang.yan@linux.dev
+Signed-off-by: Gang Yan <yangang@kylinos.cn>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-2-7432b7f279fa@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/sockopt.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/net/mptcp/sockopt.c
++++ b/net/mptcp/sockopt.c
+@@ -159,10 +159,10 @@ static int mptcp_setsockopt_sol_socket_t
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+- bool slow = lock_sock_fast(ssk);
+
++ lock_sock(ssk);
+ sock_set_timestamp(ssk, optname, !!val);
+- unlock_sock_fast(ssk, slow);
++ release_sock(ssk);
+ }
+
+ release_sock(sk);
+@@ -235,10 +235,10 @@ static int mptcp_setsockopt_sol_socket_t
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+- bool slow = lock_sock_fast(ssk);
+
++ lock_sock(ssk);
+ sock_set_timestamping(ssk, optname, timestamping);
+- unlock_sock_fast(ssk, slow);
++ release_sock(ssk);
+ }
+
+ release_sock(sk);
--- /dev/null
+From 03f324f3f1f7619a47b9c91282cb12775ab0a2f1 Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:50 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: allow ID 0
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 03f324f3f1f7619a47b9c91282cb12775ab0a2f1 upstream.
+
+ADD_ADDR can be sent for the ID 0, which corresponds to the local
+address and port linked to the initial subflow.
+
+Indeed, this address could be removed, and re-added later on, e.g. what
+is done in the "delete re-add signal" MPTCP Join selftests. So no reason
+to ignore it.
+
+Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-2-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -350,9 +350,6 @@ static void mptcp_pm_add_timer(struct ti
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
+- if (!entry->addr.id)
+- return;
+-
+ if (mptcp_pm_should_add_signal_addr(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
--- /dev/null
+From 9634cb35af17019baec21ca648516ce376fa10e6 Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:52 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: always decrease sk refcount
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 9634cb35af17019baec21ca648516ce376fa10e6 upstream.
+
+When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer().
+It should then be released in all cases at the end.
+
+Some (unlikely) checks were returning directly instead of calling
+sock_put() to decrease the refcount. Jump to a new 'exit' label to call
+__sock_put() (which will become sock_put() in the next commit) to fix
+this potential leak.
+
+While at it, drop the '!msk' check which cannot happen because it is
+never reset, and explicitly mark the remaining one as "unlikely".
+
+Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-4-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -344,11 +344,8 @@ static void mptcp_pm_add_timer(struct ti
+
+ pr_debug("msk=%p\n", msk);
+
+- if (!msk)
+- return;
+-
+- if (inet_sk_state_load(sk) == TCP_CLOSE)
+- return;
++ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
++ goto exit;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+@@ -386,6 +383,7 @@ static void mptcp_pm_add_timer(struct ti
+
+ out:
+ bh_unlock_sock(sk);
++exit:
+ __sock_put(sk);
+ }
+
--- /dev/null
+From 5cd6e0ad79d2615264f63929f8b457ad97ae550d Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:51 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: fix potential data-race
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 5cd6e0ad79d2615264f63929f8b457ad97ae550d upstream.
+
+This mptcp_pm_add_timer() helper is executed as a timer callback in
+softirq context. To avoid any data races, the socket lock needs to be
+held with bh_lock_sock().
+
+If the socket is in use, retry again soon after, similar to what is done
+with the keepalive timer.
+
+Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-3-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -350,6 +350,13 @@ static void mptcp_pm_add_timer(struct ti
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
++ bh_lock_sock(sk);
++ if (sock_owned_by_user(sk)) {
++ /* Try again later. */
++ sk_reset_timer(sk, timer, jiffies + HZ / 20);
++ goto out;
++ }
++
+ if (mptcp_pm_should_add_signal_addr(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
+@@ -378,6 +385,7 @@ static void mptcp_pm_add_timer(struct ti
+ mptcp_pm_subflow_established(msk);
+
+ out:
++ bh_unlock_sock(sk);
+ __sock_put(sk);
+ }
+
--- /dev/null
+From b7b9a461569734d33d3259d58d2507adfac107ed Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:53 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: free sk if last
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit b7b9a461569734d33d3259d58d2507adfac107ed upstream.
+
+When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer(),
+and released at the end.
+
+If at that moment, it was the last reference being held, the sk would
+not be freed. sock_put() should then be called instead of __sock_put().
+
+But that's not enough: if it is the last reference, sock_put() will call
+sk_free(), which will end up calling sk_stop_timer_sync() on the same
+timer, and waiting indefinitely to finish. So it is needed to mark that
+the timer is done at the end of the timer handler when it has not been
+rescheduled, not to call sk_stop_timer_sync() on "itself".
+
+Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-5-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 28 ++++++++++++++++++----------
+ 1 file changed, 18 insertions(+), 10 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -16,6 +16,7 @@ struct mptcp_pm_add_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ u8 retrans_times;
++ bool timer_done;
+ struct timer_list add_timer;
+ struct mptcp_sock *sock;
+ struct rcu_head rcu;
+@@ -340,22 +341,22 @@ static void mptcp_pm_add_timer(struct ti
+ add_timer);
+ struct mptcp_sock *msk = entry->sock;
+ struct sock *sk = (struct sock *)msk;
+- unsigned int timeout;
++ unsigned int timeout = 0;
+
+ pr_debug("msk=%p\n", msk);
+
++ bh_lock_sock(sk);
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+- goto exit;
++ goto out;
+
+- bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+- sk_reset_timer(sk, timer, jiffies + HZ / 20);
++ timeout = HZ / 20;
+ goto out;
+ }
+
+ if (mptcp_pm_should_add_signal_addr(msk)) {
+- sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
++ timeout = TCP_RTO_MAX / 8;
+ goto out;
+ }
+
+@@ -373,8 +374,9 @@ static void mptcp_pm_add_timer(struct ti
+ }
+
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
+- sk_reset_timer(sk, timer,
+- jiffies + (timeout << entry->retrans_times));
++ timeout <<= entry->retrans_times;
++ else
++ timeout = 0;
+
+ spin_unlock_bh(&msk->pm.lock);
+
+@@ -382,9 +384,13 @@ static void mptcp_pm_add_timer(struct ti
+ mptcp_pm_subflow_established(msk);
+
+ out:
++ if (timeout)
++ sk_reset_timer(sk, timer, jiffies + timeout);
++ else
++ /* if sock_put calls sk_free: avoid waiting for this timer */
++ entry->timer_done = true;
+ bh_unlock_sock(sk);
+-exit:
+- __sock_put(sk);
++ sock_put(sk);
+ }
+
+ struct mptcp_pm_add_entry *
+@@ -447,6 +453,7 @@ bool mptcp_pm_alloc_anno_list(struct mpt
+
+ timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
+ reset_timer:
++ add_entry->timer_done = false;
+ timeout = mptcp_adjust_add_addr_timeout(msk);
+ if (timeout)
+ sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
+@@ -467,7 +474,8 @@ static void mptcp_pm_free_anno_list(stru
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+- sk_stop_timer_sync(sk, &entry->add_timer);
++ if (!entry->timer_done)
++ sk_stop_timer_sync(sk, &entry->add_timer);
+ kfree_rcu(entry, rcu);
+ }
+ }
--- /dev/null
+From 3cf12492891c4b5ff54dda404a2de4ec54c9e1b5 Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:54 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: resched blocked ADD_ADDR quicker
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 3cf12492891c4b5ff54dda404a2de4ec54c9e1b5 upstream.
+
+When an ADD_ADDR needs to be retransmitted and another one has already
+been prepared -- e.g. multiple ADD_ADDRs have been sent in a row and
+need to be retransmitted later -- this additional retransmission will
+need to wait.
+
+In this case, the timer was reset to TCP_RTO_MAX / 8, which is ~15
+seconds. This delay is unnecessary long: it should just be rescheduled
+at the next opportunity, e.g. after the retransmission timeout.
+
+Without this modification, some issues can be seen from time to time in
+the selftests when multiple ADD_ADDRs are sent, and the host takes time
+to process them, e.g. the "signal addresses, ADD_ADDR timeout" MPTCP
+Join selftest, especially with a debug kernel config.
+
+Note that on older kernels, 'timeout' is not available. It should be
+enough to replace it by one second (HZ).
+
+Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-6-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 7 +------
+ 1 file changed, 1 insertion(+), 6 deletions(-)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -355,13 +355,8 @@ static void mptcp_pm_add_timer(struct ti
+ goto out;
+ }
+
+- if (mptcp_pm_should_add_signal_addr(msk)) {
+- timeout = TCP_RTO_MAX / 8;
+- goto out;
+- }
+-
+ timeout = mptcp_adjust_add_addr_timeout(msk);
+- if (!timeout)
++ if (!timeout || mptcp_pm_should_add_signal_addr(msk))
+ goto out;
+
+ spin_lock_bh(&msk->pm.lock);
--- /dev/null
+From 62a9b19dce77e72426f049fb99b9d1d032b9a8ea Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:56 +0200
+Subject: mptcp: pm: ADD_ADDR rtx: return early if no retrans
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 62a9b19dce77e72426f049fb99b9d1d032b9a8ea upstream.
+
+No need to iterate over all subflows if there is no retransmission
+needed.
+
+Exit early in this case then.
+
+Fixes: 30549eebc4d8 ("mptcp: make ADD_ADDR retransmission timeout adaptive")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-8-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -311,6 +311,9 @@ static unsigned int mptcp_adjust_add_add
+ struct mptcp_subflow_context *subflow;
+ unsigned int max = 0, max_stale = 0;
+
++ if (!rto)
++ return 0;
++
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct inet_connection_sock *icsk = inet_csk(ssk);
--- /dev/null
+From b12014d2d36eaed4e4bec5f1ac7e91110eeb100d Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:49 +0200
+Subject: mptcp: pm: kernel: correctly retransmit ADD_ADDR ID 0
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit b12014d2d36eaed4e4bec5f1ac7e91110eeb100d upstream.
+
+When adding the ADD_ADDR to the list, the address including the IP, port
+and ID are copied. On the other hand, when the endpoint corresponds to
+the one from the initial subflow, the ID is set to 0, as specified by
+the MPTCP protocol.
+
+The issue is that the ID was reset after having copied the ID in the
+ADD_ADDR entry. So the retransmission was done, but using a different ID
+than the initial one.
+
+Fixes: 8b8ed1b429f8 ("mptcp: pm: reuse ID 0 after delete and re-add")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-1-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm_kernel.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/net/mptcp/pm_kernel.c
++++ b/net/mptcp/pm_kernel.c
+@@ -336,6 +336,8 @@ static void mptcp_pm_create_subflow_or_s
+
+ /* check first for announce */
+ if (msk->pm.add_addr_signaled < endp_signal_max) {
++ u8 endp_id;
++
+ /* due to racing events on both ends we can reach here while
+ * previous add address is still running: if we invoke now
+ * mptcp_pm_announce_addr(), that will fail and the
+@@ -349,19 +351,20 @@ static void mptcp_pm_create_subflow_or_s
+ if (!select_signal_address(pernet, msk, &local))
+ goto subflow;
+
++ /* Special case for ID0: set the correct ID */
++ endp_id = local.addr.id;
++ if (endp_id == msk->mpc_endpoint_id)
++ local.addr.id = 0;
++
+ /* If the alloc fails, we are on memory pressure, not worth
+ * continuing, and trying to create subflows.
+ */
+ if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
+ return;
+
+- __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
++ __clear_bit(endp_id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled++;
+
+- /* Special case for ID0: set the correct ID */
+- if (local.addr.id == msk->mpc_endpoint_id)
+- local.addr.id = 0;
+-
+ mptcp_pm_announce_addr(msk, &local.addr, false);
+ mptcp_pm_addr_send_ack(msk);
+
--- /dev/null
+From 166b78344031bf7ac9f55cb5282776cfd85f220e Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:57 +0200
+Subject: mptcp: pm: prio: skip closed subflows
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 166b78344031bf7ac9f55cb5282776cfd85f220e upstream.
+
+When sending an MP_PRIO, closed subflows need to be skipped.
+
+This fixes the case where the initial subflow got closed, re-opened
+later, then an MP_PRIO is needed for the same local address.
+
+Note that explicit MP_PRIO cannot be sent during the 3WHS, so it is fine
+to use __mptcp_subflow_active().
+
+Fixes: 067065422fcd ("mptcp: add the outgoing MP_PRIO support")
+Cc: stable@vger.kernel.org
+Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-9-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/mptcp/pm.c
++++ b/net/mptcp/pm.c
+@@ -283,6 +283,9 @@ int mptcp_pm_mp_prio_send_ack(struct mpt
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct mptcp_addr_info local, remote;
+
++ if (!__mptcp_subflow_active(subflow))
++ continue;
++
+ mptcp_local_address((struct sock_common *)ssk, &local);
+ if (!mptcp_addresses_equal(&local, addr, addr->port))
+ continue;
--- /dev/null
+From 70ece9d7021c54cf40c72b31b066e9088f5f75f5 Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Fri, 1 May 2026 21:35:37 +0200
+Subject: mptcp: sockopt: increase seq in mptcp_setsockopt_all_sf
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 70ece9d7021c54cf40c72b31b066e9088f5f75f5 upstream.
+
+mptcp_setsockopt_all_sf() was missing a call to sockopt_seq_inc(). This
+is required not to cause missing synchronization for newer subflows
+created later on.
+
+This helper is called each time a socket option is set on subflows, and
+future ones will need to inherit this option after their creation.
+
+Fixes: 51c5fd09e1b4 ("mptcp: add TCP_MAXSEG sockopt support")
+Cc: stable@vger.kernel.org
+Suggested-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-4-b70118df778e@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/sockopt.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/mptcp/sockopt.c
++++ b/net/mptcp/sockopt.c
+@@ -812,6 +812,10 @@ static int mptcp_setsockopt_all_sf(struc
+ if (ret)
+ break;
+ }
++
++ if (!ret)
++ sockopt_seq_inc(msk);
++
+ return ret;
+ }
+
--- /dev/null
+From 5f95c21fc23a7ef22b4d27d1ed9bb55557ffb926 Mon Sep 17 00:00:00 2001
+From: Gang Yan <yangang@kylinos.cn>
+Date: Mon, 27 Apr 2026 21:54:33 +0200
+Subject: mptcp: sockopt: set timestamp flags on subflow socket, not msk
+
+From: Gang Yan <yangang@kylinos.cn>
+
+commit 5f95c21fc23a7ef22b4d27d1ed9bb55557ffb926 upstream.
+
+Both mptcp_setsockopt_sol_socket_tstamp() and
+mptcp_setsockopt_sol_socket_timestamping() iterate over subflows,
+acquire the subflow socket lock, but then erroneously pass the MPTCP
+msk socket to sock_set_timestamp() / sock_set_timestamping() instead
+of the subflow ssk. As a result, the timestamp flags are set on the
+wrong socket and have no effect on the actual subflows.
+
+Pass ssk instead of sk to both helpers.
+
+Fixes: 9061f24bf82e ("mptcp: sockopt: propagate timestamp request to subflows")
+Cc: stable@vger.kernel.org
+Signed-off-by: Gang Yan <yangang@kylinos.cn>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-1-7432b7f279fa@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/sockopt.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/mptcp/sockopt.c
++++ b/net/mptcp/sockopt.c
+@@ -161,7 +161,7 @@ static int mptcp_setsockopt_sol_socket_t
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+- sock_set_timestamp(sk, optname, !!val);
++ sock_set_timestamp(ssk, optname, !!val);
+ unlock_sock_fast(ssk, slow);
+ }
+
+@@ -237,7 +237,7 @@ static int mptcp_setsockopt_sol_socket_t
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+- sock_set_timestamping(sk, optname, timestamping);
++ sock_set_timestamping(ssk, optname, timestamping);
+ unlock_sock_fast(ssk, slow);
+ }
+
--- /dev/null
+From c4a99a921949cddc590b22bb14eeb23dffcc3ba6 Mon Sep 17 00:00:00 2001
+From: Shardul Bankar <shardul.b@mpiricsoftware.com>
+Date: Fri, 1 May 2026 21:35:34 +0200
+Subject: mptcp: use MPJoinSynAckHMacFailure for SynAck HMAC failure
+
+From: Shardul Bankar <shardul.b@mpiricsoftware.com>
+
+commit c4a99a921949cddc590b22bb14eeb23dffcc3ba6 upstream.
+
+In subflow_finish_connect(), HMAC validation of the server's HMAC
+in SYN/ACK + MP_JOIN increments MPTCP_MIB_JOINACKMAC ("HMAC was
+wrong on ACK + MP_JOIN") on failure. The function processes the
+SYN/ACK, not the ACK; the matching MPTCP_MIB_JOINSYNACKMAC counter
+("HMAC was wrong on SYN/ACK + MP_JOIN") exists but is not
+incremented anywhere in the tree.
+
+The mirror site on the server, subflow_syn_recv_sock(), already
+uses JOINACKMAC correctly for ACK HMAC failure. Use JOINSYNACKMAC
+at the SYN/ACK validation site so each counter reflects the packet
+whose HMAC actually failed.
+
+Suggested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure")
+Cc: stable@vger.kernel.org
+Signed-off-by: Shardul Bankar <shardul.b@mpiricsoftware.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-1-b70118df778e@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/subflow.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -580,7 +580,7 @@ static void subflow_finish_connect(struc
+ subflow->backup);
+
+ if (!subflow_thmac_valid(subflow)) {
+- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+ goto do_reset;
+ }
--- /dev/null
+From a6da02d4c00fdda2417e42ad2b762a9209e6cc49 Mon Sep 17 00:00:00 2001
+From: Shardul Bankar <shardul.b@mpiricsoftware.com>
+Date: Fri, 1 May 2026 21:35:35 +0200
+Subject: mptcp: use MPTCP_RST_EMPTCP for ACK HMAC validation failure
+
+From: Shardul Bankar <shardul.b@mpiricsoftware.com>
+
+commit a6da02d4c00fdda2417e42ad2b762a9209e6cc49 upstream.
+
+When HMAC validation fails on a received ACK + MP_JOIN in
+subflow_syn_recv_sock(), the subflow is reset with reason
+MPTCP_RST_EPROHIBIT ("Administratively prohibited"). This is
+incorrect: HMAC validation failure is an MPTCP protocol-level
+error, not an administrative policy denial.
+
+The mirror site on the client, in subflow_finish_connect(), already
+uses MPTCP_RST_EMPTCP ("MPTCP-specific error") for the same kind of
+HMAC failure on the SYN/ACK + MP_JOIN. Use the same reason on the
+server side for symmetry and accuracy.
+
+Suggested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Fixes: 443041deb5ef ("mptcp: fix NULL pointer in can_accept_new_subflow")
+Cc: stable@vger.kernel.org
+Signed-off-by: Shardul Bankar <shardul.b@mpiricsoftware.com>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-2-b70118df778e@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/subflow.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -907,7 +907,7 @@ create_child:
+
+ if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+- subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
++ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
+ goto dispose_child;
+ }
+
--- /dev/null
+From a8aeea1bf3c80cc87983689e0118770e019bd4f3 Mon Sep 17 00:00:00 2001
+From: Shuai Xue <xueshuai@linux.alibaba.com>
+Date: Wed, 11 Feb 2026 20:46:24 +0800
+Subject: PCI/AER: Clear only error bits in PCIe Device Status
+
+From: Shuai Xue <xueshuai@linux.alibaba.com>
+
+commit a8aeea1bf3c80cc87983689e0118770e019bd4f3 upstream.
+
+Currently, pcie_clear_device_status() clears the entire PCIe Device Status
+register (PCI_EXP_DEVSTA) by writing back the value read from the register,
+which affects not only the error status bits but also other writable bits.
+
+According to PCIe r7.0, sec 7.5.3.5, this register contains:
+
+ - RW1C error status bits (CED, NFED, FED, URD at bits 0-3): These are the
+ four error status bits that need to be cleared.
+
+ - Read-only bits (AUXPD at bit 4, TRPND at bit 5): Writing to these has
+ no effect.
+
+ - Emergency Power Reduction Detected (bit 6): A RW1C non-error bit
+ introduced in PCIe r5.0 (2019). This is currently the only writable
+ non-error bit in the Device Status register. Unconditionally clearing
+ this bit can interfere with other software components that rely on this
+ power management indication.
+
+ - Reserved bits (RsvdZ): These bits are required to be written as zero.
+ Writing 1s to them (as the current implementation may do) violates the
+ specification.
+
+To prevent unintended side effects, modify pcie_clear_device_status() to
+only write 1s to the four error status bits (CED, NFED, FED, URD), leaving
+the Emergency Power Reduction Detected bit and reserved bits unaffected.
+
+Fixes: ec752f5d54d7 ("PCI/AER: Clear device status bits during ERR_FATAL and ERR_NONFATAL")
+Suggested-by: Lukas Wunner <lukas@wunner.de>
+Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
+Reviewed-by: Lukas Wunner <lukas@wunner.de>
+Cc: stable@vger.kernel.org
+Link: https://patch.msgid.link/20260211124624.49656-1-xueshuai@linux.alibaba.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/pci.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -2272,10 +2272,9 @@ EXPORT_SYMBOL_GPL(pci_set_pcie_reset_sta
+ #ifdef CONFIG_PCIEAER
+ void pcie_clear_device_status(struct pci_dev *dev)
+ {
+- u16 sta;
+-
+- pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
+- pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
++ pcie_capability_write_word(dev, PCI_EXP_DEVSTA,
++ PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
++ PCI_EXP_DEVSTA_FED | PCI_EXP_DEVSTA_URD);
+ }
+ #endif
+
--- /dev/null
+From 1ab4a3c805084d752ec571efc78272295a9f2f74 Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Fri, 27 Mar 2026 10:56:43 +0100
+Subject: PCI/AER: Stop ruling out unbound devices as error source
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit 1ab4a3c805084d752ec571efc78272295a9f2f74 upstream.
+
+When searching for the error source, the AER driver rules out devices whose
+enable_cnt is zero. This was introduced in 2009 by commit 28eb27cf0839
+("PCI AER: support invalid error source IDs") without providing a
+rationale.
+
+Drivers typically call pci_enable_device() on probe, hence the enable_cnt
+check essentially filters out unbound devices. At the time of the commit,
+drivers had to opt in to AER by calling pci_enable_pcie_error_reporting()
+and so any AER-enabled device could be assumed to be bound to a driver.
+The check thus made sense because it allowed skipping config space accesses
+to devices which were known not to be the error source.
+
+But since 2022, AER is universally enabled on all devices when they are
+enumerated, cf. commit f26e58bf6f54 ("PCI/AER: Enable error reporting when
+AER is native").
+
+Errors may very well be reported by unbound devices, e.g. due to link
+instability. By ruling them out as error source, errors reported by them
+are neither logged nor cleared. When they do get bound and another error
+occurs, the earlier error is reported together with the new error, which
+may confuse users. Stop doing so.
+
+Fixes: f26e58bf6f54 ("PCI/AER: Enable error reporting when AER is native")
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Stefan Roese <stefan.roese@mailbox.org>
+Cc: stable@vger.kernel.org # v6.0+
+Link: https://patch.msgid.link/734338c2e8b669db5a5a3b45d34131b55ffebfca.1774605029.git.lukas@wunner.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/pcie/aer.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/drivers/pci/pcie/aer.c
++++ b/drivers/pci/pcie/aer.c
+@@ -1034,8 +1034,6 @@ static bool is_error_source(struct pci_d
+ * 3) There are multiple errors and prior ID comparing fails;
+ * We check AER status registers to find possible reporter.
+ */
+- if (atomic_read(&dev->enable_cnt) == 0)
+- return false;
+
+ /* Check if AER is enabled */
+ pcie_capability_read_word(dev, PCI_EXP_DEVCTL, ®16);
--- /dev/null
+From cc33985d26c92a5c908c0185239c59ec35b8637c Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Mon, 16 Feb 2026 08:46:13 +0100
+Subject: PCI/ASPM: Fix pci_clear_and_set_config_dword() usage
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit cc33985d26c92a5c908c0185239c59ec35b8637c upstream.
+
+When aspm_calc_l12_info() programs the L1 PM Substates Control 1 register
+fields Common_Mode_Restore_Time, LTR_L1.2_THRESHOLD_Value and _Scale, it
+invokes pci_clear_and_set_config_dword() in an incorrect way:
+
+For the bits to clear it selects those corresponding to the field. So far
+so good. But for the bits to set it passes a full register value.
+pci_clear_and_set_config_dword() performs a boolean OR operation which
+sets all bits of that value, not just the ones that were just cleared.
+
+Thus, when setting the LTR_L1.2_THRESHOLD_Value and _Scale on the child of
+an ASPM link, aspm_calc_l12_info() also sets the Common_Mode_Restore_Time.
+That's a spec violation: PCIe r7.0 sec 7.8.3.3 says this field is RsvdP
+for Upstream Ports. On Adrià's Pixelbook Eve, Common_Mode_Restore_Time
+of the Intel 7265 "Stone Peak" wifi card is zero, yet aspm_calc_l12_info()
+does not preserve the zero bits but instead programs the value calculated
+for the Root Port into the wifi card.
+
+Likewise, when setting the Common_Mode_Restore_Time on the Root Port,
+aspm_calc_l12_info() also changes the LTR_L1.2_THRESHOLD_Value and _Scale
+from the initial 163840 nsec to 237568 nsec (due to ORing those fields),
+only to reduce it afterwards to 106496 nsec.
+
+Amend all invocations of pci_clear_and_set_config_dword() to only set bits
+which are cleared.
+
+Finally, when setting the T_POWER_ON_Value and _Scale on the Root Port and
+the wifi card, aspm_calc_l12_info() fails to preserve bits declared RsvdP
+and instead overwrites them with zeroes. Replace pci_write_config_dword()
+with pci_clear_and_set_config_dword() to avoid this.
+
+Fixes: aeda9adebab8 ("PCI/ASPM: Configure L1 substate settings")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=220705#c22
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Tested-by: Adrià Vilanova Martínez <me@avm99963.com>
+Cc: stable@vger.kernel.org # v4.11+
+Link: https://patch.msgid.link/5c1752d7512eed0f4ea57b84b12d7ee08ca61fc5.1771226659.git.lukas@wunner.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/pcie/aspm.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/pci/pcie/aspm.c
++++ b/drivers/pci/pcie/aspm.c
+@@ -706,22 +706,29 @@ static void aspm_calc_l12_info(struct pc
+ }
+
+ /* Program T_POWER_ON times in both ports */
+- pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2);
+- pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2);
++ pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2,
++ PCI_L1SS_CTL2_T_PWR_ON_VALUE |
++ PCI_L1SS_CTL2_T_PWR_ON_SCALE, ctl2);
++ pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL2,
++ PCI_L1SS_CTL2_T_PWR_ON_VALUE |
++ PCI_L1SS_CTL2_T_PWR_ON_SCALE, ctl2);
+
+ /* Program Common_Mode_Restore_Time in upstream device */
+ pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+- PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1);
++ PCI_L1SS_CTL1_CM_RESTORE_TIME,
++ ctl1 & PCI_L1SS_CTL1_CM_RESTORE_TIME);
+
+ /* Program LTR_L1.2_THRESHOLD time in both ports */
+ pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+ PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+ PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+- ctl1);
++ ctl1 & (PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
++ PCI_L1SS_CTL1_LTR_L12_TH_SCALE));
+ pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1,
+ PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+ PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+- ctl1);
++ ctl1 & (PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
++ PCI_L1SS_CTL1_LTR_L12_TH_SCALE));
+
+ if (pl1_2_enables || cl1_2_enables) {
+ pci_clear_and_set_config_dword(parent,
--- /dev/null
+From 8ba0b706a485b1e607594cf4210786d517ad1611 Mon Sep 17 00:00:00 2001
+From: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Date: Thu, 30 Apr 2026 08:25:55 +0800
+Subject: perf/x86/intel: Always reprogram ACR events to prevent stale masks
+
+From: Dapeng Mi <dapeng1.mi@linux.intel.com>
+
+commit 8ba0b706a485b1e607594cf4210786d517ad1611 upstream.
+
+Members of an ACR group are logically linked via a bitmask of their
+hardware counter indices. If some members of the group are assigned new
+hardware counters during rescheduling, even events that keep their
+original counter index must be updated with a new mask.
+
+Without this, an event will continue to use a stale acr_mask that
+references the old indices of its group peers. Ensure all ACR events are
+reprogrammed during the scheduling path to maintain consistency across
+the group.
+
+Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload")
+Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: stable@vger.kernel.org
+Link: https://patch.msgid.link/20260430002558.712334-3-dapeng1.mi@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/events/core.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/events/core.c
++++ b/arch/x86/events/core.c
+@@ -1281,13 +1281,16 @@ int x86_perf_rdpmc_index(struct perf_eve
+ return event->hw.event_base_rdpmc;
+ }
+
+-static inline int match_prev_assignment(struct hw_perf_event *hwc,
++static inline int match_prev_assignment(struct perf_event *event,
+ struct cpu_hw_events *cpuc,
+ int i)
+ {
++ struct hw_perf_event *hwc = &event->hw;
++
+ return hwc->idx == cpuc->assign[i] &&
+- hwc->last_cpu == smp_processor_id() &&
+- hwc->last_tag == cpuc->tags[i];
++ hwc->last_cpu == smp_processor_id() &&
++ hwc->last_tag == cpuc->tags[i] &&
++ !is_acr_event_group(event);
+ }
+
+ static void x86_pmu_start(struct perf_event *event, int flags);
+@@ -1333,7 +1336,7 @@ static void x86_pmu_enable(struct pmu *p
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+- match_prev_assignment(hwc, cpuc, i))
++ match_prev_assignment(event, cpuc, i))
+ continue;
+
+ /*
+@@ -1354,7 +1357,7 @@ static void x86_pmu_enable(struct pmu *p
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+- if (!match_prev_assignment(hwc, cpuc, i))
++ if (!match_prev_assignment(event, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
--- /dev/null
+From 9a44949da669708f19d29141e65b3ac774d08f5a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
+Date: Mon, 2 Mar 2026 13:32:05 +0000
+Subject: power: supply: max17042: avoid overflow when determining health
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: André Draszik <andre.draszik@linaro.org>
+
+commit 9a44949da669708f19d29141e65b3ac774d08f5a upstream.
+
+If vmax has the default value of INT_MAX (e.g. because not specified in
+DT), battery health is reported as over-voltage. This is because adding
+any value to vmax (the vmax tolerance in this case) causes it to wrap
+around, making it negative and smaller than the measured battery
+voltage.
+
+Avoid that by using size_add().
+
+Fixes: edd4ab055931 ("power: max17042_battery: add HEALTH and TEMP_* properties support")
+Cc: stable@vger.kernel.org
+Signed-off-by: André Draszik <andre.draszik@linaro.org>
+Link: https://patch.msgid.link/20260302-max77759-fg-v3-6-3c5f01dbda23@linaro.org
+Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/power/supply/max17042_battery.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/power/supply/max17042_battery.c
++++ b/drivers/power/supply/max17042_battery.c
+@@ -201,7 +201,7 @@ static int max17042_get_battery_health(s
+ goto out;
+ }
+
+- if (vbatt > chip->pdata->vmax + MAX17042_VMAX_TOLERANCE) {
++ if (vbatt > size_add(chip->pdata->vmax, MAX17042_VMAX_TOLERANCE)) {
+ *health = POWER_SUPPLY_HEALTH_OVERVOLTAGE;
+ goto out;
+ }
--- /dev/null
+From 6771c54728c278bf1e4bfdab4fddbbb186e33498 Mon Sep 17 00:00:00 2001
+From: Nilay Shroff <nilay@linux.ibm.com>
+Date: Wed, 11 Mar 2026 19:13:31 +0530
+Subject: powerpc/xive: fix kmemleak caused by incorrect chip_data lookup
+
+From: Nilay Shroff <nilay@linux.ibm.com>
+
+commit 6771c54728c278bf1e4bfdab4fddbbb186e33498 upstream.
+
+The kmemleak reports the following memory leak:
+
+Unreferenced object 0xc0000002a7fbc640 (size 64):
+ comm "kworker/8:1", pid 540, jiffies 4294937872
+ hex dump (first 32 bytes):
+ 01 00 00 00 00 00 00 00 00 00 09 04 00 04 00 00 ................
+ 00 00 a7 81 00 00 0a c0 00 00 08 04 00 04 00 00 ................
+ backtrace (crc 177d48f6):
+ __kmalloc_cache_noprof+0x520/0x730
+ xive_irq_alloc_data.constprop.0+0x40/0xe0
+ xive_irq_domain_alloc+0xd0/0x1b0
+ irq_domain_alloc_irqs_parent+0x44/0x6c
+ pseries_irq_domain_alloc+0x1cc/0x354
+ irq_domain_alloc_irqs_parent+0x44/0x6c
+ msi_domain_alloc+0xb0/0x220
+ irq_domain_alloc_irqs_locked+0x138/0x4d0
+ __irq_domain_alloc_irqs+0x8c/0xfc
+ __msi_domain_alloc_irqs+0x214/0x4d8
+ msi_domain_alloc_irqs_all_locked+0x70/0xf8
+ pci_msi_setup_msi_irqs+0x60/0x78
+ __pci_enable_msix_range+0x54c/0x98c
+ pci_alloc_irq_vectors_affinity+0x16c/0x1d4
+ nvme_pci_enable+0xac/0x9c0 [nvme]
+ nvme_probe+0x340/0x764 [nvme]
+
+This occurs when allocating MSI-X vectors for an NVMe device. During
+allocation the XIVE code creates a struct xive_irq_data and stores it
+in irq_data->chip_data.
+
+When the MSI-X irqdomain is later freed, xive_irq_free_data() is
+responsible for retrieving this structure and freeing it. However,
+after commit cc0cc23babc9 ("powerpc/xive: Untangle xive from child
+interrupt controller drivers"), xive_irq_free_data() retrieves the
+chip_data using irq_get_chip_data(), which looks up the data through
+the child domain.
+
+This is incorrect because the XIVE-specific irq data is associated with
+the XIVE (parent) domain. As a result the lookup fails and the allocated
+struct xive_irq_data is never freed, leading to the kmemleak report
+shown above.
+
+Fix this by retrieving the irq_data from the correct domain using
+irq_domain_get_irq_data() and then accessing the chip_data via
+irq_data_get_irq_chip_data().
+
+Cc: stable@vger.kernel.org
+Fixes: cc0cc23babc9 ("powerpc/xive: Untangle xive from child interrupt controller drivers")
+Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
+Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
+Reviewed-by: Nam Cao <namcao@linutronix.de>
+Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
+Link: https://patch.msgid.link/20260311134336.326996-1-nilay@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/sysdev/xive/common.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/arch/powerpc/sysdev/xive/common.c
++++ b/arch/powerpc/sysdev/xive/common.c
+@@ -1038,13 +1038,19 @@ static struct xive_irq_data *xive_irq_al
+ return xd;
+ }
+
+-static void xive_irq_free_data(unsigned int virq)
++static void xive_irq_free_data(struct irq_domain *domain, unsigned int virq)
+ {
+- struct xive_irq_data *xd = irq_get_chip_data(virq);
++ struct xive_irq_data *xd;
++ struct irq_data *data = irq_domain_get_irq_data(domain, virq);
++
++ if (!data)
++ return;
+
++ xd = irq_data_get_irq_chip_data(data);
+ if (!xd)
+ return;
+- irq_set_chip_data(virq, NULL);
++
++ irq_domain_reset_irq_data(data);
+ xive_cleanup_irq_data(xd);
+ kfree(xd);
+ }
+@@ -1304,7 +1310,7 @@ static int xive_irq_domain_map(struct ir
+
+ static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
+ {
+- xive_irq_free_data(virq);
++ xive_irq_free_data(d, virq);
+ }
+
+ static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
+@@ -1442,7 +1448,7 @@ static void xive_irq_domain_free(struct
+ pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++)
+- xive_irq_free_data(virq + i);
++ xive_irq_free_data(domain, virq + i);
+ }
+ #endif
+
--- /dev/null
+From 654a27f25530d052eeedf086e6c3e2d585c203bd Mon Sep 17 00:00:00 2001
+From: Kai Zen <kai.aizen.dev@gmail.com>
+Date: Tue, 7 Apr 2026 12:20:22 +0300
+Subject: RDMA/ionic: bound node_desc sysfs read with %.64s
+
+From: Kai Zen <kai.aizen.dev@gmail.com>
+
+commit 654a27f25530d052eeedf086e6c3e2d585c203bd upstream.
+
+node_desc[64] in struct ib_device is not guaranteed to be NUL-
+terminated. The core IB sysfs handler uses "%.64s" for exactly this
+reason (drivers/infiniband/core/sysfs.c:1307), since node_desc_store()
+performs a raw memcpy of up to IB_DEVICE_NODE_DESC_MAX bytes with no NUL
+termination:
+
+ memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
+
+If exactly 64 bytes are written via the node_desc sysfs file, the array
+contains no NUL byte. The ionic hca_type_show() handler uses unbounded
+"%s" and will read past the end of node_desc into adjacent fields of
+struct ib_device until it encounters a NUL.
+
+ionic supports IB_DEVICE_MODIFY_NODE_DESC, so this is triggerable by
+userspace.
+
+Match the core handler and bound the format specifier.
+
+Cc: stable@vger.kernel.org
+Fixes: 2075bbe8ef03 ("RDMA/ionic: Register device ops for miscellaneous functionality")
+Link: https://patch.msgid.link/r/CALynFi7NAbhDCt1tdaDbf6TnLvAqbaHa6-Wqf6OkzREbA_PAfg@mail.gmail.com
+Signed-off-by: Kai Aizen <kai.aizen.dev@gmail.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/ionic/ionic_ibdev.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c
+index bd4c73e530d0..0382a64839d2 100644
+--- a/drivers/infiniband/hw/ionic/ionic_ibdev.c
++++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c
+@@ -185,7 +185,7 @@ static ssize_t hca_type_show(struct device *device,
+ struct ionic_ibdev *dev =
+ rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev);
+
+- return sysfs_emit(buf, "%s\n", dev->ibdev.node_desc);
++ return sysfs_emit(buf, "%s.64\n", dev->ibdev.node_desc);
+ }
+ static DEVICE_ATTR_RO(hca_type);
+
+--
+2.54.0
+
--- /dev/null
+From 70f780edcd1e86350202d8a409de026b2d2e2067 Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:34 -0300
+Subject: RDMA/ionic: Fix typo in format string
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 70f780edcd1e86350202d8a409de026b2d2e2067 upstream.
+
+Applying the corrupted patch by hand mangled the format string, put the s
+in the right place.
+
+Cc: stable@vger.kernel.org
+Fixes: 654a27f25530 ("RDMA/ionic: bound node_desc sysfs read with %.64s")
+Link: https://patch.msgid.link/r/1-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Reported-by: Brad Spengler <brad.spengler@opensrcsec.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/ionic/ionic_ibdev.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c
+index 0382a64839d2..73a616ae3502 100644
+--- a/drivers/infiniband/hw/ionic/ionic_ibdev.c
++++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c
+@@ -185,7 +185,7 @@ static ssize_t hca_type_show(struct device *device,
+ struct ionic_ibdev *dev =
+ rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev);
+
+- return sysfs_emit(buf, "%s.64\n", dev->ibdev.node_desc);
++ return sysfs_emit(buf, "%.64s\n", dev->ibdev.node_desc);
+ }
+ static DEVICE_ATTR_RO(hca_type);
+
+--
+2.54.0
+
--- /dev/null
+From 6aaa978c6b6218cfac15fe1dab17c76fe229ce3f Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:40 -0300
+Subject: RDMA/mana: Fix error unwind in mana_ib_create_qp_rss()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 6aaa978c6b6218cfac15fe1dab17c76fe229ce3f upstream.
+
+Sashiko points out that mana_ib_cfg_vport_steering() is leaked, the normal
+destroy path cleans it up.
+
+Cc: stable@vger.kernel.org
+Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter")
+Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4
+Link: https://patch.msgid.link/r/7-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Reviewed-by: Long Li <longli@microsoft.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mana/qp.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mana/qp.c
++++ b/drivers/infiniband/hw/mana/qp.c
+@@ -236,13 +236,15 @@ static int mana_ib_create_qp_rss(struct
+ ibdev_dbg(&mdev->ib_dev,
+ "Failed to copy to udata create rss-qp, %d\n",
+ ret);
+- goto fail;
++ goto err_disable_vport_rx;
+ }
+
+ kfree(mana_ind_table);
+
+ return 0;
+
++err_disable_vport_rx:
++ mana_disable_vport_rx(mpc);
+ fail:
+ while (i-- > 0) {
+ ibwq = ind_tbl->ind_tbl[i];
--- /dev/null
+From 34ecf795692ee57c393109f4a24ccc313091e137 Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:39 -0300
+Subject: RDMA/mana: Fix mana_destroy_wq_obj() cleanup in mana_ib_create_qp_rss()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 34ecf795692ee57c393109f4a24ccc313091e137 upstream.
+
+Sashiko points out there are two bugs here in the error unwind flow, both
+related to how the WQ table is unwound.
+
+First there is a double i-- on the first failure path due to the while loop
+having a i--, remove it.
+
+Second if mana_ib_install_cq_cb() fails then mana_create_wq_obj() is not
+undone due to the above i--.
+
+Cc: stable@vger.kernel.org
+Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP")
+Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1
+Link: https://patch.msgid.link/r/6-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Reviewed-by: Long Li <longli@microsoft.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mana/qp.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+--- a/drivers/infiniband/hw/mana/qp.c
++++ b/drivers/infiniband/hw/mana/qp.c
+@@ -194,11 +194,8 @@ static int mana_ib_create_qp_rss(struct
+
+ ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
+ &wq_spec, &cq_spec, &wq->rx_object);
+- if (ret) {
+- /* Do cleanup starting with index i-1 */
+- i--;
++ if (ret)
+ goto fail;
+- }
+
+ /* The GDMA regions are now owned by the WQ object */
+ wq->queue.gdma_region = GDMA_INVALID_DMA_REGION;
+@@ -218,8 +215,10 @@ static int mana_ib_create_qp_rss(struct
+
+ /* Create CQ table entry */
+ ret = mana_ib_install_cq_cb(mdev, cq);
+- if (ret)
++ if (ret) {
++ mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
+ goto fail;
++ }
+ }
+ resp.num_entries = i;
+
--- /dev/null
+From 159f2efabc89d3f931d38f2d35876535d4abf0a3 Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:38 -0300
+Subject: RDMA/mana: Remove user triggerable WARN_ON() in mana_ib_create_qp_rss()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 159f2efabc89d3f931d38f2d35876535d4abf0a3 upstream.
+
+Sashiko points out that the user can specify WQs sharing the same CQ as a
+part of the uAPI and this will trigger the WARN_ON() then go on to corrupt
+the kernel.
+
+Just reject it outright and fail the QP creation.
+
+Cc: stable@vger.kernel.org
+Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP")
+Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1
+Link: https://patch.msgid.link/r/5-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Reviewed-by: Long Li <longli@microsoft.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mana/cq.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/mana/cq.c
++++ b/drivers/infiniband/hw/mana/cq.c
+@@ -144,8 +144,9 @@ int mana_ib_install_cq_cb(struct mana_ib
+
+ if (cq->queue.id >= gc->max_num_cqs)
+ return -EINVAL;
+- /* Create CQ table entry */
+- WARN_ON(gc->cq_table[cq->queue.id]);
++ /* Create CQ table entry, sharing a CQ between WQs is not supported */
++ if (gc->cq_table[cq->queue.id])
++ return -EINVAL;
+ if (cq->queue.kmem)
+ gdma_cq = cq->queue.kmem;
+ else
--- /dev/null
+From 6dd2d4ad9c8429523b1c220c5132bd551c006425 Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:37 -0300
+Subject: RDMA/mana: Validate rx_hash_key_len
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 6dd2d4ad9c8429523b1c220c5132bd551c006425 upstream.
+
+Sashiko points out that rx_hash_key_len comes from a uAPI structure and is
+blindly passed to memcpy, allowing the userspace to trash kernel
+memory. Bounds check it so the memcpy cannot overflow.
+
+Cc: stable@vger.kernel.org
+Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter")
+Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1
+Link: https://patch.msgid.link/r/4-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Reviewed-by: Long Li <longli@microsoft.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mana/qp.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/infiniband/hw/mana/qp.c
++++ b/drivers/infiniband/hw/mana/qp.c
+@@ -21,6 +21,9 @@ static int mana_ib_cfg_vport_steering(st
+
+ gc = mdev_to_gc(dev);
+
++ if (rx_hash_key_len > sizeof(req->hashkey))
++ return -EINVAL;
++
+ req_buf_size = struct_size(req, indir_tab, MANA_INDIRECT_TABLE_DEF_SIZE);
+ req = kzalloc(req_buf_size, GFP_KERNEL);
+ if (!req)
--- /dev/null
+From c9341307ea16b9395c2e4c9c94d8499d91fe31d0 Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:45 -0300
+Subject: RDMA/mlx4: Fix mis-use of RCU in mlx4_srq_event()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit c9341307ea16b9395c2e4c9c94d8499d91fe31d0 upstream.
+
+Sashiko points out the radix_tree itself is RCU safe, but nothing ever
+frees the mlx4_srq struct with RCU, and it isn't even accessed within the
+RCU critical section. It also will crash if an event is delivered before
+the srq object is finished initializing.
+
+Use the spinlock since it isn't easy to make RCU work, use
+refcount_inc_not_zero() to protect against partially initialized objects,
+and order the refcount_set() to be after the srq is fully initialized.
+
+Cc: stable@vger.kernel.org
+Fixes: 30353bfc43a1 ("net/mlx4_core: Use RCU to perform radix tree lookup for SRQ")
+Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=5
+Link: https://patch.msgid.link/r/12-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx4/srq.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx4/srq.c
++++ b/drivers/net/ethernet/mellanox/mlx4/srq.c
+@@ -44,13 +44,14 @@ void mlx4_srq_event(struct mlx4_dev *dev
+ {
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+ struct mlx4_srq *srq;
++ unsigned long flags;
+
+- rcu_read_lock();
++ spin_lock_irqsave(&srq_table->lock, flags);
+ srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1));
+- rcu_read_unlock();
+- if (srq)
+- refcount_inc(&srq->refcount);
+- else {
++ if (!srq || !refcount_inc_not_zero(&srq->refcount))
++ srq = NULL;
++ spin_unlock_irqrestore(&srq_table->lock, flags);
++ if (!srq) {
+ mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+ return;
+ }
+@@ -203,8 +204,8 @@ int mlx4_srq_alloc(struct mlx4_dev *dev,
+ if (err)
+ goto err_radix;
+
+- refcount_set(&srq->refcount, 1);
+ init_completion(&srq->free);
++ refcount_set_release(&srq->refcount, 1);
+
+ return 0;
+
--- /dev/null
+From c54c7e4cb679c0aaa1cb489b9c3f2cd98e63a44c Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:44 -0300
+Subject: RDMA/mlx4: Fix resource leak on error in mlx4_ib_create_srq()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit c54c7e4cb679c0aaa1cb489b9c3f2cd98e63a44c upstream.
+
+Sashiko points out that mlx4_srq_alloc() was not undone during error
+unwind, add the missing call to mlx4_srq_free().
+
+Cc: stable@vger.kernel.org
+Fixes: 225c7b1feef1 ("IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters")
+Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=8
+Link: https://patch.msgid.link/r/11-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mlx4/srq.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mlx4/srq.c
++++ b/drivers/infiniband/hw/mlx4/srq.c
+@@ -193,13 +193,15 @@ int mlx4_ib_create_srq(struct ib_srq *ib
+ if (udata)
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+ err = -EFAULT;
+- goto err_wrid;
++ goto err_srq;
+ }
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return 0;
+
++err_srq:
++ mlx4_srq_free(dev->dev, &srq->msrq);
+ err_wrid:
+ if (udata)
+ mlx4_ib_db_unmap_user(ucontext, &srq->db);
--- /dev/null
+From c488df06bd552bb8b6e14fa0cfd5ad986c6e9525 Mon Sep 17 00:00:00 2001
+From: Junrui Luo <moonafterrain@outlook.com>
+Date: Fri, 24 Apr 2026 13:51:02 +0800
+Subject: RDMA/mlx5: Fix error path fall-through in mlx5_ib_dev_res_srq_init()
+
+From: Junrui Luo <moonafterrain@outlook.com>
+
+commit c488df06bd552bb8b6e14fa0cfd5ad986c6e9525 upstream.
+
+mlx5_ib_dev_res_srq_init() allocates two SRQs, s0 and s1. When
+ib_create_srq() fails for s1, the error branch destroys s0 but falls
+through and unconditionally assigns the freed s0 and the ERR_PTR s1 to
+devr->s0 and devr->s1.
+
+This leads to several problems: the lock-free fast path checks
+"if (devr->s1) return 0;" and treats the ERR_PTR as already initialised;
+users in mlx5_ib_create_qp() dereference the freed SRQ or ERR_PTR via
+to_msrq(devr->s0)->msrq.srqn; and mlx5_ib_dev_res_cleanup() dereferences
+the ERR_PTR and double-frees s0 on teardown.
+
+Fix by adding the same `goto unlock` in the s1 failure path.
+
+Cc: stable@vger.kernel.org
+Fixes: 5895e70f2e6e ("IB/mlx5: Allocate resources just before first QP/SRQ is created")
+Link: https://patch.msgid.link/r/SYBPR01MB7881E1E0970268BD69C0BA75AF2B2@SYBPR01MB7881.ausprd01.prod.outlook.com
+Reported-by: Yuhao Jiang <danisjiang@gmail.com>
+Signed-off-by: Junrui Luo <moonafterrain@outlook.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/mlx5/main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/infiniband/hw/mlx5/main.c
++++ b/drivers/infiniband/hw/mlx5/main.c
+@@ -3181,6 +3181,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5
+ "Couldn't create SRQ 1 for res init, err=%pe\n",
+ s1);
+ ib_destroy_srq(s0);
++ goto unlock;
+ }
+
+ devr->s0 = s0;
--- /dev/null
+From 34fbf48cf3b410d2a6e8c586fa952a36331ca5ba Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:42 -0300
+Subject: RDMA/ocrdma: Don't NULL deref uctx on errors in ocrdma_copy_pd_uresp()
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit 34fbf48cf3b410d2a6e8c586fa952a36331ca5ba upstream.
+
+Sashiko points out that pd->uctx isn't initialized until late in the
+function so all these error flow references are NULL and will crash. Use
+the uctx that isn't NULL.
+
+Cc: stable@vger.kernel.org
+Fixes: fe2caefcdf58 ("RDMA/ocrdma: Add driver for Emulex OneConnect IBoE RDMA adapter")
+Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4
+Link: https://patch.msgid.link/r/9-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
++++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+@@ -620,9 +620,9 @@ static int ocrdma_copy_pd_uresp(struct o
+
+ ucopy_err:
+ if (pd->dpp_enabled)
+- ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE);
++ ocrdma_del_mmap(uctx, dpp_page_addr, PAGE_SIZE);
+ dpp_map_err:
+- ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size);
++ ocrdma_del_mmap(uctx, db_page_addr, db_page_size);
+ return status;
+ }
+
--- /dev/null
+From 1114c87aa6f195cf07da55a27b2122ae26557b26 Mon Sep 17 00:00:00 2001
+From: Michael Bommarito <michael.bommarito@gmail.com>
+Date: Sat, 18 Apr 2026 12:21:41 -0400
+Subject: RDMA/rxe: Reject non-8-byte ATOMIC_WRITE payloads
+
+From: Michael Bommarito <michael.bommarito@gmail.com>
+
+commit 1114c87aa6f195cf07da55a27b2122ae26557b26 upstream.
+
+atomic_write_reply() at drivers/infiniband/sw/rxe/rxe_resp.c
+unconditionally dereferences 8 bytes at payload_addr(pkt):
+
+ value = *(u64 *)payload_addr(pkt);
+
+check_rkey() previously accepted an ATOMIC_WRITE request with pktlen ==
+resid == 0 because the length validation only compared pktlen against
+resid. A remote initiator that sets the RETH length to 0 therefore reaches
+atomic_write_reply() with a zero-byte logical payload, and the responder
+reads sizeof(u64) bytes from past the logical end of the packet into
+skb->head tailroom, then writes those 8 bytes into the attacker's MR via
+rxe_mr_do_atomic_write(). That is a remote disclosure of 4 bytes of kernel
+tailroom per probe (the other 4 bytes are the packet's own trailing ICRC).
+
+IBA oA19-28 defines ATOMIC_WRITE as exactly 8 bytes. Anything else is
+protocol-invalid. Hoist a strict length check into check_rkey() so the
+responder never reaches the unchecked dereference, and keep the existing
+WRITE-family length logic for the normal RDMA WRITE path.
+
+Reproduced on mainline with an unmodified rxe driver: a sustained
+zero-length ATOMIC_WRITE probe repeatedly leaks adjacent skb head-buffer
+bytes into the attacker's MR, including recognisable kernel strings and
+partial kernel-direct-map pointer words. With this patch applied the
+responder rejects the PDU and the MR stays all-zero.
+
+Cc: stable@vger.kernel.org
+Fixes: 034e285f8b99 ("RDMA/rxe: Make responder support atomic write on RC service")
+Link: https://patch.msgid.link/r/20260418162141.3610201-1-michael.bommarito@gmail.com
+Assisted-by: Claude:claude-opus-4-7
+Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/sw/rxe/rxe_resp.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/sw/rxe/rxe_resp.c
++++ b/drivers/infiniband/sw/rxe/rxe_resp.c
+@@ -526,7 +526,19 @@ static enum resp_states check_rkey(struc
+ }
+
+ skip_check_range:
+- if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
++ if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
++ /* IBA oA19-28: ATOMIC_WRITE payload is exactly 8 bytes.
++ * Reject any other length before the responder reads
++ * sizeof(u64) bytes from payload_addr(pkt); a shorter
++ * payload would read past the logical end of the packet
++ * into skb->head tailroom.
++ */
++ if (resid != sizeof(u64) || pktlen != sizeof(u64) ||
++ bth_pad(pkt)) {
++ state = RESPST_ERR_LENGTH;
++ goto err;
++ }
++ } else if (pkt->mask & RXE_WRITE_MASK) {
+ if (resid > mtu) {
+ if (pktlen != mtu || bth_pad(pkt)) {
+ state = RESPST_ERR_LENGTH;
--- /dev/null
+From 4c6f86d85d03cdb33addce86aa69aa795ca6c47a Mon Sep 17 00:00:00 2001
+From: Michael Bommarito <michael.bommarito@gmail.com>
+Date: Tue, 14 Apr 2026 07:15:55 -0400
+Subject: RDMA/rxe: Reject unknown opcodes before ICRC processing
+
+From: Michael Bommarito <michael.bommarito@gmail.com>
+
+commit 4c6f86d85d03cdb33addce86aa69aa795ca6c47a upstream.
+
+Even after applying commit 7244491dab34 ("RDMA/rxe: Validate pad and ICRC
+before payload_size() in rxe_rcv"), a single unauthenticated UDP packet
+can still trigger panic. That patch handled payload_size() underflow only
+for valid opcodes with short packets, not for packets carrying an unknown
+opcode. The unknown-opcode OOB read described below predates that commit
+and reaches back to the initial Soft RoCE driver.
+
+The check added there reads
+
+ pkt->paylen < header_size(pkt) + bth_pad(pkt) + RXE_ICRC_SIZE
+
+where header_size(pkt) expands to rxe_opcode[pkt->opcode].length. The
+rxe_opcode[] array has 256 entries but is only populated for defined IB
+opcodes; any other entry (for example opcode 0xff) is zero-initialized, so
+length == 0 and the check degenerates to
+
+ pkt->paylen < 0 + bth_pad(pkt) + RXE_ICRC_SIZE
+
+which does not constrain pkt->paylen enough. rxe_icrc_hdr() then computes
+
+ rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES
+
+which underflows when length == 0 and passes a huge value to rxe_crc32(),
+causing an out-of-bounds read of the skb payload.
+
+Reproduced on v7.0-rc7 with that fix applied, QEMU/KVM with
+CONFIG_RDMA_RXE=y and CONFIG_KASAN=y, after
+
+ rdma link add rxe0 type rxe netdev eth0
+
+A single 48-byte UDP packet to port 4791 with BTH opcode=0xff and
+QPN=IB_MULTICAST_QPN triggers:
+
+ BUG: KASAN: slab-out-of-bounds in crc32_le+0x115/0x170
+ Read of size 1 at addr ...
+ The buggy address is located 0 bytes to the right of
+ allocated 704-byte region
+ Call Trace:
+ crc32_le+0x115/0x170
+ rxe_icrc_hdr.isra.0+0x226/0x300
+ rxe_icrc_check+0x13f/0x3a0
+ rxe_rcv+0x6e1/0x16e0
+ rxe_udp_encap_recv+0x20a/0x320
+ udp_queue_rcv_one_skb+0x7ed/0x12c0
+
+Subsequent packets with the same shape fault on unmapped memory and panic
+the kernel. The trigger requires only module load and "rdma link add"; no
+QP, no connection, and no authentication.
+
+Fix this by rejecting packets whose opcode has no rxe_opcode[] entry,
+detected via the zero mask or zero length, before any length arithmetic
+runs.
+
+Cc: stable@vger.kernel.org
+Fixes: 8700e3e7c485 ("Soft RoCE driver")
+Link: https://patch.msgid.link/r/20260414111555.3386793-1-michael.bommarito@gmail.com
+Assisted-by: Claude:claude-opus-4-6
+Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
+Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/sw/rxe/rxe_recv.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/drivers/infiniband/sw/rxe/rxe_recv.c
++++ b/drivers/infiniband/sw/rxe/rxe_recv.c
+@@ -330,6 +330,17 @@ void rxe_rcv(struct sk_buff *skb)
+ pkt->qp = NULL;
+ pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
++ /*
++ * Unknown opcodes have a zero-initialized rxe_opcode[] entry, so
++ * both mask and length are 0. Reject them before any length math:
++ * rxe_icrc_hdr() would otherwise compute length - RXE_BTH_BYTES
++ * and pass the underflowed value to rxe_crc32(), producing an
++ * out-of-bounds read.
++ */
++ if (unlikely(!rxe_opcode[pkt->opcode].mask ||
++ !rxe_opcode[pkt->opcode].length))
++ goto drop;
++
+ if (unlikely(pkt->paylen < header_size(pkt) + bth_pad(pkt) +
+ RXE_ICRC_SIZE))
+ goto drop;
--- /dev/null
+From e38e86995df27f1f854063dab1f0c6a513db3faf Mon Sep 17 00:00:00 2001
+From: Jason Gunthorpe <jgg@nvidia.com>
+Date: Tue, 28 Apr 2026 13:17:43 -0300
+Subject: RDMA/vmw_pvrdma: Fix double free on pvrdma_alloc_ucontext() error path
+
+From: Jason Gunthorpe <jgg@nvidia.com>
+
+commit e38e86995df27f1f854063dab1f0c6a513db3faf upstream.
+
+Sashiko points out that pvrdma_uar_free() is already called within
+pvrdma_dealloc_ucontext(), so calling it before triggers a double free.
+
+Cc: stable@vger.kernel.org
+Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver")
+Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4
+Link: https://patch.msgid.link/r/10-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
++++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+@@ -322,7 +322,7 @@ int pvrdma_alloc_ucontext(struct ib_ucon
+ uresp.qp_tab_size = vdev->dsr->caps.max_qp;
+ ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (ret) {
+- pvrdma_uar_free(vdev, &context->uar);
++ /* pvrdma_dealloc_ucontext() also frees the UAR */
+ pvrdma_dealloc_ucontext(&context->ibucontext);
+ return -EFAULT;
+ }
--- /dev/null
+From b34c82777a2c0648ee053595f4b290fd5249b093 Mon Sep 17 00:00:00 2001
+From: David Carlier <devnexen@gmail.com>
+Date: Thu, 30 Apr 2026 10:27:47 +0100
+Subject: sched_ext: idle: Recheck prev_cpu after narrowing allowed mask
+
+From: David Carlier <devnexen@gmail.com>
+
+commit b34c82777a2c0648ee053595f4b290fd5249b093 upstream.
+
+scx_select_cpu_dfl() narrows @allowed to @cpus_allowed & @p->cpus_ptr
+when the BPF caller supplies a @cpus_allowed that differs from
+@p->cpus_ptr and @p doesn't have full affinity. However,
+@is_prev_allowed was computed against the original (wider)
+@cpus_allowed, so the prev_cpu fast paths could pick a @prev_cpu that
+is in @cpus_allowed but not in @p->cpus_ptr, violating the intended
+invariant that the returned CPU is always usable by @p. The kernel
+masks this via the SCX_EV_SELECT_CPU_FALLBACK fallback, but the
+behavior contradicts the documented contract.
+
+Move the @is_prev_allowed evaluation past the narrowing block so it
+tests against the final @allowed mask.
+
+Fixes: ee9a4e92799d ("sched_ext: idle: Properly handle invalid prev_cpu during idle selection")
+Cc: stable@vger.kernel.org # v6.16+
+Assisted-by: Claude <noreply@anthropic.com>
+Signed-off-by: David Carlier <devnexen@gmail.com>
+Reviewed-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/ext_idle.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/kernel/sched/ext_idle.c
++++ b/kernel/sched/ext_idle.c
+@@ -460,12 +460,6 @@ s32 scx_select_cpu_dfl(struct task_struc
+ preempt_disable();
+
+ /*
+- * Check whether @prev_cpu is still within the allowed set. If not,
+- * we can still try selecting a nearby CPU.
+- */
+- is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+-
+- /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+@@ -482,6 +476,12 @@ s32 scx_select_cpu_dfl(struct task_struc
+ }
+
+ /*
++ * Check whether @prev_cpu is still within the allowed set. If not,
++ * we can still try selecting a nearby CPU.
++ */
++ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
++
++ /*
+ * This is necessary to protect llc_cpus.
+ */
+ rcu_read_lock();
--- /dev/null
+From 65db7b27b90e2ea8d4966935aa9a50b6a60c31ac Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:58 +0200
+Subject: selftests: mptcp: check output: catch cmd errors
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 65db7b27b90e2ea8d4966935aa9a50b6a60c31ac upstream.
+
+Using '${?}' inside the if-statement to check the returned value from
+the command that was evaluated as part of the if-statement is not
+correct: here, '${?}' will be linked to the previous instruction, not
+the one that is expected here (${cmd}).
+
+Instead, simply mark the error, except if an error is expected. If
+that's the case, 1 can be passed as the 4th argument of this helper.
+Three checks from pm_netlink.sh expect an error.
+
+While at it, improve the error message when the command unexpectedly
+fails or succeeds.
+
+Note that we could expect a specific returned value, but the checks
+currently expecting an error can be used with 'ip mptcp' or 'pm_nl_ctl',
+and these two tools don't return the same error code.
+
+Fixes: 2d0c1d27ea4e ("selftests: mptcp: add mptcp_lib_check_output helper")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-10-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/net/mptcp/mptcp_lib.sh | 16 ++++++++++------
+ tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 ++++++----
+ 2 files changed, 16 insertions(+), 10 deletions(-)
+
+--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
++++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+@@ -430,20 +430,24 @@ mptcp_lib_wait_local_port_listen() {
+ wait_local_port_listen "${@}" "tcp"
+ }
+
++# $1: error file, $2: cmd, $3: expected msg, [$4: expected error]
+ mptcp_lib_check_output() {
+ local err="${1}"
+ local cmd="${2}"
+ local expected="${3}"
++ local exp_error="${4:-0}"
+ local cmd_ret=0
+ local out
+
+- if ! out=$(${cmd} 2>"${err}"); then
+- cmd_ret=${?}
+- fi
++ out=$(${cmd} 2>"${err}") || cmd_ret=1
+
+- if [ ${cmd_ret} -ne 0 ]; then
+- mptcp_lib_pr_fail "command execution '${cmd}' stderr"
+- cat "${err}"
++ if [ "${cmd_ret}" != "${exp_error}" ]; then
++ mptcp_lib_pr_fail "unexpected returned code for '${cmd}', info:"
++ if [ "${exp_error}" = 0 ]; then
++ cat "${err}"
++ else
++ echo "${out}"
++ fi
+ return 2
+ elif [ "${out}" = "${expected}" ]; then
+ return 0
+--- a/tools/testing/selftests/net/mptcp/pm_netlink.sh
++++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
+@@ -122,10 +122,12 @@ check()
+ local cmd="$1"
+ local expected="$2"
+ local msg="$3"
++ local exp_error="$4"
+ local rc=0
+
+ mptcp_lib_print_title "$msg"
+- mptcp_lib_check_output "${err}" "${cmd}" "${expected}" || rc=${?}
++ mptcp_lib_check_output "${err}" "${cmd}" "${expected}" "${exp_error}" ||
++ rc=${?}
+ if [ ${rc} -eq 2 ]; then
+ mptcp_lib_result_fail "${msg} # error ${rc}"
+ ret=${KSFT_FAIL}
+@@ -158,13 +160,13 @@ check "show_endpoints" \
+ "3,10.0.1.3,signal backup")" "dump addrs"
+
+ del_endpoint 2
+-check "get_endpoint 2" "" "simple del addr"
++check "get_endpoint 2" "" "simple del addr" 1
+ check "show_endpoints" \
+ "$(format_endpoints "1,10.0.1.1" \
+ "3,10.0.1.3,signal backup")" "dump addrs after del"
+
+ add_endpoint 10.0.1.3 2>/dev/null
+-check "get_endpoint 4" "" "duplicate addr"
++check "get_endpoint 4" "" "duplicate addr" 1
+
+ add_endpoint 10.0.1.4 flags signal
+ check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment"
+@@ -173,7 +175,7 @@ for i in $(seq 5 9); do
+ add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1
+ done
+ check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit"
+-check "get_endpoint 10" "" "above hard addr limit"
++check "get_endpoint 10" "" "above hard addr limit" 1
+
+ del_endpoint 9
+ for i in $(seq 10 255); do
--- /dev/null
+From 53705ddfa18408f8e1f064331b6387509fa19f7f Mon Sep 17 00:00:00 2001
+From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
+Date: Tue, 5 May 2026 17:00:59 +0200
+Subject: selftests: mptcp: pm: restrict 'unknown' check to pm_nl_ctl
+
+From: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+
+commit 53705ddfa18408f8e1f064331b6387509fa19f7f upstream.
+
+When pm_netlink.sh is executed with '-i', 'ip mptcp' is used instead of
+'pm_nl_ctl'. IPRoute2 doesn't support the 'unknown' flag, which has only
+been added to 'pm_nl_ctl' for this specific check: to ensure that the
+kernel ignores such unsupported flag.
+
+No reason to add this flag to 'ip mptcp'. Then, this check should be
+skipped when 'ip mptcp' is used.
+
+Fixes: 0cef6fcac24d ("selftests: mptcp: ip_mptcp option for more scripts")
+Cc: stable@vger.kernel.org
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-11-fca8091060a4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/tools/testing/selftests/net/mptcp/pm_netlink.sh
++++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
+@@ -194,9 +194,13 @@ check "show_endpoints" \
+ flush_endpoint
+ check "show_endpoints" "" "flush addrs"
+
+-add_endpoint 10.0.1.1 flags unknown
+-check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" "ignore unknown flags"
+-flush_endpoint
++# "unknown" flag is only supported by pm_nl_ctl
++if ! mptcp_lib_is_ip_mptcp; then
++ add_endpoint 10.0.1.1 flags unknown
++ check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" \
++ "ignore unknown flags"
++ flush_endpoint
++fi
+
+ set_limits 9 1 2>/dev/null
+ check "get_limits" "${default_limits}" "rcv addrs above hard limit"
mm-damon-stat-detect-and-use-fresh-enabled-value.patch
mm-damon-sysfs-schemes-protect-memcg_path-kfree-with-damon_sysfs_lock.patch
pci-update-saved_config_space-upon-resource-assignment.patch
+pci-aer-clear-only-error-bits-in-pcie-device-status.patch
+pci-aer-stop-ruling-out-unbound-devices-as-error-source.patch
+pci-aspm-fix-pci_clear_and_set_config_dword-usage.patch
+power-supply-max17042-avoid-overflow-when-determining-health.patch
+powerpc-xive-fix-kmemleak-caused-by-incorrect-chip_data-lookup.patch
+perf-x86-intel-always-reprogram-acr-events-to-prevent-stale-masks.patch
+rdma-ionic-bound-node_desc-sysfs-read-with-.64s.patch
+rdma-ionic-fix-typo-in-format-string.patch
+rdma-mana-fix-error-unwind-in-mana_ib_create_qp_rss.patch
+rdma-mana-fix-mana_destroy_wq_obj-cleanup-in-mana_ib_create_qp_rss.patch
+rdma-mana-remove-user-triggerable-warn_on-in-mana_ib_create_qp_rss.patch
+rdma-mana-validate-rx_hash_key_len.patch
+rdma-mlx4-fix-mis-use-of-rcu-in-mlx4_srq_event.patch
+rdma-mlx4-fix-resource-leak-on-error-in-mlx4_ib_create_srq.patch
+rdma-mlx5-fix-error-path-fall-through-in-mlx5_ib_dev_res_srq_init.patch
+rdma-ocrdma-don-t-null-deref-uctx-on-errors-in-ocrdma_copy_pd_uresp.patch
+rdma-rxe-reject-non-8-byte-atomic_write-payloads.patch
+rdma-rxe-reject-unknown-opcodes-before-icrc-processing.patch
+rdma-vmw_pvrdma-fix-double-free-on-pvrdma_alloc_ucontext-error-path.patch
+sched_ext-idle-recheck-prev_cpu-after-narrowing-allowed-mask.patch
+selftests-mptcp-check-output-catch-cmd-errors.patch
+selftests-mptcp-pm-restrict-unknown-check-to-pm_nl_ctl.patch
+mptcp-fastclose-msk-when-linger-time-is-0.patch
+mptcp-use-mpjoinsynackhmacfailure-for-synack-hmac-failure.patch
+mptcp-use-mptcp_rst_emptcp-for-ack-hmac-validation-failure.patch
+mptcp-sockopt-set-timestamp-flags-on-subflow-socket-not-msk.patch
+mptcp-sockopt-increase-seq-in-mptcp_setsockopt_all_sf.patch
+mptcp-fix-rx-timestamp-corruption-on-fastopen.patch
+mptcp-fix-scheduling-with-atomic-in-timestamp-sockopt.patch
+mptcp-pm-prio-skip-closed-subflows.patch
+mptcp-pm-kernel-correctly-retransmit-add_addr-id-0.patch
+mptcp-pm-add_addr-rtx-allow-id-0.patch
+mptcp-pm-add_addr-rtx-fix-potential-data-race.patch
+mptcp-pm-add_addr-rtx-always-decrease-sk-refcount.patch
+mptcp-pm-add_addr-rtx-free-sk-if-last.patch
+mptcp-pm-add_addr-rtx-resched-blocked-add_addr-quicker.patch
+mptcp-pm-add_addr-rtx-return-early-if-no-retrans.patch
+f2fs-add-read_once-for-i_blocks-in-f2fs_update_inode.patch
+f2fs-fix-fiemap-boundary-handling-when-read-extent-cache-is-incomplete.patch
+f2fs-fix-fsck-inconsistency-caused-by-incorrect-nat_entry-flag-usage.patch
+f2fs-fix-incorrect-file-address-mapping-when-inline-inode-is-unwritten.patch
+f2fs-fix-incorrect-multidevice-info-in-trace_f2fs_map_blocks.patch
+f2fs-fix-node_cnt-race-between-extent-node-destroy-and-writeback.patch
+f2fs-fix-uninitialized-kobject-put-in-f2fs_init_sysfs.patch
+f2fs-refactor-f2fs_move_node_folio-function.patch
+f2fs-fix-inline-data-not-being-written-to-disk-in-writeback-path.patch
+f2fs-fix-fsck-inconsistency-caused-by-fggc-of-node-block.patch
+kvm-arm64-wake-up-from-wfi-when-iqrchip-is-in-userspace.patch
+kvm-arm64-vgic-fix-iidr-revision-field-extracted-from-wrong-value.patch
+kvm-arm64-fix-initialisation-order-in-__pkvm_init_finalise.patch
+kvm-arm64-fix-feat_spe_fne-to-use-pmsidr_el1.fne-not-pmsver.patch
+kvm-arm64-fix-feat_debugv8p9-to-check-debugver-not-pmuver.patch
+kvm-arm64-fix-pin-leak-and-publication-ordering-in-__pkvm_init_vcpu.patch
+loongarch-fix-potential-ade-in-loongson_gpu_fixup_dma_hang.patch
+loongarch-kvm-cap-kvm_cap_nr_vcpus-by-kvm_cap_max_vcpus.patch
+loongarch-kvm-fix-unreliable-stack-for-kvm_exc_entry.patch
+loongarch-kvm-fix-hw-timer-interrupt-lost-when-inject-interrupt-by-software.patch
+loongarch-kvm-move-unconditional-delay-into-timer-clear-scenery.patch
+loongarch-kvm-use-kvm_set_pte-in-kvm_flush_pte.patch
+loongarch-use-per-root-bridge-pcih-flag-to-skip-mem-resource-fixup.patch