From: Greg Kroah-Hartman
Date: Mon, 12 Aug 2024 14:23:25 +0000 (+0200)
Subject: 6.6-stable patches
X-Git-Tag: v6.1.105~44
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5ae7d5e7b4eed7e45772d567c1c70472ce42abfb;p=thirdparty%2Fkernel%2Fstable-queue.git

6.6-stable patches

added patches:
      btrfs-fix-corruption-after-buffer-fault-in-during-direct-io-append-write.patch
      ipv6-fix-source-address-selection-with-route-leak.patch
      mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch
      mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch
      netfilter-nf_tables-prefer-nft_chain_validate.patch
---

diff --git a/queue-6.6/btrfs-fix-corruption-after-buffer-fault-in-during-direct-io-append-write.patch b/queue-6.6/btrfs-fix-corruption-after-buffer-fault-in-during-direct-io-append-write.patch
new file mode 100644
index 00000000000..79c5d162360
--- /dev/null
+++ b/queue-6.6/btrfs-fix-corruption-after-buffer-fault-in-during-direct-io-append-write.patch
@@ -0,0 +1,252 @@
+From 939b656bc8ab203fdbde26ccac22bcb7f0985be5 Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Fri, 26 Jul 2024 11:12:52 +0100
+Subject: btrfs: fix corruption after buffer fault in during direct IO append write
+
+From: Filipe Manana
+
+commit 939b656bc8ab203fdbde26ccac22bcb7f0985be5 upstream.
+
+During an append (O_APPEND write flag) direct IO write if the input buffer
+was not previously faulted in, we can corrupt the file in a way that the
+final size is unexpected and it includes an unexpected hole.
+
+The problem happens like this:
+
+1) We have an empty file, with size 0, for example;
+
+2) We do an O_APPEND direct IO with a length of 4096 bytes and the input
+   buffer is not currently faulted in;
+
+3) We enter btrfs_direct_write(), lock the inode and call
+   generic_write_checks(), which calls generic_write_checks_count(), and
+   that function sets the iocb position to 0 with the following code:
+
+   if (iocb->ki_flags & IOCB_APPEND)
+           iocb->ki_pos = i_size_read(inode);
+
+4) We call btrfs_dio_write() and enter into iomap, which will end up
+   calling btrfs_dio_iomap_begin() and that calls
+   btrfs_get_blocks_direct_write(), where we update the i_size of the
+   inode to 4096 bytes;
+
+5) After btrfs_dio_iomap_begin() returns, iomap will attempt to access
+   the page of the write input buffer (at iomap_dio_bio_iter(), with a
+   call to bio_iov_iter_get_pages()) and fail with -EFAULT, which gets
+   returned to btrfs at btrfs_direct_write() via btrfs_dio_write();
+
+6) At btrfs_direct_write() we get the -EFAULT error, unlock the inode,
+   fault in the write buffer and then goto to the label 'relock';
+
+7) We lock again the inode, do all the necessary checks again and call
+   again generic_write_checks(), which calls generic_write_checks_count()
+   again, and there we set the iocb's position to 4K, which is the current
+   i_size of the inode, with the following code pointed above:
+
+   if (iocb->ki_flags & IOCB_APPEND)
+           iocb->ki_pos = i_size_read(inode);
+
+8) Then we go again to btrfs_dio_write() and enter iomap and the write
+   succeeds, but it wrote to the file range [4K, 8K), leaving a hole in
+   the [0, 4K) range and an i_size of 8K, which goes against the
+   expectations of having the data written to the range [0, 4K) and get an
+   i_size of 4K.
+ +Fix this by not unlocking the inode before faulting in the input buffer, +in case we get -EFAULT or an incomplete write, and not jumping to the +'relock' label after faulting in the buffer - instead jump to a location +immediately before calling iomap, skipping all the write checks and +relocking. This solves this problem and it's fine even in case the input +buffer is memory mapped to the same file range, since only holding the +range locked in the inode's io tree can cause a deadlock, it's safe to +keep the inode lock (VFS lock), as was fixed and described in commit +51bd9563b678 ("btrfs: fix deadlock due to page faults during direct IO +reads and writes"). + +A sample reproducer provided by a reporter is the following: + + $ cat test.c + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + + #include + #include + #include + #include + #include + + int main(int argc, char *argv[]) + { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT | + O_APPEND, 0644); + if (fd < 0) { + perror("creating test file"); + return 1; + } + + char *buf = mmap(NULL, 4096, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ssize_t ret = write(fd, buf, 4096); + if (ret < 0) { + perror("pwritev2"); + return 1; + } + + struct stat stbuf; + ret = fstat(fd, &stbuf); + if (ret < 0) { + perror("stat"); + return 1; + } + + printf("size: %llu\n", (unsigned long long)stbuf.st_size); + return stbuf.st_size == 4096 ? 0 : 1; + } + +A test case for fstests will be sent soon. + +Reported-by: Hanna Czenczek +Link: https://lore.kernel.org/linux-btrfs/0b841d46-12fe-4e64-9abb-871d8d0de271@redhat.com/ +Fixes: 8184620ae212 ("btrfs: fix lost file sync on direct IO write with nowait and dsync iocb") +CC: stable@vger.kernel.org # 6.1+ +Tested-by: Hanna Czenczek +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 1 + + fs/btrfs/file.c | 55 ++++++++++++++++++++++++++++++++++++++++++------------- + 2 files changed, 43 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -445,6 +445,7 @@ struct btrfs_file_private { + void *filldir_buf; + u64 last_index; + struct extent_state *llseek_cached_state; ++ bool fsync_skip_inode_lock; + }; + + static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1535,21 +1535,37 @@ relock: + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ ++again: + from->nofault = true; + dio = btrfs_dio_write(iocb, from, written); + from->nofault = false; + +- /* +- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync +- * iocb, and that needs to lock the inode. So unlock it before calling +- * iomap_dio_complete() to avoid a deadlock. 
+- */ +- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); +- +- if (IS_ERR_OR_NULL(dio)) ++ if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); +- else ++ } else { ++ struct btrfs_file_private stack_private = { 0 }; ++ struct btrfs_file_private *private; ++ const bool have_private = (file->private_data != NULL); ++ ++ if (!have_private) ++ file->private_data = &stack_private; ++ ++ /* ++ * If we have a synchoronous write, we must make sure the fsync ++ * triggered by the iomap_dio_complete() call below doesn't ++ * deadlock on the inode lock - we are already holding it and we ++ * can't call it after unlocking because we may need to complete ++ * partial writes due to the input buffer (or parts of it) not ++ * being already faulted in. ++ */ ++ private = file->private_data; ++ private->fsync_skip_inode_lock = true; + err = iomap_dio_complete(dio); ++ private->fsync_skip_inode_lock = false; ++ ++ if (!have_private) ++ file->private_data = NULL; ++ } + + /* No increment (+=) because iomap returns a cumulative value. */ + if (err > 0) +@@ -1576,10 +1592,12 @@ relock: + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; +- goto relock; ++ goto again; + } + } + ++ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ++ + /* + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. +@@ -1778,6 +1796,7 @@ static inline bool skip_inode_logging(co + */ + int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + { ++ struct btrfs_file_private *private = file->private_data; + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -1787,6 +1806,7 @@ int btrfs_sync_file(struct file *file, l + int ret = 0, err; + u64 len; + bool full_sync; ++ const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); + + trace_btrfs_sync_file(file, datasync); + +@@ -1814,7 +1834,10 @@ int btrfs_sync_file(struct file *file, l + if (ret) + goto out; + +- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ++ if (skip_ilock) ++ down_write(&BTRFS_I(inode)->i_mmap_lock); ++ else ++ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + atomic_inc(&root->log_batch); + +@@ -1838,7 +1861,10 @@ int btrfs_sync_file(struct file *file, l + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { +- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ++ if (skip_ilock) ++ up_write(&BTRFS_I(inode)->i_mmap_lock); ++ else ++ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + goto out; + } + +@@ -1941,7 +1967,10 @@ int btrfs_sync_file(struct file *file, l + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. 
+ */ +- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ++ if (skip_ilock) ++ up_write(&BTRFS_I(inode)->i_mmap_lock); ++ else ++ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); diff --git a/queue-6.6/ipv6-fix-source-address-selection-with-route-leak.patch b/queue-6.6/ipv6-fix-source-address-selection-with-route-leak.patch new file mode 100644 index 00000000000..5d113ba3fa9 --- /dev/null +++ b/queue-6.6/ipv6-fix-source-address-selection-with-route-leak.patch @@ -0,0 +1,85 @@ +From 252442f2ae317d109ef0b4b39ce0608c09563042 Mon Sep 17 00:00:00 2001 +From: Nicolas Dichtel +Date: Wed, 10 Jul 2024 10:14:28 +0200 +Subject: ipv6: fix source address selection with route leak + +From: Nicolas Dichtel + +commit 252442f2ae317d109ef0b4b39ce0608c09563042 upstream. + +By default, an address assigned to the output interface is selected when +the source address is not specified. This is problematic when a route, +configured in a vrf, uses an interface from another vrf (aka route leak). +The original vrf does not own the selected source address. + +Let's add a check against the output interface and call the appropriate +function to select the source address. + +CC: stable@vger.kernel.org +Fixes: 0d240e7811c4 ("net: vrf: Implement get_saddr for IPv6") +Signed-off-by: Nicolas Dichtel +Link: https://patch.msgid.link/20240710081521.3809742-3-nicolas.dichtel@6wind.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ip6_route.h | 20 ++++++++++++++------ + net/ipv6/ip6_output.c | 1 + + net/ipv6/route.c | 2 +- + 3 files changed, 16 insertions(+), 7 deletions(-) + +--- a/include/net/ip6_route.h ++++ b/include/net/ip6_route.h +@@ -128,18 +128,26 @@ void rt6_age_exceptions(struct fib6_info + + static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, + const struct in6_addr *daddr, +- unsigned int prefs, ++ unsigned int prefs, int l3mdev_index, + struct in6_addr *saddr) + { ++ struct net_device *l3mdev; ++ struct net_device *dev; ++ bool same_vrf; + int err = 0; + +- if (f6i && f6i->fib6_prefsrc.plen) { ++ rcu_read_lock(); ++ ++ l3mdev = dev_get_by_index_rcu(net, l3mdev_index); ++ if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) ++ dev = f6i ? fib6_info_nh_dev(f6i) : NULL; ++ same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; ++ if (f6i && f6i->fib6_prefsrc.plen && same_vrf) + *saddr = f6i->fib6_prefsrc.addr; +- } else { +- struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; ++ else ++ err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr); + +- err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); +- } ++ rcu_read_unlock(); + + return err; + } +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1122,6 +1122,7 @@ static int ip6_dst_lookup_tail(struct ne + from = rt ? rcu_dereference(rt->from) : NULL; + err = ip6_route_get_saddr(net, from, &fl6->daddr, + sk ? 
inet6_sk(sk)->srcprefs : 0, ++ fl6->flowi6_l3mdev, + &fl6->saddr); + rcu_read_unlock(); + +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -5678,7 +5678,7 @@ static int rt6_fill_node(struct net *net + goto nla_put_failure; + } else if (dest) { + struct in6_addr saddr_buf; +- if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && ++ if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && + nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) + goto nla_put_failure; + } diff --git a/queue-6.6/mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch b/queue-6.6/mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch new file mode 100644 index 00000000000..df594ecafd5 --- /dev/null +++ b/queue-6.6/mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch @@ -0,0 +1,56 @@ +From 4ef9ad19e17676b9ef071309bc62020e2373705d Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Thu, 18 Jan 2024 10:05:05 -0800 +Subject: mm: huge_memory: don't force huge page alignment on 32 bit + +From: Yang Shi + +commit 4ef9ad19e17676b9ef071309bc62020e2373705d upstream. + +commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP +boundaries") caused two issues [1] [2] reported on 32 bit system or compat +userspace. + +It doesn't make too much sense to force huge page alignment on 32 bit +system due to the constrained virtual address space. + +[1] https://lore.kernel.org/linux-mm/d0a136a0-4a31-46bc-adf4-2db109a61672@kernel.org/ +[2] https://lore.kernel.org/linux-mm/CAJuCfpHXLdQy1a2B6xN2d7quTYwg2OoZseYPZTRpU0eHHKD-sQ@mail.gmail.com/ + +Link: https://lkml.kernel.org/r/20240118180505.2914778-1-shy828301@gmail.com +Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") +Signed-off-by: Yang Shi +Reported-by: Jiri Slaby +Reported-by: Suren Baghdasaryan +Tested-by: Jiri Slaby +Tested-by: Suren Baghdasaryan +Reviewed-by: Matthew Wilcox (Oracle) +Cc: Rik van Riel +Cc: Christopher Lameter +Signed-off-by: Andrew Morton +Cc: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -601,6 +602,9 @@ static unsigned long __thp_get_unmapped_ + loff_t off_align = round_up(off, size); + unsigned long len_pad, ret; + ++ if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) ++ return 0; ++ + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + diff --git a/queue-6.6/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch b/queue-6.6/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch new file mode 100644 index 00000000000..7bb3dcbd6b8 --- /dev/null +++ b/queue-6.6/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch @@ -0,0 +1,49 @@ +From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Fri, 12 Jul 2024 08:58:55 -0700 +Subject: mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit machines + +From: Yang Shi + +commit d9592025000b3cf26c742f3505da7b83aedc26d5 upstream. + +Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't +force huge page alignment on 32 bit") didn't work for x86_32 [1]. It is +because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT. + +!CONFIG_64BIT should cover all 32 bit machines. 
+ +[1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/ + +Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com +Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") +Signed-off-by: Yang Shi +Reported-by: Yves-Alexis Perez +Tested-by: Yves-Alexis Perez +Acked-by: David Hildenbrand +Cc: Ben Hutchings +Cc: Christoph Lameter +Cc: Jiri Slaby +Cc: Matthew Wilcox (Oracle) +Cc: Rik van Riel +Cc: Salvatore Bonaccorso +Cc: Suren Baghdasaryan +Cc: [6.8+] +Signed-off-by: Andrew Morton +Cc: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -602,7 +602,7 @@ static unsigned long __thp_get_unmapped_ + loff_t off_align = round_up(off, size); + unsigned long len_pad, ret; + +- if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) ++ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) + return 0; + + if (off_end <= off_align || (off_end - off_align) < size) diff --git a/queue-6.6/netfilter-nf_tables-prefer-nft_chain_validate.patch b/queue-6.6/netfilter-nf_tables-prefer-nft_chain_validate.patch new file mode 100644 index 00000000000..bd69f59f864 --- /dev/null +++ b/queue-6.6/netfilter-nf_tables-prefer-nft_chain_validate.patch @@ -0,0 +1,228 @@ +From cff3bd012a9512ac5ed858d38e6ed65f6391008c Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Thu, 11 Jul 2024 11:06:39 +0200 +Subject: netfilter: nf_tables: prefer nft_chain_validate + +From: Florian Westphal + +commit cff3bd012a9512ac5ed858d38e6ed65f6391008c upstream. + +nft_chain_validate already performs loop detection because a cycle will +result in a call stack overflow (ctx->level >= NFT_JUMP_STACK_SIZE). + +It also follows maps via ->validate callback in nft_lookup, so there +appears no reason to iterate the maps again. + +nf_tables_check_loops() and all its helper functions can be removed. +This improves ruleset load time significantly, from 23s down to 12s. + +This also fixes a crash bug. Old loop detection code can result in +unbounded recursion: + +BUG: TASK stack guard page was hit at .... +Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN +CPU: 4 PID: 1539 Comm: nft Not tainted 6.10.0-rc5+ #1 +[..] + +with a suitable ruleset during validation of register stores. + +I can't see any actual reason to attempt to check for this from +nft_validate_register_store(), at this point the transaction is still in +progress, so we don't have a full picture of the rule graph. + +For nf-next it might make sense to either remove it or make this depend +on table->validate_state in case we could catch an error earlier +(for improved error reporting to userspace). 
+ +Fixes: 20a69341f2d0 ("netfilter: nf_tables: add netlink set API") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/nf_tables_api.c | 154 +++--------------------------------------- + 1 file changed, 13 insertions(+), 141 deletions(-) + +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -3743,6 +3743,15 @@ static void nf_tables_rule_release(const + nf_tables_rule_destroy(ctx, rule); + } + ++/** nft_chain_validate - loop detection and hook validation ++ * ++ * @ctx: context containing call depth and base chain ++ * @chain: chain to validate ++ * ++ * Walk through the rules of the given chain and chase all jumps/gotos ++ * and set lookups until either the jump limit is hit or all reachable ++ * chains have been validated. ++ */ + int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) + { + struct nft_expr *expr, *last; +@@ -3764,6 +3773,9 @@ int nft_chain_validate(const struct nft_ + if (!expr->ops->validate) + continue; + ++ /* This may call nft_chain_validate() recursively, ++ * callers that do so must increment ctx->level. ++ */ + err = expr->ops->validate(ctx, expr, &data); + if (err < 0) + return err; +@@ -10621,146 +10633,6 @@ int nft_chain_validate_hooks(const struc + } + EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); + +-/* +- * Loop detection - walk through the ruleset beginning at the destination chain +- * of a new jump until either the source chain is reached (loop) or all +- * reachable chains have been traversed. +- * +- * The loop check is performed whenever a new jump verdict is added to an +- * expression or verdict map or a verdict map is bound to a new chain. +- */ +- +-static int nf_tables_check_loops(const struct nft_ctx *ctx, +- const struct nft_chain *chain); +- +-static int nft_check_loops(const struct nft_ctx *ctx, +- const struct nft_set_ext *ext) +-{ +- const struct nft_data *data; +- int ret; +- +- data = nft_set_ext_data(ext); +- switch (data->verdict.code) { +- case NFT_JUMP: +- case NFT_GOTO: +- ret = nf_tables_check_loops(ctx, data->verdict.chain); +- break; +- default: +- ret = 0; +- break; +- } +- +- return ret; +-} +- +-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, +- struct nft_set *set, +- const struct nft_set_iter *iter, +- struct nft_set_elem *elem) +-{ +- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); +- +- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && +- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) +- return 0; +- +- return nft_check_loops(ctx, ext); +-} +- +-static int nft_set_catchall_loops(const struct nft_ctx *ctx, +- struct nft_set *set) +-{ +- u8 genmask = nft_genmask_next(ctx->net); +- struct nft_set_elem_catchall *catchall; +- struct nft_set_ext *ext; +- int ret = 0; +- +- list_for_each_entry_rcu(catchall, &set->catchall_list, list) { +- ext = nft_set_elem_ext(set, catchall->elem); +- if (!nft_set_elem_active(ext, genmask)) +- continue; +- +- ret = nft_check_loops(ctx, ext); +- if (ret < 0) +- return ret; +- } +- +- return ret; +-} +- +-static int nf_tables_check_loops(const struct nft_ctx *ctx, +- const struct nft_chain *chain) +-{ +- const struct nft_rule *rule; +- const struct nft_expr *expr, *last; +- struct nft_set *set; +- struct nft_set_binding *binding; +- struct nft_set_iter iter; +- +- if (ctx->chain == chain) +- return -ELOOP; +- +- if (fatal_signal_pending(current)) +- return -EINTR; +- +- list_for_each_entry(rule, &chain->rules, list) { +- 
nft_rule_for_each_expr(expr, last, rule) { +- struct nft_immediate_expr *priv; +- const struct nft_data *data; +- int err; +- +- if (strcmp(expr->ops->type->name, "immediate")) +- continue; +- +- priv = nft_expr_priv(expr); +- if (priv->dreg != NFT_REG_VERDICT) +- continue; +- +- data = &priv->data; +- switch (data->verdict.code) { +- case NFT_JUMP: +- case NFT_GOTO: +- err = nf_tables_check_loops(ctx, +- data->verdict.chain); +- if (err < 0) +- return err; +- break; +- default: +- break; +- } +- } +- } +- +- list_for_each_entry(set, &ctx->table->sets, list) { +- if (!nft_is_active_next(ctx->net, set)) +- continue; +- if (!(set->flags & NFT_SET_MAP) || +- set->dtype != NFT_DATA_VERDICT) +- continue; +- +- list_for_each_entry(binding, &set->bindings, list) { +- if (!(binding->flags & NFT_SET_MAP) || +- binding->chain != chain) +- continue; +- +- iter.genmask = nft_genmask_next(ctx->net); +- iter.skip = 0; +- iter.count = 0; +- iter.err = 0; +- iter.fn = nf_tables_loop_check_setelem; +- +- set->ops->walk(ctx, set, &iter); +- if (!iter.err) +- iter.err = nft_set_catchall_loops(ctx, set); +- +- if (iter.err < 0) +- return iter.err; +- } +- } +- +- return 0; +-} +- + /** + * nft_parse_u32_check - fetch u32 attribute and check for maximum value + * +@@ -10873,7 +10745,7 @@ static int nft_validate_register_store(c + if (data != NULL && + (data->verdict.code == NFT_GOTO || + data->verdict.code == NFT_JUMP)) { +- err = nf_tables_check_loops(ctx, data->verdict.chain); ++ err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + } diff --git a/queue-6.6/series b/queue-6.6/series index d56ec640b69..fc2cb4932c2 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -170,3 +170,8 @@ selftests-mptcp-fix-error-path.patch mptcp-pm-deny-endp-with-signal-subflow-port.patch block-use-the-right-type-for-stub-rq_integrity_vec.patch revert-drm-amd-display-add-null-check-for-afb-before-dereferencing-in-amdgpu_dm_plane_handle_cursor_update.patch +mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch +mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch +btrfs-fix-corruption-after-buffer-fault-in-during-direct-io-append-write.patch +netfilter-nf_tables-prefer-nft_chain_validate.patch +ipv6-fix-source-address-selection-with-route-leak.patch