--- /dev/null
+From 902fa55813207ab583e8d7e15f0069b1f2c5cfe2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Feb 2025 13:58:36 +0000
+Subject: arp: use RCU protection in arp_xmit()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a42b69f692165ec39db42d595f4f65a4c8f42e44 ]
+
+arp_xmit() can be called without RTNL or RCU protection.
+
+Use RCU protection to avoid potential UAF.
+
+Fixes: 29a26a568038 ("netfilter: Pass struct net into the netfilter hooks")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250207135841.1948589-5-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/arp.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
+index 8ae9bd6f91c19..6879e0b70c769 100644
+--- a/net/ipv4/arp.c
++++ b/net/ipv4/arp.c
+@@ -637,10 +637,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb
+ */
+ void arp_xmit(struct sk_buff *skb)
+ {
++ rcu_read_lock();
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+- dev_net(skb->dev), NULL, skb, NULL, skb->dev,
++ dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev,
+ arp_xmit_finish);
++ rcu_read_unlock();
+ }
+ EXPORT_SYMBOL(arp_xmit);
+
+--
+2.39.5
+
--- /dev/null
+From 2101ff45f34fa447a21bf9d0da8226b68008017c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 10 Feb 2022 14:49:07 -0800
+Subject: clocksource: Replace cpumask_weight() with cpumask_empty()
+
+From: Yury Norov <yury.norov@gmail.com>
+
+[ Upstream commit 8afbcaf8690dac19ebf570a4e4fef9c59c75bf8e ]
+
+clocksource_verify_percpu() calls cpumask_weight() to check if any bit of a
+given cpumask is set.
+
+This can be done more efficiently with cpumask_empty() because
+cpumask_empty() stops traversing the cpumask as soon as it finds the first
+set bit, while cpumask_weight() counts all bits unconditionally.
+
+Signed-off-by: Yury Norov <yury.norov@gmail.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/r/20220210224933.379149-24-yury.norov@gmail.com
+Stable-dep-of: 6bb05a33337b ("clocksource: Use migrate_disable() to avoid calling get_random_u32() in atomic context")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/clocksource.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
+index 5aa8eec89e781..ee7e8d0dc182f 100644
+--- a/kernel/time/clocksource.c
++++ b/kernel/time/clocksource.c
+@@ -344,7 +344,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
+ cpus_read_lock();
+ preempt_disable();
+ clocksource_verify_choose_cpus();
+- if (cpumask_weight(&cpus_chosen) == 0) {
++ if (cpumask_empty(&cpus_chosen)) {
+ preempt_enable();
+ cpus_read_unlock();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+--
+2.39.5
+
--- /dev/null
+From db03669347973a66724bdbdb90357f70b0c66213 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 31 Jan 2025 12:33:23 -0500
+Subject: clocksource: Use migrate_disable() to avoid calling get_random_u32()
+ in atomic context
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 6bb05a33337b2c842373857b63de5c9bf1ae2a09 ]
+
+The following bug report happened with a PREEMPT_RT kernel:
+
+ BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
+ in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2012, name: kwatchdog
+ preempt_count: 1, expected: 0
+ RCU nest depth: 0, expected: 0
+ get_random_u32+0x4f/0x110
+ clocksource_verify_choose_cpus+0xab/0x1a0
+ clocksource_verify_percpu.part.0+0x6b/0x330
+ clocksource_watchdog_kthread+0x193/0x1a0
+
+It is due to the fact that clocksource_verify_choose_cpus() is invoked with
+preemption disabled. This function invokes get_random_u32() to obtain
+random numbers for choosing CPUs. The batched_entropy_32 local lock and/or
+the base_crng.lock spinlock in drivers/char/random.c will be acquired during
+the call. In a PREEMPT_RT kernel, they are both sleeping locks and so cannot
+be acquired in atomic context.
+
+Fix this problem by using migrate_disable() to allow smp_processor_id() to
+be reliably used without introducing atomic context. preempt_disable() is
+then called after clocksource_verify_choose_cpus() but before the
+clocksource measurement is run to avoid introducing unexpected
+latency.
+
+Fixes: 7560c02bdffb ("clocksource: Check per-CPU clock synchronization when marked unstable")
+Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
+Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Link: https://lore.kernel.org/all/20250131173323.891943-2-longman@redhat.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/clocksource.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
+index 8d9b11555f7ef..32efc87c41f20 100644
+--- a/kernel/time/clocksource.c
++++ b/kernel/time/clocksource.c
+@@ -342,10 +342,10 @@ void clocksource_verify_percpu(struct clocksource *cs)
+ cpumask_clear(&cpus_ahead);
+ cpumask_clear(&cpus_behind);
+ cpus_read_lock();
+- preempt_disable();
++ migrate_disable();
+ clocksource_verify_choose_cpus();
+ if (cpumask_empty(&cpus_chosen)) {
+- preempt_enable();
++ migrate_enable();
+ cpus_read_unlock();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+ return;
+@@ -353,6 +353,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
+ testcpu = smp_processor_id();
+ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
++ preempt_disable();
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+@@ -372,6 +373,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
+ cs_nsec_min = cs_nsec;
+ }
+ preempt_enable();
++ migrate_enable();
+ cpus_read_unlock();
+ if (!cpumask_empty(&cpus_ahead))
+ pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+--
+2.39.5
+
--- /dev/null
+From 15afb00ecb1cca0aa44bd46d983692d14de39591 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 24 Jan 2025 20:54:41 -0500
+Subject: clocksource: Use pr_info() for "Checking clocksource synchronization"
+ message
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit 1f566840a82982141f94086061927a90e79440e5 ]
+
+The "Checking clocksource synchronization" message is normally printed
+when clocksource_verify_percpu() is called for a given clocksource if
+both the CLOCK_SOURCE_UNSTABLE and CLOCK_SOURCE_VERIFY_PERCPU flags
+are set.
+
+It is an informational message and so pr_info() is the correct choice.
+
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
+Acked-by: John Stultz <jstultz@google.com>
+Link: https://lore.kernel.org/all/20250125015442.3740588-1-longman@redhat.com
+Stable-dep-of: 6bb05a33337b ("clocksource: Use migrate_disable() to avoid calling get_random_u32() in atomic context")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/clocksource.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
+index ee7e8d0dc182f..8d9b11555f7ef 100644
+--- a/kernel/time/clocksource.c
++++ b/kernel/time/clocksource.c
+@@ -351,7 +351,8 @@ void clocksource_verify_percpu(struct clocksource *cs)
+ return;
+ }
+ testcpu = smp_processor_id();
+- pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
++ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
++ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+--
+2.39.5
+
--- /dev/null
+From d09b054acd19d10f22c938657607112195fec7a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:10 +0000
+Subject: ipv4: add RCU protection to ip4_dst_hoplimit()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 469308552ca4560176cfc100e7ca84add1bebd7c ]
+
+ip4_dst_hoplimit() must use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250205155120.1676781-3-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/route.h | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/include/net/route.h b/include/net/route.h
+index 30610101ea14f..036e3ee3b856b 100644
+--- a/include/net/route.h
++++ b/include/net/route.h
+@@ -357,10 +357,15 @@ static inline int inet_iif(const struct sk_buff *skb)
+ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
+ {
+ int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+- struct net *net = dev_net(dst->dev);
+
+- if (hoplimit == 0)
++ if (hoplimit == 0) {
++ const struct net *net;
++
++ rcu_read_lock();
++ net = dev_net_rcu(dst->dev);
+ hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
++ rcu_read_unlock();
++ }
+ return hoplimit;
+ }
+
+--
+2.39.5
+
--- /dev/null
+From 6da58ac499930e5b2c82f1ceab71fc1345efad1b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:15 +0000
+Subject: ipv4: use RCU protection in __ip_rt_update_pmtu()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 139512191bd06f1b496117c76372b2ce372c9a41 ]
+
+__ip_rt_update_pmtu() must use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device")
+Fixes: 1de6b15a434c ("Namespaceify min_pmtu sysctl")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250205155120.1676781-8-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/route.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 2ae9d2855efab..a4884d434038e 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1023,9 +1023,9 @@ out: kfree_skb(skb);
+ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ {
+ struct dst_entry *dst = &rt->dst;
+- struct net *net = dev_net(dst->dev);
+ struct fib_result res;
+ bool lock = false;
++ struct net *net;
+ u32 old_mtu;
+
+ if (ip_mtu_locked(dst))
+@@ -1035,6 +1035,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ if (old_mtu < mtu)
+ return;
+
++ rcu_read_lock();
++ net = dev_net_rcu(dst->dev);
+ if (mtu < net->ipv4.ip_rt_min_pmtu) {
+ lock = true;
+ mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
+@@ -1042,9 +1044,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+
+ if (rt->rt_pmtu == mtu && !lock &&
+ time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
+- return;
++ goto out;
+
+- rcu_read_lock();
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
+ struct fib_nh_common *nhc;
+
+@@ -1058,14 +1059,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
+ }
+- rcu_read_unlock();
+- return;
++ goto out;
+ }
+ #endif /* CONFIG_IP_ROUTE_MULTIPATH */
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
+ }
++out:
+ rcu_read_unlock();
+ }
+
+--
+2.39.5
+
--- /dev/null
+From cbf9ee413880e59c4dfef62141d63e9139a261b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:14 +0000
+Subject: ipv4: use RCU protection in inet_select_addr()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 719817cd293e4fa389e1f69c396f3f816ed5aa41 ]
+
+inet_select_addr() must use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: c4544c724322 ("[NETNS]: Process inet_select_addr inside a namespace.")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20250205155120.1676781-7-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/devinet.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
+index dcbc087fff179..33e87b442b475 100644
+--- a/net/ipv4/devinet.c
++++ b/net/ipv4/devinet.c
+@@ -1316,10 +1316,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+ __be32 addr = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
+ struct in_device *in_dev;
+- struct net *net = dev_net(dev);
++ struct net *net;
+ int master_idx;
+
+ rcu_read_lock();
++ net = dev_net_rcu(dev);
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto no_in_dev;
+--
+2.39.5
+
--- /dev/null
+From 5374e2891497377d7647fdd20ca025cec0db5ea9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:13 +0000
+Subject: ipv4: use RCU protection in rt_is_expired()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit dd205fcc33d92d54eee4d7f21bb073af9bd5ce2b ]
+
+rt_is_expired() must use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: e84f84f27647 ("netns: place rt_genid into struct net")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250205155120.1676781-6-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/route.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 3522801885787..3ad78bbd6261b 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -400,7 +400,13 @@ static inline int ip_rt_proc_init(void)
+
+ static inline bool rt_is_expired(const struct rtable *rth)
+ {
+- return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
++ bool res;
++
++ rcu_read_lock();
++ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
++ rcu_read_unlock();
++
++ return res;
+ }
+
+ void rt_cache_flush(struct net *net)
+--
+2.39.5
+
--- /dev/null
+From 229bb6df37287bb7607c4c8d8a77468874f3a962 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 12 Feb 2025 14:10:21 +0000
+Subject: ipv6: mcast: add RCU protection to mld_newpack()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a527750d877fd334de87eef81f1cb5f0f0ca3373 ]
+
+mld_newpack() can be called without RTNL or RCU being held.
+
+Note that we can no longer use sock_alloc_send_skb() because
+ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep.
+
+Instead use alloc_skb() and charge the net->ipv6.igmp_sk
+socket under RCU protection.
+
+Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://patch.msgid.link/20250212141021.1663666-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/mcast.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
+index 6e5d1ade48a89..1d038a0840994 100644
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -1731,21 +1731,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
+ struct net_device *dev = idev->dev;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+- struct net *net = dev_net(dev);
+ const struct in6_addr *saddr;
+ struct in6_addr addr_buf;
+ struct mld2_report *pmr;
+ struct sk_buff *skb;
+ unsigned int size;
+ struct sock *sk;
+- int err;
++ struct net *net;
+
+- sk = net->ipv6.igmp_sk;
+ /* we assume size > sizeof(ra) here
+ * Also try to not allocate high-order pages for big MTU
+ */
+ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen;
+- skb = sock_alloc_send_skb(sk, size, 1, &err);
++ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+@@ -1753,6 +1751,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
+ skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
+
++ rcu_read_lock();
++
++ net = dev_net_rcu(dev);
++ sk = net->ipv6.igmp_sk;
++ skb_set_owner_w(skb, sk);
++
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
+ /* <draft-ietf-magma-mld-source-05.txt>:
+ * use unspecified address as the source address
+@@ -1764,6 +1768,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
+
+ ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);
+
++ rcu_read_unlock();
++
+ skb_put_data(skb, ra, sizeof(ra));
+
+ skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
+--
+2.39.5
+
--- /dev/null
+From 15bb8fffed081e8f0bae7e5bfc10813b28c9fafc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:18 +0000
+Subject: ipv6: use RCU protection in ip6_default_advmss()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 3c8ffcd248da34fc41e52a46e51505900115fc2a ]
+
+ip6_default_advmss() needs rcu protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: 5578689a4e3c ("[NETNS][IPV6] route6 - make route6 per namespace")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250205155120.1676781-11-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/route.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv6/route.c b/net/ipv6/route.c
+index b7f494cca3e5c..94526436b91e8 100644
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -3184,13 +3184,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
+ {
+ struct net_device *dev = dst->dev;
+ unsigned int mtu = dst_mtu(dst);
+- struct net *net = dev_net(dev);
++ struct net *net;
+
+ mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+
++ rcu_read_lock();
++
++ net = dev_net_rcu(dev);
+ if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
+ mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+
++ rcu_read_unlock();
++
+ /*
+ * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
+ * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
+--
+2.39.5
+
--- /dev/null
+From aad943d05c2b69b1cef568185aa1787eeec0d8d2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 4 Jan 2022 10:59:34 +0000
+Subject: Namespaceify min_pmtu sysctl
+
+From: xu xin <xu.xin16@zte.com.cn>
+
+[ Upstream commit 1de6b15a434c0068253fea5d719f71143e7e3a79 ]
+
+This patch enables the sysctl min_pmtu to be configured per net
+namespace.
+
+Signed-off-by: xu xin <xu.xin16@zte.com.cn>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 139512191bd0 ("ipv4: use RCU protection in __ip_rt_update_pmtu()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netns/ipv4.h | 2 ++
+ net/ipv4/route.c | 53 ++++++++++++++++++++++++++++------------
+ 2 files changed, 39 insertions(+), 16 deletions(-)
+
+diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
+index d60a10cfc3823..16515c04a46a7 100644
+--- a/include/net/netns/ipv4.h
++++ b/include/net/netns/ipv4.h
+@@ -84,6 +84,8 @@ struct netns_ipv4 {
+ int sysctl_icmp_ratelimit;
+ int sysctl_icmp_ratemask;
+
++ u32 ip_rt_min_pmtu;
++
+ struct local_ports ip_local_ports;
+
+ u8 sysctl_tcp_ecn;
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 3ad78bbd6261b..9a837cd2b925a 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -118,6 +118,8 @@
+
+ #define RT_GC_TIMEOUT (300*HZ)
+
++#define DEFAULT_MIN_PMTU (512 + 20 + 20)
++
+ static int ip_rt_max_size;
+ static int ip_rt_redirect_number __read_mostly = 9;
+ static int ip_rt_redirect_load __read_mostly = HZ / 50;
+@@ -125,7 +127,6 @@ static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+ static int ip_rt_error_cost __read_mostly = HZ;
+ static int ip_rt_error_burst __read_mostly = 5 * HZ;
+ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
+-static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
+ static int ip_rt_min_advmss __read_mostly = 256;
+
+ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+@@ -1034,9 +1035,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ if (old_mtu < mtu)
+ return;
+
+- if (mtu < ip_rt_min_pmtu) {
++ if (mtu < net->ipv4.ip_rt_min_pmtu) {
+ lock = true;
+- mtu = min(old_mtu, ip_rt_min_pmtu);
++ mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
+ }
+
+ if (rt->rt_pmtu == mtu && !lock &&
+@@ -3578,14 +3579,6 @@ static struct ctl_table ipv4_route_table[] = {
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+- {
+- .procname = "min_pmtu",
+- .data = &ip_rt_min_pmtu,
+- .maxlen = sizeof(int),
+- .mode = 0644,
+- .proc_handler = proc_dointvec_minmax,
+- .extra1 = &ip_min_valid_pmtu,
+- },
+ {
+ .procname = "min_adv_mss",
+ .data = &ip_rt_min_advmss,
+@@ -3598,13 +3591,21 @@ static struct ctl_table ipv4_route_table[] = {
+
+ static const char ipv4_route_flush_procname[] = "flush";
+
+-static struct ctl_table ipv4_route_flush_table[] = {
++static struct ctl_table ipv4_route_netns_table[] = {
+ {
+ .procname = ipv4_route_flush_procname,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = ipv4_sysctl_rtcache_flush,
+ },
++ {
++ .procname = "min_pmtu",
++ .data = &init_net.ipv4.ip_rt_min_pmtu,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = &ip_min_valid_pmtu,
++ },
+ { },
+ };
+
+@@ -3612,9 +3613,11 @@ static __net_init int sysctl_route_net_init(struct net *net)
+ {
+ struct ctl_table *tbl;
+
+- tbl = ipv4_route_flush_table;
++ tbl = ipv4_route_netns_table;
+ if (!net_eq(net, &init_net)) {
+- tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
++ int i;
++
++ tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
+ if (!tbl)
+ goto err_dup;
+
+@@ -3623,6 +3626,12 @@ static __net_init int sysctl_route_net_init(struct net *net)
+ if (tbl[0].procname != ipv4_route_flush_procname)
+ tbl[0].procname = NULL;
+ }
++
++ /* Update the variables to point into the current struct net
++ * except for the first element flush
++ */
++ for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
++ tbl[i].data += (void *)net - (void *)&init_net;
+ }
+ tbl[0].extra1 = net;
+
+@@ -3632,7 +3641,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
+ return 0;
+
+ err_reg:
+- if (tbl != ipv4_route_flush_table)
++ if (tbl != ipv4_route_netns_table)
+ kfree(tbl);
+ err_dup:
+ return -ENOMEM;
+@@ -3644,7 +3653,7 @@ static __net_exit void sysctl_route_net_exit(struct net *net)
+
+ tbl = net->ipv4.route_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.route_hdr);
+- BUG_ON(tbl == ipv4_route_flush_table);
++ BUG_ON(tbl == ipv4_route_netns_table);
+ kfree(tbl);
+ }
+
+@@ -3654,6 +3663,17 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
+ };
+ #endif
+
++static __net_init int netns_ip_rt_init(struct net *net)
++{
++ /* Set default value for namespaceified sysctls */
++ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
++ return 0;
++}
++
++static struct pernet_operations __net_initdata ip_rt_ops = {
++ .init = netns_ip_rt_init,
++};
++
+ static __net_init int rt_genid_init(struct net *net)
+ {
+ atomic_set(&net->ipv4.rt_genid, 0);
+@@ -3759,6 +3779,7 @@ int __init ip_rt_init(void)
+ #ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&sysctl_route_ops);
+ #endif
++ register_pernet_subsys(&ip_rt_ops);
+ register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
+ return 0;
+--
+2.39.5
+
--- /dev/null
+From 263bf32bee3c8ef0e309b7b0ef3285ddeec1b7dd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 4 Jan 2022 10:59:47 +0000
+Subject: Namespaceify mtu_expires sysctl
+
+From: xu xin <xu.xin16@zte.com.cn>
+
+[ Upstream commit 1135fad204805518462c1f0caaca6bcd52ba78cf ]
+
+This patch enables the sysctl mtu_expires to be configured per net
+namespace.
+
+Signed-off-by: xu xin <xu.xin16@zte.com.cn>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 139512191bd0 ("ipv4: use RCU protection in __ip_rt_update_pmtu()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netns/ipv4.h | 1 +
+ net/ipv4/route.c | 21 +++++++++++----------
+ 2 files changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
+index 16515c04a46a7..8bc0d865338e4 100644
+--- a/include/net/netns/ipv4.h
++++ b/include/net/netns/ipv4.h
+@@ -85,6 +85,7 @@ struct netns_ipv4 {
+ int sysctl_icmp_ratemask;
+
+ u32 ip_rt_min_pmtu;
++ int ip_rt_mtu_expires;
+
+ struct local_ports ip_local_ports;
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 9a837cd2b925a..75c379315ef37 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -119,6 +119,7 @@
+ #define RT_GC_TIMEOUT (300*HZ)
+
+ #define DEFAULT_MIN_PMTU (512 + 20 + 20)
++#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
+
+ static int ip_rt_max_size;
+ static int ip_rt_redirect_number __read_mostly = 9;
+@@ -126,7 +127,6 @@ static int ip_rt_redirect_load __read_mostly = HZ / 50;
+ static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+ static int ip_rt_error_cost __read_mostly = HZ;
+ static int ip_rt_error_burst __read_mostly = 5 * HZ;
+-static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
+ static int ip_rt_min_advmss __read_mostly = 256;
+
+ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+@@ -1041,7 +1041,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ }
+
+ if (rt->rt_pmtu == mtu && !lock &&
+- time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
++ time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
+ return;
+
+ rcu_read_lock();
+@@ -1051,7 +1051,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ fib_select_path(net, &res, fl4, NULL);
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+- jiffies + ip_rt_mtu_expires);
++ jiffies + net->ipv4.ip_rt_mtu_expires);
+ }
+ rcu_read_unlock();
+ }
+@@ -3572,13 +3572,6 @@ static struct ctl_table ipv4_route_table[] = {
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+- {
+- .procname = "mtu_expires",
+- .data = &ip_rt_mtu_expires,
+- .maxlen = sizeof(int),
+- .mode = 0644,
+- .proc_handler = proc_dointvec_jiffies,
+- },
+ {
+ .procname = "min_adv_mss",
+ .data = &ip_rt_min_advmss,
+@@ -3606,6 +3599,13 @@ static struct ctl_table ipv4_route_netns_table[] = {
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_min_valid_pmtu,
+ },
++ {
++ .procname = "mtu_expires",
++ .data = &init_net.ipv4.ip_rt_mtu_expires,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_jiffies,
++ },
+ { },
+ };
+
+@@ -3667,6 +3667,7 @@ static __net_init int netns_ip_rt_init(struct net *net)
+ {
+ /* Set default value for namespaceified sysctls */
+ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
++ net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ return 0;
+ }
+
+--
+2.39.5
+
--- /dev/null
+From 8a44c27dc8c014a4f3a2fc0db7e7ad5a5a1128e4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Feb 2025 13:58:39 +0000
+Subject: ndisc: extend RCU protection in ndisc_send_skb()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit ed6ae1f325d3c43966ec1b62ac1459e2b8e45640 ]
+
+ndisc_send_skb() can be called without RTNL or RCU held.
+
+Acquire rcu_read_lock() earlier, so that we can use dev_net_rcu()
+and avoid a potential UAF.
+
+Fixes: 1762f7e88eb3 ("[NETNS][IPV6] ndisc - make socket control per namespace")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250207135841.1948589-8-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ndisc.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
+index 3972189c09b14..af584e879467e 100644
+--- a/net/ipv6/ndisc.c
++++ b/net/ipv6/ndisc.c
+@@ -471,16 +471,20 @@ static void ndisc_send_skb(struct sk_buff *skb,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+ {
++ struct icmp6hdr *icmp6h = icmp6_hdr(skb);
+ struct dst_entry *dst = skb_dst(skb);
+- struct net *net = dev_net(skb->dev);
+- struct sock *sk = net->ipv6.ndisc_sk;
+ struct inet6_dev *idev;
++ struct net *net;
++ struct sock *sk;
+ int err;
+- struct icmp6hdr *icmp6h = icmp6_hdr(skb);
+ u8 type;
+
+ type = icmp6h->icmp6_type;
+
++ rcu_read_lock();
++
++ net = dev_net_rcu(skb->dev);
++ sk = net->ipv6.ndisc_sk;
+ if (!dst) {
+ struct flowi6 fl6;
+ int oif = skb->dev->ifindex;
+@@ -488,6 +492,7 @@ static void ndisc_send_skb(struct sk_buff *skb,
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst)) {
++ rcu_read_unlock();
+ kfree_skb(skb);
+ return;
+ }
+@@ -502,7 +507,6 @@ static void ndisc_send_skb(struct sk_buff *skb,
+
+ ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+
+- rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+--
+2.39.5
+
--- /dev/null
+From ac4f3804a967edf91db2bca556b68c2e3c9e98c4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Feb 2025 13:58:34 +0000
+Subject: ndisc: use RCU protection in ndisc_alloc_skb()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 628e6d18930bbd21f2d4562228afe27694f66da9 ]
+
+ndisc_alloc_skb() can be called without RTNL or RCU being held.
+
+Add RCU protection to avoid possible UAF.
+
+Fixes: de09334b9326 ("ndisc: Introduce ndisc_alloc_skb() helper.")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250207135841.1948589-3-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ndisc.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
+index 63c1420c58249..3972189c09b14 100644
+--- a/net/ipv6/ndisc.c
++++ b/net/ipv6/ndisc.c
+@@ -417,15 +417,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
+ {
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+- struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
+- if (!skb) {
+- ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
+- __func__);
++ if (!skb)
+ return NULL;
+- }
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+@@ -436,7 +432,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
+ /* Manually assign socket ownership as we avoid calling
+ * sock_alloc_send_pskb() to bypass wmem buffer limits
+ */
+- skb_set_owner_w(skb, sk);
++ rcu_read_lock();
++ skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
++ rcu_read_unlock();
+
+ return skb;
+ }
+--
+2.39.5
+
--- /dev/null
+From aa8d7d2415da6a2fc8f8f325a237852dbe452ab5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Aug 2024 12:32:45 +0800
+Subject: neighbour: delete redundant judgment statements
+
+From: Li Zetao <lizetao1@huawei.com>
+
+[ Upstream commit c25bdd2ac8cf7da70a226f1a66cdce7af15ff86f ]
+
+The initial value of err is -ENOBUFS, and err is guaranteed to be
+less than 0 before all goto errout. Therefore, on the error path
+of errout, there is no need to repeatedly judge that err is less than 0,
+and delete redundant judgments to make the code more concise.
+
+Signed-off-by: Li Zetao <lizetao1@huawei.com>
+Reviewed-by: Petr Machata <petrm@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: becbd5850c03 ("neighbour: use RCU protection in __neigh_notify()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/neighbour.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/net/core/neighbour.c b/net/core/neighbour.c
+index 6f3bd1a4ec8ca..7fffbe0424342 100644
+--- a/net/core/neighbour.c
++++ b/net/core/neighbour.c
+@@ -3387,8 +3387,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+ errout:
+- if (err < 0)
+- rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
++ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+ }
+
+ void neigh_app_ns(struct neighbour *n)
+--
+2.39.5
+
--- /dev/null
+From e227247e68d9cf6ebcb96be1999f2051b1127a31 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Feb 2025 13:58:35 +0000
+Subject: neighbour: use RCU protection in __neigh_notify()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit becbd5850c03ed33b232083dd66c6e38c0c0e569 ]
+
+__neigh_notify() can be called without RTNL or RCU protection.
+
+Use RCU protection to avoid potential UAF.
+
+Fixes: 426b5303eb43 ("[NETNS]: Modify the neighbour table code so it handles multiple network namespaces")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250207135841.1948589-4-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/neighbour.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/net/core/neighbour.c b/net/core/neighbour.c
+index 7fffbe0424342..9549738b81842 100644
+--- a/net/core/neighbour.c
++++ b/net/core/neighbour.c
+@@ -3369,10 +3369,12 @@ static const struct seq_operations neigh_stat_seq_ops = {
+ static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid)
+ {
+- struct net *net = dev_net(n->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
++ struct net *net;
+
++ rcu_read_lock();
++ net = dev_net_rcu(n->dev);
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+@@ -3385,9 +3387,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+- return;
++ goto out;
+ errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
++out:
++ rcu_read_unlock();
+ }
+
+ void neigh_app_ns(struct neighbour *n)
+--
+2.39.5
+
--- /dev/null
+From 33e40695198caa179ab1454878b6d2be38fbaa96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Feb 2025 15:51:09 +0000
+Subject: net: add dev_net_rcu() helper
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 ]
+
+dev->nd_net can change, readers should either
+use rcu_read_lock() or RTNL.
+
+We currently use a generic helper, dev_net() with
+no debugging support. We probably have many hidden bugs.
+
+Add dev_net_rcu() helper for callers using rcu_read_lock()
+protection.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: dd205fcc33d9 ("ipv4: use RCU protection in rt_is_expired()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/netdevice.h | 6 ++++++
+ include/net/net_namespace.h | 2 +-
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index 829ebde5d50d5..79b528c128c14 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -2454,6 +2454,12 @@ struct net *dev_net(const struct net_device *dev)
+ return read_pnet(&dev->nd_net);
+ }
+
++static inline
++struct net *dev_net_rcu(const struct net_device *dev)
++{
++ return read_pnet_rcu(&dev->nd_net);
++}
++
+ static inline
+ void dev_net_set(struct net_device *dev, struct net *net)
+ {
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index 0b6bea456fce6..ff9ecc76d622b 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -336,7 +336,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
+ #endif
+ }
+
+-static inline struct net *read_pnet_rcu(possible_net_t *pnet)
++static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
+ {
+ #ifdef CONFIG_NET_NS
+ return rcu_dereference(pnet->net);
+--
+2.39.5
+
--- /dev/null
+From d3df20325474b0793d11f7a7e98a38fdc3988a06 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Nov 2024 09:34:24 +0000
+Subject: net: ipv4: Cache pmtu for all packet paths if multipath enabled
+
+From: Vladimir Vdovin <deliran@verdict.gg>
+
+[ Upstream commit 7d3f3b4367f315a61fc615e3138f3d320da8c466 ]
+
+Check number of paths by fib_info_num_path(),
+and update_or_create_fnhe() for every path.
+Problem is that pmtu is cached only for the oif
+that has received icmp message "need to frag",
+other oifs will still try to use "default" iface mtu.
+
+An example topology showing the problem:
+
+ | host1
+ +---------+
+ | dummy0 | 10.179.20.18/32 mtu9000
+ +---------+
+ +-----------+----------------+
+ +---------+ +---------+
+ | ens17f0 | 10.179.2.141/31 | ens17f1 | 10.179.2.13/31
+ +---------+ +---------+
+ | (all here have mtu 9000) |
+ +------+ +------+
+ | ro1 | 10.179.2.140/31 | ro2 | 10.179.2.12/31
+ +------+ +------+
+ | |
+---------+------------+-------------------+------
+ |
+ +-----+
+ | ro3 | 10.10.10.10 mtu1500
+ +-----+
+ |
+ ========================================
+ some networks
+ ========================================
+ |
+ +-----+
+ | eth0| 10.10.30.30 mtu9000
+ +-----+
+ | host2
+
+host1 has multipath enabled and
+sysctl net.ipv4.fib_multipath_hash_policy = 1:
+
+default proto static src 10.179.20.18
+ nexthop via 10.179.2.12 dev ens17f1 weight 1
+ nexthop via 10.179.2.140 dev ens17f0 weight 1
+
+When host1 tries to do pmtud from 10.179.20.18/32 to host2,
+host1 receives at ens17f1 iface an icmp packet from ro3 that ro3 mtu=1500.
+And host1 caches it in nexthop exceptions cache.
+
+Problem is that it is cached only for the iface that has received icmp,
+and there is no way that ro3 will send icmp msg to host1 via another path.
+
+Host1 now has these routes to host2:
+
+ip r g 10.10.30.30 sport 30000 dport 443
+10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
+ cache expires 521sec mtu 1500
+
+ip r g 10.10.30.30 sport 30033 dport 443
+10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
+ cache
+
+So when host1 tries again to reach host2 with mtu>1500,
+if packet flow is lucky enough to be hashed with oif=ens17f1 it's ok,
+if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1,
+until lucky day when ro3 will send it through another flow to ens17f0.
+
+Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
+Reviewed-by: Ido Schimmel <idosch@nvidia.com>
+Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 139512191bd0 ("ipv4: use RCU protection in __ip_rt_update_pmtu()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/route.c | 13 ++++
+ tools/testing/selftests/net/pmtu.sh | 112 +++++++++++++++++++++++-----
+ 2 files changed, 108 insertions(+), 17 deletions(-)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 75c379315ef37..2ae9d2855efab 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1049,6 +1049,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+ struct fib_nh_common *nhc;
+
+ fib_select_path(net, &res, fl4, NULL);
++#ifdef CONFIG_IP_ROUTE_MULTIPATH
++ if (fib_info_num_path(res.fi) > 1) {
++ int nhsel;
++
++ for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
++ nhc = fib_info_nhc(res.fi, nhsel);
++ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
++ jiffies + net->ipv4.ip_rt_mtu_expires);
++ }
++ rcu_read_unlock();
++ return;
++ }
++#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
+diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
+index dbfa56173d291..33f4fb34ac9b2 100755
+--- a/tools/testing/selftests/net/pmtu.sh
++++ b/tools/testing/selftests/net/pmtu.sh
+@@ -197,6 +197,12 @@
+ #
+ # - pmtu_ipv6_route_change
+ # Same as above but with IPv6
++#
++# - pmtu_ipv4_mp_exceptions
++# Use the same topology as in pmtu_ipv4, but add routeable addresses
++# on host A and B on lo reachable via both routers. Host A and B
++# addresses have multipath routes to each other, b_r1 mtu = 1500.
++# Check that PMTU exceptions are created for both paths.
+
+ # Kselftest framework requirement - SKIP code is 4.
+ ksft_skip=4
+@@ -266,7 +272,8 @@ tests="
+ list_flush_ipv4_exception ipv4: list and flush cached exceptions 1
+ list_flush_ipv6_exception ipv6: list and flush cached exceptions 1
+ pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1
+- pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1"
++ pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1
++ pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1"
+
+ NS_A="ns-A"
+ NS_B="ns-B"
+@@ -353,6 +360,9 @@ tunnel6_a_addr="fd00:2::a"
+ tunnel6_b_addr="fd00:2::b"
+ tunnel6_mask="64"
+
++host4_a_addr="192.168.99.99"
++host4_b_addr="192.168.88.88"
++
+ dummy6_0_prefix="fc00:1000::"
+ dummy6_1_prefix="fc00:1001::"
+ dummy6_mask="64"
+@@ -907,6 +917,52 @@ setup_ovs_bridge() {
+ run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
+ }
+
++setup_multipath_new() {
++ # Set up host A with multipath routes to host B host4_b_addr
++ run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
++ run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
++ run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
++ run_cmd ${ns_a} ip nexthop add id 403 group 401/402
++ run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
++
++ # Set up host B with multipath routes to host A host4_a_addr
++ run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
++ run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
++ run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
++ run_cmd ${ns_b} ip nexthop add id 403 group 401/402
++ run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
++}
++
++setup_multipath_old() {
++ # Set up host A with multipath routes to host B host4_b_addr
++ run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
++ run_cmd ${ns_a} ip route add ${host4_b_addr} \
++ src ${host4_a_addr} \
++ nexthop via ${prefix4}.${a_r1}.2 weight 1 \
++ nexthop via ${prefix4}.${a_r2}.2 weight 1
++
++ # Set up host B with multipath routes to host A host4_a_addr
++ run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
++ run_cmd ${ns_b} ip route add ${host4_a_addr} \
++ src ${host4_b_addr} \
++ nexthop via ${prefix4}.${b_r1}.2 weight 1 \
++ nexthop via ${prefix4}.${b_r2}.2 weight 1
++}
++
++setup_multipath() {
++ if [ "$USE_NH" = "yes" ]; then
++ setup_multipath_new
++ else
++ setup_multipath_old
++ fi
++
++ # Set up routers with routes to dummies
++ run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
++ run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
++ run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
++ run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
++}
++
+ setup() {
+ [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
+
+@@ -988,23 +1044,15 @@ link_get_mtu() {
+ }
+
+ route_get_dst_exception() {
+- ns_cmd="${1}"
+- dst="${2}"
+- dsfield="${3}"
++ ns_cmd="${1}"; shift
+
+- if [ -z "${dsfield}" ]; then
+- dsfield=0
+- fi
+-
+- ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
++ ${ns_cmd} ip route get "$@"
+ }
+
+ route_get_dst_pmtu_from_exception() {
+- ns_cmd="${1}"
+- dst="${2}"
+- dsfield="${3}"
++ ns_cmd="${1}"; shift
+
+- mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
++ mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
+ }
+
+ check_pmtu_value() {
+@@ -1147,10 +1195,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
+ run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
+
+ # Check that exceptions have been created with the correct PMTU
+- pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
++ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
+ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
+
+- pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
++ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
+ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
+ }
+
+@@ -1197,9 +1245,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
+ UDP:"${dst2}":50000,tos="${dsfield}"
+
+ # Check that exceptions have been created with the correct PMTU
+- pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
++ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
+ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
+- pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
++ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
+ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
+ }
+
+@@ -2205,6 +2253,36 @@ test_pmtu_ipv6_route_change() {
+ test_pmtu_ipvX_route_change 6
+ }
+
++test_pmtu_ipv4_mp_exceptions() {
++ setup namespaces routing multipath || return $ksft_skip
++
++ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
++ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
++ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
++ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
++
++ # Set up initial MTU values
++ mtu "${ns_a}" veth_A-R1 2000
++ mtu "${ns_r1}" veth_R1-A 2000
++ mtu "${ns_r1}" veth_R1-B 1500
++ mtu "${ns_b}" veth_B-R1 1500
++
++ mtu "${ns_a}" veth_A-R2 2000
++ mtu "${ns_r2}" veth_R2-A 2000
++ mtu "${ns_r2}" veth_R2-B 1500
++ mtu "${ns_b}" veth_B-R2 1500
++
++ # Ping and expect two nexthop exceptions for two routes
++ run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
++
++ # Check that exceptions have been created with the correct PMTU
++ pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
++ pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
++
++ check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
++ check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
++}
++
+ usage() {
+ echo
+ echo "$0 [OPTIONS] [TEST]..."
+--
+2.39.5
+
--- /dev/null
+From 8a95e86353f62029da9cd4e760bfc558b5979cd1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Oct 2023 14:10:23 +0200
+Subject: net: treat possible_net_t net pointer as an RCU one and add
+ read_pnet_rcu()
+
+From: Jiri Pirko <jiri@nvidia.com>
+
+[ Upstream commit 2034d90ae41ae93e30d492ebcf1f06f97a9cfba6 ]
+
+Make the net pointer stored in possible_net_t structure annotated as
+an RCU pointer. Change the access helpers to treat it as such.
+Introduce read_pnet_rcu() helper to allow caller to dereference
+the net pointer under RCU read lock.
+
+Signed-off-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: dd205fcc33d9 ("ipv4: use RCU protection in rt_is_expired()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/net_namespace.h | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index d184b832166b6..0b6bea456fce6 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -316,21 +316,30 @@ static inline int check_net(const struct net *net)
+
+ typedef struct {
+ #ifdef CONFIG_NET_NS
+- struct net *net;
++ struct net __rcu *net;
+ #endif
+ } possible_net_t;
+
+ static inline void write_pnet(possible_net_t *pnet, struct net *net)
+ {
+ #ifdef CONFIG_NET_NS
+- pnet->net = net;
++ rcu_assign_pointer(pnet->net, net);
+ #endif
+ }
+
+ static inline struct net *read_pnet(const possible_net_t *pnet)
+ {
+ #ifdef CONFIG_NET_NS
+- return pnet->net;
++ return rcu_dereference_protected(pnet->net, true);
++#else
++ return &init_net;
++#endif
++}
++
++static inline struct net *read_pnet_rcu(possible_net_t *pnet)
++{
++#ifdef CONFIG_NET_NS
++ return rcu_dereference(pnet->net);
+ #else
+ return &init_net;
+ #endif
+--
+2.39.5
+
--- /dev/null
+From 0d74272a163a95a6bdb5c307a5bdfde008268843 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Feb 2025 13:58:37 +0000
+Subject: openvswitch: use RCU protection in ovs_vport_cmd_fill_info()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 90b2f49a502fa71090d9f4fe29a2f51fe5dff76d ]
+
+ovs_vport_cmd_fill_info() can be called without RTNL or RCU.
+
+Use RCU protection and dev_net_rcu() to avoid potential UAF.
+
+Fixes: 9354d4520342 ("openvswitch: reliable interface indentification in port dumps")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20250207135841.1948589-6-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/openvswitch/datapath.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
+index 0fc98e89a1149..c28b56c309169 100644
+--- a/net/openvswitch/datapath.c
++++ b/net/openvswitch/datapath.c
+@@ -2058,6 +2058,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+ {
+ struct ovs_header *ovs_header;
+ struct ovs_vport_stats vport_stats;
++ struct net *net_vport;
+ int err;
+
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
+@@ -2074,12 +2075,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+ nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex))
+ goto nla_put_failure;
+
+- if (!net_eq(net, dev_net(vport->dev))) {
+- int id = peernet2id_alloc(net, dev_net(vport->dev), gfp);
++ rcu_read_lock();
++ net_vport = dev_net_rcu(vport->dev);
++ if (!net_eq(net, net_vport)) {
++ int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC);
+
+ if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
+- goto nla_put_failure;
++ goto nla_put_failure_unlock;
+ }
++ rcu_read_unlock();
+
+ ovs_vport_get_stats(vport, &vport_stats);
+ if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS,
+@@ -2097,6 +2101,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+ genlmsg_end(skb, ovs_header);
+ return 0;
+
++nla_put_failure_unlock:
++ rcu_read_unlock();
+ nla_put_failure:
+ err = -EMSGSIZE;
+ error:
+--
+2.39.5
+
--- /dev/null
+From 890ab1a64aa1bfe3679db9f4fc6234fd41248d8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Mar 2022 13:45:11 +0100
+Subject: selftest: net: Test IPv4 PMTU exceptions with DSCP and ECN
+
+From: Guillaume Nault <gnault@redhat.com>
+
+[ Upstream commit ec730c3e1f0e3a80612a9be2beb00e2b4f93fe70 ]
+
+Add two tests to pmtu.sh, for verifying that PMTU exceptions get
+properly created for routes that don't belong to the main table.
+
+A fib-rule based on the packet's DSCP field is used to jump to the
+correct table. ECN shouldn't interfere with this process, so each test
+has two components: one that only sets DSCP and one that sets both DSCP
+and ECN.
+
+One of the tests triggers PMTU exceptions using ICMP Echo Requests, the
+other using UDP packets (to test different handlers in the kernel).
+
+A few adjustments are necessary in the rest of the script to allow
+policy routing scenarios:
+
+ * Add global variable rt_table that allows setup_routing_*() to
+ add routes to a specific routing table. By default rt_table is set
+ to "main", so existing tests don't need to be modified.
+
+ * Another global variable, policy_mark, is used to define which
+ dsfield value is used for policy routing. This variable has no
+ effect on tests that don't use policy routing.
+
+ * The UDP version of the test uses socat. So cleanup() now also need
+ to kill socat PIDs.
+
+ * route_get_dst_pmtu_from_exception() and route_get_dst_exception()
+ now take an optional third argument specifying the dsfield. If
+ not specified, 0 is used, so existing users don't need to be
+ modified.
+
+Signed-off-by: Guillaume Nault <gnault@redhat.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 139512191bd0 ("ipv4: use RCU protection in __ip_rt_update_pmtu()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/pmtu.sh | 141 +++++++++++++++++++++++++++-
+ 1 file changed, 137 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
+index 84c05e533056d..dbfa56173d291 100755
+--- a/tools/testing/selftests/net/pmtu.sh
++++ b/tools/testing/selftests/net/pmtu.sh
+@@ -26,6 +26,15 @@
+ # - pmtu_ipv6
+ # Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
+ #
++# - pmtu_ipv4_dscp_icmp_exception
++# Set up the same network topology as pmtu_ipv4, but use non-default
++# routing table in A. A fib-rule is used to jump to this routing table
++# based on DSCP. Send ICMPv4 packets with the expected DSCP value and
++# verify that ECN doesn't interfere with the creation of PMTU exceptions.
++#
++# - pmtu_ipv4_dscp_udp_exception
++# Same as pmtu_ipv4_dscp_icmp_exception, but use UDP instead of ICMP.
++#
+ # - pmtu_ipv4_vxlan4_exception
+ # Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
+ # over IPv4 between A and B, routed via R1. On the link between R1 and B,
+@@ -203,6 +212,8 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
+ tests="
+ pmtu_ipv4_exception ipv4: PMTU exceptions 1
+ pmtu_ipv6_exception ipv6: PMTU exceptions 1
++ pmtu_ipv4_dscp_icmp_exception ICMPv4 with DSCP and ECN: PMTU exceptions 1
++ pmtu_ipv4_dscp_udp_exception UDPv4 with DSCP and ECN: PMTU exceptions 1
+ pmtu_ipv4_vxlan4_exception IPv4 over vxlan4: PMTU exceptions 1
+ pmtu_ipv6_vxlan4_exception IPv6 over vxlan4: PMTU exceptions 1
+ pmtu_ipv4_vxlan6_exception IPv4 over vxlan6: PMTU exceptions 1
+@@ -323,6 +334,9 @@ routes_nh="
+ B 6 default 61
+ "
+
++policy_mark=0x04
++rt_table=main
++
+ veth4_a_addr="192.168.1.1"
+ veth4_b_addr="192.168.1.2"
+ veth4_c_addr="192.168.2.10"
+@@ -346,6 +360,7 @@ dummy6_mask="64"
+ err_buf=
+ tcpdump_pids=
+ nettest_pids=
++socat_pids=
+
+ err() {
+ err_buf="${err_buf}${1}
+@@ -725,7 +740,7 @@ setup_routing_old() {
+
+ ns_name="$(nsname ${ns})"
+
+- ip -n ${ns_name} route add ${addr} via ${gw}
++ ip -n "${ns_name}" route add "${addr}" table "${rt_table}" via "${gw}"
+
+ ns=""; addr=""; gw=""
+ done
+@@ -755,7 +770,7 @@ setup_routing_new() {
+
+ ns_name="$(nsname ${ns})"
+
+- ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid}
++ ip -n "${ns_name}" -"${fam}" route add "${addr}" table "${rt_table}" nhid "${nhid}"
+
+ ns=""; fam=""; addr=""; nhid=""
+ done
+@@ -800,6 +815,24 @@ setup_routing() {
+ return 0
+ }
+
++setup_policy_routing() {
++ setup_routing
++
++ ip -netns "${NS_A}" -4 rule add dsfield "${policy_mark}" \
++ table "${rt_table}"
++
++ # Set the IPv4 Don't Fragment bit with tc, since socat doesn't seem to
++ # have an option do to it.
++ tc -netns "${NS_A}" qdisc replace dev veth_A-R1 root prio
++ tc -netns "${NS_A}" qdisc replace dev veth_A-R2 root prio
++ tc -netns "${NS_A}" filter add dev veth_A-R1 \
++ protocol ipv4 flower ip_proto udp \
++ action pedit ex munge ip df set 0x40 pipe csum ip and udp
++ tc -netns "${NS_A}" filter add dev veth_A-R2 \
++ protocol ipv4 flower ip_proto udp \
++ action pedit ex munge ip df set 0x40 pipe csum ip and udp
++}
++
+ setup_bridge() {
+ run_cmd ${ns_a} ip link add br0 type bridge || return $ksft_skip
+ run_cmd ${ns_a} ip link set br0 up
+@@ -905,6 +938,11 @@ cleanup() {
+ done
+ nettest_pids=
+
++ for pid in ${socat_pids}; do
++ kill "${pid}"
++ done
++ socat_pids=
++
+ for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
+ ip netns del ${n} 2> /dev/null
+ done
+@@ -952,15 +990,21 @@ link_get_mtu() {
+ route_get_dst_exception() {
+ ns_cmd="${1}"
+ dst="${2}"
++ dsfield="${3}"
+
+- ${ns_cmd} ip route get "${dst}"
++ if [ -z "${dsfield}" ]; then
++ dsfield=0
++ fi
++
++ ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+ }
+
+ route_get_dst_pmtu_from_exception() {
+ ns_cmd="${1}"
+ dst="${2}"
++ dsfield="${3}"
+
+- mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
++ mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+ }
+
+ check_pmtu_value() {
+@@ -1070,6 +1114,95 @@ test_pmtu_ipv6_exception() {
+ test_pmtu_ipvX 6
+ }
+
++test_pmtu_ipv4_dscp_icmp_exception() {
++ rt_table=100
++
++ setup namespaces policy_routing || return $ksft_skip
++ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
++ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
++ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
++ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
++
++ # Set up initial MTU values
++ mtu "${ns_a}" veth_A-R1 2000
++ mtu "${ns_r1}" veth_R1-A 2000
++ mtu "${ns_r1}" veth_R1-B 1400
++ mtu "${ns_b}" veth_B-R1 1400
++
++ mtu "${ns_a}" veth_A-R2 2000
++ mtu "${ns_r2}" veth_R2-A 2000
++ mtu "${ns_r2}" veth_R2-B 1500
++ mtu "${ns_b}" veth_B-R2 1500
++
++ len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1
++
++ dst1="${prefix4}.${b_r1}.1"
++ dst2="${prefix4}.${b_r2}.1"
++
++ # Create route exceptions
++ dsfield=${policy_mark} # No ECN bit set (Not-ECT)
++ run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst1}"
++
++ dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
++ run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
++
++ # Check that exceptions have been created with the correct PMTU
++ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
++ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
++
++ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
++ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
++}
++
++test_pmtu_ipv4_dscp_udp_exception() {
++ rt_table=100
++
++ if ! which socat > /dev/null 2>&1; then
++ echo "'socat' command not found; skipping tests"
++ return $ksft_skip
++ fi
++
++ setup namespaces policy_routing || return $ksft_skip
++ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
++ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
++ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
++ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
++
++ # Set up initial MTU values
++ mtu "${ns_a}" veth_A-R1 2000
++ mtu "${ns_r1}" veth_R1-A 2000
++ mtu "${ns_r1}" veth_R1-B 1400
++ mtu "${ns_b}" veth_B-R1 1400
++
++ mtu "${ns_a}" veth_A-R2 2000
++ mtu "${ns_r2}" veth_R2-A 2000
++ mtu "${ns_r2}" veth_R2-B 1500
++ mtu "${ns_b}" veth_B-R2 1500
++
++ len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1
++
++ dst1="${prefix4}.${b_r1}.1"
++ dst2="${prefix4}.${b_r2}.1"
++
++ # Create route exceptions
++ run_cmd_bg "${ns_b}" socat UDP-LISTEN:50000 OPEN:/dev/null,wronly=1
++ socat_pids="${socat_pids} $!"
++
++ dsfield=${policy_mark} # No ECN bit set (Not-ECT)
++ run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
++ UDP:"${dst1}":50000,tos="${dsfield}"
++
++ dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
++ run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
++ UDP:"${dst2}":50000,tos="${dsfield}"
++
++ # Check that exceptions have been created with the correct PMTU
++ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
++ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
++ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
++ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
++}
++
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+--
+2.39.5
+
alpha-align-stack-for-page-fault-and-user-unaligned-trap-handlers.patch
gpio-stmpe-check-return-value-of-stmpe_reg_read-in-stmpe_gpio_irq_sync_unlock.patch
partitions-mac-fix-handling-of-bogus-partition-table.patch
+clocksource-replace-cpumask_weight-with-cpumask_empt.patch
+clocksource-use-pr_info-for-checking-clocksource-syn.patch
+clocksource-use-migrate_disable-to-avoid-calling-get.patch
+ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch
+net-treat-possible_net_t-net-pointer-as-an-rcu-one-a.patch
+net-add-dev_net_rcu-helper.patch
+ipv4-use-rcu-protection-in-rt_is_expired.patch
+ipv4-use-rcu-protection-in-inet_select_addr.patch
+namespaceify-min_pmtu-sysctl.patch
+namespaceify-mtu_expires-sysctl.patch
+selftest-net-test-ipv4-pmtu-exceptions-with-dscp-and.patch
+net-ipv4-cache-pmtu-for-all-packet-paths-if-multipat.patch
+ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch
+ipv6-use-rcu-protection-in-ip6_default_advmss.patch
+ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch
+neighbour-delete-redundant-judgment-statements.patch
+neighbour-use-rcu-protection-in-__neigh_notify.patch
+arp-use-rcu-protection-in-arp_xmit.patch
+openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch
+ndisc-extend-rcu-protection-in-ndisc_send_skb.patch
+ipv6-mcast-add-rcu-protection-to-mld_newpack.patch