--- /dev/null
+From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:47 2024
+From: Suraj Jitindar Singh <surajjs@amazon.com>
+Date: Fri, 12 Jan 2024 16:42:53 -0800
+Subject: ipv6: make ip6_rt_gc_expire an atomic_t
+To: <stable@vger.kernel.org>
+Cc: <gregkh@linuxfoundation.org>, <trawets@amazon.com>, <security@kernel.org>, Eric Dumazet <edumazet@google.com>, syzbot <syzkaller@googlegroups.com>, David Ahern <dsahern@kernel.org>, Jakub Kicinski <kuba@kernel.org>, "Suraj Jitindar Singh" <surajjs@amazon.com>
+Message-ID: <20240113004254.2416044-2-surajjs@amazon.com>
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db upstream.
+
+Reads and writes to ip6_rt_gc_expire have always been racy,
+as syzbot recently reported [1]
+
+There is a possible risk of underflow, leading
+to an unexpectedly high value being passed to fib6_run_gc(),
+although I have not observed this in the field.
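+
+One illustrative interleaving (not taken from the report, and assuming
+the compiler re-reads the variable between evaluating the shifted term
+and performing the subtraction, which a plain non-atomic access
+permits):
+
+  CPU0: tmp = ip6_rt_gc_expire >> rt_elasticity;
+  CPU1: ip6_rt_gc_expire = rt_gc_timeout >> 1;	/* reset after GC */
+  CPU0: ip6_rt_gc_expire = ip6_rt_gc_expire - tmp;
+
+If the value stored by CPU1 happens to be smaller than tmp, the
+unsigned subtraction wraps, and a later call passes an enormous
+timeout to fib6_run_gc().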
+
+Hosts hitting ip6_dst_gc() very hard are in a pretty bad
+state anyway.
+
+[1]
+BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc
+
+read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1:
+ ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
+ dst_alloc+0x9b/0x160 net/core/dst.c:86
+ ip6_dst_alloc net/ipv6/route.c:344 [inline]
+ icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
+ mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
+ mld_send_cr net/ipv6/mcast.c:2119 [inline]
+ mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
+ process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
+ worker_thread+0x618/0xa70 kernel/workqueue.c:2436
+ kthread+0x1a9/0x1e0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30
+
+read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0:
+ ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
+ dst_alloc+0x9b/0x160 net/core/dst.c:86
+ ip6_dst_alloc net/ipv6/route.c:344 [inline]
+ icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
+ mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
+ mld_send_cr net/ipv6/mcast.c:2119 [inline]
+ mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
+ process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
+ worker_thread+0x618/0xa70 kernel/workqueue.c:2436
+ kthread+0x1a9/0x1e0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30
+
+value changed: 0x00000bb3 -> 0x00000ba9
+
+Reported by Kernel Concurrency Sanitizer on:
+CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Workqueue: mld mld_ifc_work
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[ 5.4: context adjustment in include/net/netns/ipv6.h ]
+Signed-off-by: Suraj Jitindar Singh <surajjs@amazon.com>
+Cc: <stable@vger.kernel.org> # 5.4.x
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/netns/ipv6.h | 4 ++--
+ net/ipv6/route.c | 11 ++++++-----
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+--- a/include/net/netns/ipv6.h
++++ b/include/net/netns/ipv6.h
+@@ -78,8 +78,8 @@ struct netns_ipv6 {
+ struct dst_ops ip6_dst_ops;
+ rwlock_t fib6_walker_lock;
+ spinlock_t fib6_gc_lock;
+- unsigned int ip6_rt_gc_expire;
+- unsigned long ip6_rt_last_gc;
++ atomic_t ip6_rt_gc_expire;
++ unsigned long ip6_rt_last_gc;
+ #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ unsigned int fib6_rules_require_fldissect;
+ bool fib6_has_custom_rules;
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -3215,6 +3215,7 @@ static int ip6_dst_gc(struct dst_ops *op
+ int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
+ unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
++ unsigned int val;
+ int entries;
+
+ entries = dst_entries_get_fast(ops);
+@@ -3225,13 +3226,13 @@ static int ip6_dst_gc(struct dst_ops *op
+ entries <= rt_max_size)
+ goto out;
+
+- net->ipv6.ip6_rt_gc_expire++;
+- fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
++ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
+ entries = dst_entries_get_slow(ops);
+ if (entries < ops->gc_thresh)
+- net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
++ atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
+ out:
+- net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
++ val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
++ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
+ return entries > rt_max_size;
+ }
+
+@@ -6329,7 +6330,7 @@ static int __net_init ip6_route_net_init
+ net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+ net->ipv6.sysctl.skip_notify_on_dev_down = 0;
+
+- net->ipv6.ip6_rt_gc_expire = 30*HZ;
++ atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
+
+ ret = 0;
+ out:
--- /dev/null
+From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:43 2024
+From: Suraj Jitindar Singh <surajjs@amazon.com>
+Date: Fri, 12 Jan 2024 16:42:54 -0800
+Subject: ipv6: remove max_size check inline with ipv4
+To: <stable@vger.kernel.org>
+Cc: <gregkh@linuxfoundation.org>, <trawets@amazon.com>, <security@kernel.org>, Jon Maxwell <jmaxwell37@gmail.com>, Andrea Mayer <andrea.mayer@uniroma2.it>, David Ahern <dsahern@kernel.org>, Jakub Kicinski <kuba@kernel.org>, "Suraj Jitindar Singh" <surajjs@amazon.com>
+Message-ID: <20240113004254.2416044-3-surajjs@amazon.com>
+
+From: Jon Maxwell <jmaxwell37@gmail.com>
+
+commit af6d10345ca76670c1b7c37799f0d5576ccef277 upstream.
+
+In ip6_dst_gc() replace:
+
+ if (entries > gc_thresh)
+
+With:
+
+ if (entries > ops->gc_thresh)
+
+Sending IPv6 packets in a loop via a raw socket triggers an issue where a
+route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly
+exhausts the IPv6 max_size threshold, which defaults to 4096 (a rough
+reproducer sketch is shown after the error output below), resulting in
+these warnings:
+
+[1] 99.187805] dst_alloc: 7728 callbacks suppressed
+[2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size.
+.
+.
+[300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size.
+
+When this happens the packet is dropped and sendto() returns a
+"network is unreachable" error (errno 101):
+
+remaining pkt 200557 errno 101
+remaining pkt 196462 errno 101
+.
+.
+remaining pkt 126821 errno 101
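+
+A rough userspace sketch of that kind of sender (illustrative only: the
+destination prefix, the payload, and the way destinations are varied are
+assumptions, not the original test program; it needs CAP_NET_RAW):
+
+ #include <arpa/inet.h>
+ #include <errno.h>
+ #include <netinet/in.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <sys/socket.h>
+ #include <unistd.h>
+
+ int main(void)
+ {
+	struct sockaddr_in6 dst;
+	char payload[64];
+	long remaining;
+	int fd;
+
+	memset(&dst, 0, sizeof(dst));
+	memset(payload, 0, sizeof(payload));
+	dst.sin6_family = AF_INET6;
+	inet_pton(AF_INET6, "2001:db8::2", &dst.sin6_addr);
+
+	fd = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP);
+	if (fd < 0) {
+		perror("socket");	/* requires CAP_NET_RAW */
+		return 1;
+	}
+
+	for (remaining = 5000000; remaining > 0; remaining--) {
+		/* vary the destination so each packet can clone a new route */
+		dst.sin6_addr.s6_addr[15] = remaining & 0xff;
+		dst.sin6_addr.s6_addr[14] = (remaining >> 8) & 0xff;
+		if (sendto(fd, payload, sizeof(payload), 0,
+			   (struct sockaddr *)&dst, sizeof(dst)) < 0)
+			printf("remaining pkt %ld errno %d\n",
+			       remaining, errno);
+	}
+
+	close(fd);
+	return 0;
+ }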
+
+Implement David Ahern's suggestion to remove the max_size check, seeing that
+IPv6 has a GC to manage memory usage. IPv4 already does not check max_size.
+
+Here are some memory comparisons for IPv4 vs IPv6 with the patch:
+
+Test by running 5 instances of a program that sends UDP packets to a raw
+socket 5000000 times, comparing IPv4 and IPv6 behaviour with similar
+programs. The slab lines below are /proc/slabinfo entries (active objects,
+total objects, object size, objects per slab, pages per slab).
+
+IPv4:
+
+Before test:
+
+MemFree: 29427108 kB
+Slab: 237612 kB
+
+ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0
+
+During test:
+
+MemFree: 29417608 kB
+Slab: 247712 kB
+
+ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0
+
+After test:
+
+MemFree: 29422308 kB
+Slab: 238104 kB
+
+ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
+
+IPv6 with the patch:
+
+Errno 101 errors are no longer observed with the patch.
+
+Before test:
+
+MemFree: 29422308 kB
+Slab: 238104 kB
+
+ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
+
+During test:
+
+MemFree: 29431516 kB
+Slab: 240940 kB
+
+ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
+
+After test:
+
+MemFree: 29441816 kB
+Slab: 238132 kB
+
+ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0
+xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0
+ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
+
+Tested-by: Andrea Mayer <andrea.mayer@uniroma2.it>
+Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Suraj Jitindar Singh <surajjs@amazon.com>
+Cc: <stable@vger.kernel.org> # 5.4.x
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/dst_ops.h | 2 +-
+ net/core/dst.c | 8 ++------
+ net/ipv6/route.c | 13 +++++--------
+ 3 files changed, 8 insertions(+), 15 deletions(-)
+
+--- a/include/net/dst_ops.h
++++ b/include/net/dst_ops.h
+@@ -16,7 +16,7 @@ struct dst_ops {
+ unsigned short family;
+ unsigned int gc_thresh;
+
+- int (*gc)(struct dst_ops *ops);
++ void (*gc)(struct dst_ops *ops);
+ struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
+ unsigned int (*default_advmss)(const struct dst_entry *);
+ unsigned int (*mtu)(const struct dst_entry *);
+--- a/net/core/dst.c
++++ b/net/core/dst.c
+@@ -83,12 +83,8 @@ void *dst_alloc(struct dst_ops *ops, str
+
+ if (ops->gc &&
+ !(flags & DST_NOCOUNT) &&
+- dst_entries_get_fast(ops) > ops->gc_thresh) {
+- if (ops->gc(ops)) {
+- pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
+- return NULL;
+- }
+- }
++ dst_entries_get_fast(ops) > ops->gc_thresh)
++ ops->gc(ops);
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -88,7 +88,7 @@ static struct dst_entry *ip6_negative_ad
+ static void ip6_dst_destroy(struct dst_entry *);
+ static void ip6_dst_ifdown(struct dst_entry *,
+ struct net_device *dev, int how);
+-static int ip6_dst_gc(struct dst_ops *ops);
++static void ip6_dst_gc(struct dst_ops *ops);
+
+ static int ip6_pkt_discard(struct sk_buff *skb);
+ static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
+@@ -3207,11 +3207,10 @@ out:
+ return dst;
+ }
+
+-static int ip6_dst_gc(struct dst_ops *ops)
++static void ip6_dst_gc(struct dst_ops *ops)
+ {
+ struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
+ int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
+- int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
+ int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
+ unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+@@ -3219,11 +3218,10 @@ static int ip6_dst_gc(struct dst_ops *op
+ int entries;
+
+ entries = dst_entries_get_fast(ops);
+- if (entries > rt_max_size)
++ if (entries > ops->gc_thresh)
+ entries = dst_entries_get_slow(ops);
+
+- if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
+- entries <= rt_max_size)
++ if (time_after(rt_last_gc + rt_min_interval, jiffies))
+ goto out;
+
+ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
+@@ -3233,7 +3231,6 @@ static int ip6_dst_gc(struct dst_ops *op
+ out:
+ val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
+- return entries > rt_max_size;
+ }
+
+ static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+@@ -6321,7 +6318,7 @@ static int __net_init ip6_route_net_init
+ #endif
+
+ net->ipv6.sysctl.flush_delay = 0;
+- net->ipv6.sysctl.ip6_rt_max_size = 4096;
++ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
+ net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
+ net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
+ net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
--- /dev/null
+From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:43 2024
+From: Suraj Jitindar Singh <surajjs@amazon.com>
+Date: Fri, 12 Jan 2024 16:42:52 -0800
+Subject: net/dst: use a smaller percpu_counter batch for dst entries accounting
+To: <stable@vger.kernel.org>
+Cc: <gregkh@linuxfoundation.org>, <trawets@amazon.com>, <security@kernel.org>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, "Suraj Jitindar Singh" <surajjs@amazon.com>
+Message-ID: <20240113004254.2416044-1-surajjs@amazon.com>
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit cf86a086a18095e33e0637cb78cda1fcf5280852 upstream.
+
+percpu_counter_add() uses a default batch size which is quite big
+on platforms with 256 cpus (2*256 -> 512).
+
+This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2)
+(131072 on servers with 256 cpus).
+
+Reduce the batch size to something more reasonable, and
+add logic to ip6_dst_gc() to call dst_entries_get_slow()
+before calling the _very_ expensive fib6_run_gc() function.
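+
+As a worked illustration (not from the changelog): a percpu_counter
+lets each CPU accumulate roughly up to the batch size locally before
+folding the delta into the global count, so the value returned by
+dst_entries_get_fast() can drift by up to about nr_cpus * batch.
+With the default batch of 2 * nr_cpus on a 256-CPU machine that is
+256 * 512 = 131072 entries; with DST_PERCPU_COUNTER_BATCH = 32 the
+bound drops to 256 * 32 = 8192.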
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Suraj Jitindar Singh <surajjs@amazon.com>
+Cc: <stable@vger.kernel.org> # 5.4.x
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/dst_ops.h | 4 +++-
+ net/core/dst.c | 8 ++++----
+ net/ipv6/route.c | 3 +++
+ 3 files changed, 10 insertions(+), 5 deletions(-)
+
+--- a/include/net/dst_ops.h
++++ b/include/net/dst_ops.h
+@@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(s
+ return percpu_counter_sum_positive(&dst->pcpuc_entries);
+ }
+
++#define DST_PERCPU_COUNTER_BATCH 32
+ static inline void dst_entries_add(struct dst_ops *dst, int val)
+ {
+- percpu_counter_add(&dst->pcpuc_entries, val);
++ percpu_counter_add_batch(&dst->pcpuc_entries, val,
++ DST_PERCPU_COUNTER_BATCH);
+ }
+
+ static inline int dst_entries_init(struct dst_ops *dst)
+--- a/net/core/dst.c
++++ b/net/core/dst.c
+@@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, str
+ {
+ struct dst_entry *dst;
+
+- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
++ if (ops->gc &&
++ !(flags & DST_NOCOUNT) &&
++ dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops)) {
+- printk_ratelimited(KERN_NOTICE "Route cache is full: "
+- "consider increasing sysctl "
+- "net.ipv[4|6].route.max_size.\n");
++ pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
+ return NULL;
+ }
+ }
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -3218,6 +3218,9 @@ static int ip6_dst_gc(struct dst_ops *op
+ int entries;
+
+ entries = dst_entries_get_fast(ops);
++ if (entries > rt_max_size)
++ entries = dst_entries_get_slow(ops);
++
+ if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
+ entries <= rt_max_size)
+ goto out;
netfilter-nf_tables-reject-tables-of-unsupported-family.patch
pci-extract-ats-disabling-to-a-helper-function.patch
pci-disable-ats-for-specific-intel-ipu-e2000-devices.patch
+net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch
+ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch
+ipv6-remove-max_size-check-inline-with-ipv4.patch