From: Greg Kroah-Hartman Date: Sat, 13 Jan 2024 09:18:20 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v4.19.305~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=18628d06edb925a8bb37c64db5c649c56abaa42f;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch ipv6-remove-max_size-check-inline-with-ipv4.patch net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch --- diff --git a/queue-5.4/ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch b/queue-5.4/ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch new file mode 100644 index 00000000000..2a95f4af892 --- /dev/null +++ b/queue-5.4/ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch @@ -0,0 +1,123 @@ +From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:47 2024 +From: Suraj Jitindar Singh +Date: Fri, 12 Jan 2024 16:42:53 -0800 +Subject: ipv6: make ip6_rt_gc_expire an atomic_t +To: +Cc: , , , Eric Dumazet , syzbot , David Ahern , Jakub Kicinski , "Suraj Jitindar Singh" +Message-ID: <20240113004254.2416044-2-surajjs@amazon.com> + +From: Eric Dumazet + +commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db upstream. + +Reads and writes to ip6_rt_gc_expire have always been racy, +as syzbot reported recently [1]. + +There is a possible risk of underflow, leading +to an unexpectedly high value being passed to fib6_run_gc(), +although I have not observed this in the field. + +Hosts hitting ip6_dst_gc() very hard are in a pretty bad +state anyway. 
+ +[1] +BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc + +read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: + ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 + dst_alloc+0x9b/0x160 net/core/dst.c:86 + ip6_dst_alloc net/ipv6/route.c:344 [inline] + icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 + mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 + mld_send_cr net/ipv6/mcast.c:2119 [inline] + mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 + process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 + worker_thread+0x618/0xa70 kernel/workqueue.c:2436 + kthread+0x1a9/0x1e0 kernel/kthread.c:376 + ret_from_fork+0x1f/0x30 + +read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: + ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 + dst_alloc+0x9b/0x160 net/core/dst.c:86 + ip6_dst_alloc net/ipv6/route.c:344 [inline] + icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 + mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 + mld_send_cr net/ipv6/mcast.c:2119 [inline] + mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 + process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 + worker_thread+0x618/0xa70 kernel/workqueue.c:2436 + kthread+0x1a9/0x1e0 kernel/kthread.c:376 + ret_from_fork+0x1f/0x30 + +value changed: 0x00000bb3 -> 0x00000ba9 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Workqueue: mld mld_ifc_work + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com +Signed-off-by: Jakub Kicinski +[ 5.4: context adjustment in include/net/netns/ipv6.h ] +Signed-off-by: Suraj Jitindar Singh +Cc: # 5.4.x +Signed-off-by: Greg Kroah-Hartman +--- + include/net/netns/ipv6.h | 4 ++-- + net/ipv6/route.c | 11 ++++++----- + 2 files changed, 8 
insertions(+), 7 deletions(-) + +--- a/include/net/netns/ipv6.h ++++ b/include/net/netns/ipv6.h +@@ -78,8 +78,8 @@ struct netns_ipv6 { + struct dst_ops ip6_dst_ops; + rwlock_t fib6_walker_lock; + spinlock_t fib6_gc_lock; +- unsigned int ip6_rt_gc_expire; +- unsigned long ip6_rt_last_gc; ++ atomic_t ip6_rt_gc_expire; ++ unsigned long ip6_rt_last_gc; + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + unsigned int fib6_rules_require_fldissect; + bool fib6_has_custom_rules; +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -3215,6 +3215,7 @@ static int ip6_dst_gc(struct dst_ops *op + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; ++ unsigned int val; + int entries; + + entries = dst_entries_get_fast(ops); +@@ -3225,13 +3226,13 @@ static int ip6_dst_gc(struct dst_ops *op + entries <= rt_max_size) + goto out; + +- net->ipv6.ip6_rt_gc_expire++; +- fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); ++ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) +- net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; ++ atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); + out: +- net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; ++ val = atomic_read(&net->ipv6.ip6_rt_gc_expire); ++ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); + return entries > rt_max_size; + } + +@@ -6329,7 +6330,7 @@ static int __net_init ip6_route_net_init + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + net->ipv6.sysctl.skip_notify_on_dev_down = 0; + +- net->ipv6.ip6_rt_gc_expire = 30*HZ; ++ atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); + + ret = 0; + out: diff --git a/queue-5.4/ipv6-remove-max_size-check-inline-with-ipv4.patch b/queue-5.4/ipv6-remove-max_size-check-inline-with-ipv4.patch new file mode 100644 index 00000000000..433320a740c 
--- /dev/null +++ b/queue-5.4/ipv6-remove-max_size-check-inline-with-ipv4.patch @@ -0,0 +1,206 @@ +From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:43 2024 +From: Suraj Jitindar Singh +Date: Fri, 12 Jan 2024 16:42:54 -0800 +Subject: ipv6: remove max_size check inline with ipv4 +To: +Cc: , , , Jon Maxwell , Andrea Mayer , David Ahern , Jakub Kicinski , "Suraj Jitindar Singh" +Message-ID: <20240113004254.2416044-3-surajjs@amazon.com> + +From: Jon Maxwell + +commit af6d10345ca76670c1b7c37799f0d5576ccef277 upstream. + +In ip6_dst_gc() replace: + + if (entries > gc_thresh) + +With: + + if (entries > ops->gc_thresh) + +Sending IPv6 packets in a loop via a raw socket triggers an issue where a +route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly +consumes the IPv6 max_size threshold, which defaults to 4096, resulting in +these warnings: + +[1] 99.187805] dst_alloc: 7728 callbacks suppressed +[2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. +. +. +[300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. + +When this happens the packet is dropped and sendto() gets a "network is +unreachable" error: + +remaining pkt 200557 errno 101 +remaining pkt 196462 errno 101 +. +. +remaining pkt 126821 errno 101 + +Implement David Ahern's suggestion to remove the max_size check, seeing that IPv6 +has a GC to manage memory usage. IPv4 already does not check max_size. + +Here are some memory comparisons for IPv4 vs IPv6 with the patch: + +Test by running 5 instances of a program that sends UDP packets to a raw +socket 5000000 times. Compare IPv4 and IPv6 performance with a similar +program. 
+ +Ipv4: + +Before test: + +MemFree: 29427108 kB +Slab: 237612 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0 + +During test: + +MemFree: 29417608 kB +Slab: 247712 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0 + +After test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Ipv6 with patch: + +Errno 101 errors are not observed anymore with the patch. + +Before test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +During Test: + +MemFree: 29431516 kB +Slab: 240940 kB + +ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +After Test: + +MemFree: 29441816 kB +Slab: 238132 kB + +ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Tested-by: Andrea Mayer +Signed-off-by: Jon Maxwell +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Suraj Jitindar Singh +Cc: # 5.4.x +Signed-off-by: Greg Kroah-Hartman +--- + include/net/dst_ops.h | 2 +- + net/core/dst.c | 8 ++------ + net/ipv6/route.c | 13 +++++-------- + 3 files changed, 8 insertions(+), 15 deletions(-) + +--- a/include/net/dst_ops.h ++++ b/include/net/dst_ops.h +@@ -16,7 +16,7 @@ struct dst_ops { + unsigned short family; + unsigned int gc_thresh; + +- int (*gc)(struct dst_ops *ops); ++ void (*gc)(struct dst_ops *ops); + struct dst_entry * 
(*check)(struct dst_entry *, __u32 cookie); + unsigned int (*default_advmss)(const struct dst_entry *); + unsigned int (*mtu)(const struct dst_entry *); +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -83,12 +83,8 @@ void *dst_alloc(struct dst_ops *ops, str + + if (ops->gc && + !(flags & DST_NOCOUNT) && +- dst_entries_get_fast(ops) > ops->gc_thresh) { +- if (ops->gc(ops)) { +- pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); +- return NULL; +- } +- } ++ dst_entries_get_fast(ops) > ops->gc_thresh) ++ ops->gc(ops); + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -88,7 +88,7 @@ static struct dst_entry *ip6_negative_ad + static void ip6_dst_destroy(struct dst_entry *); + static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +-static int ip6_dst_gc(struct dst_ops *ops); ++static void ip6_dst_gc(struct dst_ops *ops); + + static int ip6_pkt_discard(struct sk_buff *skb); + static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); +@@ -3207,11 +3207,10 @@ out: + return dst; + } + +-static int ip6_dst_gc(struct dst_ops *ops) ++static void ip6_dst_gc(struct dst_ops *ops) + { + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; +- int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; +@@ -3219,11 +3218,10 @@ static int ip6_dst_gc(struct dst_ops *op + int entries; + + entries = dst_entries_get_fast(ops); +- if (entries > rt_max_size) ++ if (entries > ops->gc_thresh) + entries = dst_entries_get_slow(ops); + +- if (time_after(rt_last_gc + rt_min_interval, jiffies) && +- entries <= rt_max_size) ++ if (time_after(rt_last_gc + rt_min_interval, jiffies)) 
+ goto out; + + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); +@@ -3233,7 +3231,6 @@ static int ip6_dst_gc(struct dst_ops *op + out: + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); +- return entries > rt_max_size; + } + + static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, +@@ -6321,7 +6318,7 @@ static int __net_init ip6_route_net_init + #endif + + net->ipv6.sysctl.flush_delay = 0; +- net->ipv6.sysctl.ip6_rt_max_size = 4096; ++ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; diff --git a/queue-5.4/net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch b/queue-5.4/net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch new file mode 100644 index 00000000000..2d4b19a8692 --- /dev/null +++ b/queue-5.4/net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch @@ -0,0 +1,78 @@ +From SRS0=s4sW=IX=amazon.com=prvs=73518ea15=surajjs@kernel.org Sat Jan 13 01:43:43 2024 +From: Suraj Jitindar Singh +Date: Fri, 12 Jan 2024 16:42:52 -0800 +Subject: net/dst: use a smaller percpu_counter batch for dst entries accounting +To: +Cc: , , , Eric Dumazet , Jakub Kicinski , "Suraj Jitindar Singh" +Message-ID: <20240113004254.2416044-1-surajjs@amazon.com> + +From: Eric Dumazet + +commit cf86a086a18095e33e0637cb78cda1fcf5280852 upstream. + +percpu_counter_add() uses a default batch size which is quite big +on platforms with 256 cpus. (2*256 -> 512) + +This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) +(131072 on servers with 256 cpus) + +Reduce the batch size to something more reasonable, and +add logic to ip6_dst_gc() to call dst_entries_get_slow() +before calling the _very_ expensive fib6_run_gc() function. 
+ +Signed-off-by: Eric Dumazet +Signed-off-by: Jakub Kicinski +Signed-off-by: Suraj Jitindar Singh +Cc: # 5.4.x +Signed-off-by: Greg Kroah-Hartman +--- + include/net/dst_ops.h | 4 +++- + net/core/dst.c | 8 ++++---- + net/ipv6/route.c | 3 +++ + 3 files changed, 10 insertions(+), 5 deletions(-) + +--- a/include/net/dst_ops.h ++++ b/include/net/dst_ops.h +@@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(s + return percpu_counter_sum_positive(&dst->pcpuc_entries); + } + ++#define DST_PERCPU_COUNTER_BATCH 32 + static inline void dst_entries_add(struct dst_ops *dst, int val) + { +- percpu_counter_add(&dst->pcpuc_entries, val); ++ percpu_counter_add_batch(&dst->pcpuc_entries, val, ++ DST_PERCPU_COUNTER_BATCH); + } + + static inline int dst_entries_init(struct dst_ops *dst) +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, str + { + struct dst_entry *dst; + +- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { ++ if (ops->gc && ++ !(flags & DST_NOCOUNT) && ++ dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) { +- printk_ratelimited(KERN_NOTICE "Route cache is full: " +- "consider increasing sysctl " +- "net.ipv[4|6].route.max_size.\n"); ++ pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); + return NULL; + } + } +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -3218,6 +3218,9 @@ static int ip6_dst_gc(struct dst_ops *op + int entries; + + entries = dst_entries_get_fast(ops); ++ if (entries > rt_max_size) ++ entries = dst_entries_get_slow(ops); ++ + if (time_after(rt_last_gc + rt_min_interval, jiffies) && + entries <= rt_max_size) + goto out; diff --git a/queue-5.4/series b/queue-5.4/series index c3923afe8b4..a61b673aaed 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -33,3 +33,6 @@ net-tls-update-curr-on-splice-as-well.patch netfilter-nf_tables-reject-tables-of-unsupported-family.patch 
pci-extract-ats-disabling-to-a-helper-function.patch pci-disable-ats-for-specific-intel-ipu-e2000-devices.patch +net-dst-use-a-smaller-percpu_counter-batch-for-dst-entries-accounting.patch +ipv6-make-ip6_rt_gc_expire-an-atomic_t.patch +ipv6-remove-max_size-check-inline-with-ipv4.patch