--- /dev/null
+From stable+bounces-78550-greg=kroah.com@vger.kernel.org Tue Oct 1 17:09:18 2024
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 1 Oct 2024 15:04:03 +0000
+Subject: icmp: Add counters for rate limits
+To: stable@vger.kernel.org
+Cc: Jamie Bainbridge <jamie.bainbridge@gmail.com>, Abhishek Rawal <rawal.abhishek92@gmail.com>, Paolo Abeni <pabeni@redhat.com>, Eric Dumazet <edumazet@google.com>
+Message-ID: <20241001150404.2176005-1-edumazet@google.com>
+
+From: Jamie Bainbridge <jamie.bainbridge@gmail.com>
+
+commit d0941130c93515411c8d66fc22bdae407b509a6d upstream.
+
+There are multiple ICMP rate limiting mechanisms:
+
+* Global limits: net.ipv4.icmp_msgs_burst/icmp_msgs_per_sec
+* v4 per-host limits: net.ipv4.icmp_ratelimit/ratemask
+* v6 per-host limits: net.ipv6.icmp_ratelimit/ratemask
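+
+For reference, each limit is tunable through /proc/sys. A minimal
+user-space sketch reading the v4 per-host limit (plain C, not part of
+this patch):
+
+	/* Print net.ipv4.icmp_ratelimit (minimum gap between replies, in ms). */
+	#include <stdio.h>
+
+	int main(void)
+	{
+		FILE *f = fopen("/proc/sys/net/ipv4/icmp_ratelimit", "r");
+		char buf[64];
+
+		if (!f)
+			return 1;
+		if (fgets(buf, sizeof(buf), f))
+			printf("icmp_ratelimit: %s", buf);
+		fclose(f);
+		return 0;
+	}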
+
+However, when ICMP output is limited, there is no way to tell
+which limit has been hit or even if the limits are responsible
+for the lack of ICMP output.
+
+Add counters for each of the cases above. As we are within
+local_bh_disable(), use the __INC stats variant.
+
+Example output:
+
+ # nstat -sz "*RateLimit*"
+ IcmpOutRateLimitGlobal          134                0.0
+ IcmpOutRateLimitHost            770                0.0
+ Icmp6OutRateLimitHost           84                 0.0
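+
+The new counters are also appended to the Icmp line of /proc/net/snmp
+(and Icmp6OutRateLimitHost to /proc/net/snmp6); see the proc.c hunks
+below. A minimal sketch for hosts without nstat (plain C, not part of
+this patch):
+
+	/* Dump the Icmp header/value lines from /proc/net/snmp; the new
+	 * OutRateLimitGlobal/OutRateLimitHost fields follow OutErrors.
+	 */
+	#include <stdio.h>
+	#include <string.h>
+
+	int main(void)
+	{
+		char line[1024];
+		FILE *f = fopen("/proc/net/snmp", "r");
+
+		if (!f)
+			return 1;
+		while (fgets(line, sizeof(line), f))
+			if (!strncmp(line, "Icmp:", 5))
+				fputs(line, stdout);
+		fclose(f);
+		return 0;
+	}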
+
+Signed-off-by: Jamie Bainbridge <jamie.bainbridge@gmail.com>
+Suggested-by: Abhishek Rawal <rawal.abhishek92@gmail.com>
+Link: https://lore.kernel.org/r/273b32241e6b7fdc5c609e6f5ebc68caf3994342.1674605770.git.jamie.bainbridge@gmail.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/snmp.h |    3 +++
+ net/ipv4/icmp.c           |    3 +++
+ net/ipv4/proc.c           |    8 +++++---
+ net/ipv6/icmp.c           |    4 ++++
+ net/ipv6/proc.c           |    1 +
+ 5 files changed, 16 insertions(+), 3 deletions(-)
+
+--- a/include/uapi/linux/snmp.h
++++ b/include/uapi/linux/snmp.h
+@@ -95,6 +95,8 @@ enum
+ ICMP_MIB_OUTADDRMASKS, /* OutAddrMasks */
+ ICMP_MIB_OUTADDRMASKREPS, /* OutAddrMaskReps */
+ ICMP_MIB_CSUMERRORS, /* InCsumErrors */
++ ICMP_MIB_RATELIMITGLOBAL, /* OutRateLimitGlobal */
++ ICMP_MIB_RATELIMITHOST, /* OutRateLimitHost */
+ __ICMP_MIB_MAX
+ };
+
+@@ -112,6 +114,7 @@ enum
+ ICMP6_MIB_OUTMSGS, /* OutMsgs */
+ ICMP6_MIB_OUTERRORS, /* OutErrors */
+ ICMP6_MIB_CSUMERRORS, /* InCsumErrors */
++ ICMP6_MIB_RATELIMITHOST, /* OutRateLimitHost */
+ __ICMP6_MIB_MAX
+ };
+
+--- a/net/ipv4/icmp.c
++++ b/net/ipv4/icmp.c
+@@ -297,6 +297,7 @@ static bool icmpv4_global_allow(struct n
+ if (icmp_global_allow())
+ return true;
+
++ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+ }
+
+@@ -326,6 +327,8 @@ static bool icmpv4_xrlim_allow(struct ne
+ if (peer)
+ inet_putpeer(peer);
+ out:
++ if (!rc)
++ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+ return rc;
+ }
+
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -352,7 +352,7 @@ static void icmp_put(struct seq_file *se
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " In%s", icmpmibmap[i].name);
+- seq_puts(seq, " OutMsgs OutErrors");
++ seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
+@@ -362,9 +362,11 @@ static void icmp_put(struct seq_file *se
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + icmpmibmap[i].index));
+- seq_printf(seq, " %lu %lu",
++ seq_printf(seq, " %lu %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
++ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
++ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
++ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+--- a/net/ipv6/icmp.c
++++ b/net/ipv6/icmp.c
+@@ -183,6 +183,7 @@ static bool icmpv6_global_allow(struct n
+ if (icmp_global_allow())
+ return true;
+
++ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+ }
+
+@@ -224,6 +225,9 @@ static bool icmpv6_xrlim_allow(struct so
+ if (peer)
+ inet_putpeer(peer);
+ }
++ if (!res)
++ __ICMP6_INC_STATS(net, ip6_dst_idev(dst),
++ ICMP6_MIB_RATELIMITHOST);
+ dst_release(dst);
+ return res;
+ }
+--- a/net/ipv6/proc.c
++++ b/net/ipv6/proc.c
+@@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6
+ SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
+ SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
+ SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
++ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST),
+ SNMP_MIB_SENTINEL
+ };
+
--- /dev/null
+From stable+bounces-78551-greg=kroah.com@vger.kernel.org Tue Oct 1 17:09:19 2024
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 1 Oct 2024 15:04:04 +0000
+Subject: icmp: change the order of rate limits
+To: stable@vger.kernel.org
+Cc: Eric Dumazet <edumazet@google.com>, Keyu Man <keyu.man@email.ucr.edu>, David Ahern <dsahern@kernel.org>, Jesper Dangaard Brouer <hawk@kernel.org>, Jakub Kicinski <kuba@kernel.org>
+Message-ID: <20241001150404.2176005-2-edumazet@google.com>
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 8c2bd38b95f75f3d2a08c93e35303e26d480d24e upstream.
+
+ICMP messages are rate limited:
+
+After the blamed commits, the two rate limiters are applied in this order:
+
+1) Host-wide rate limit (icmp_global_allow())
+
+2) Per-destination rate limit (inetpeer based)
+
+To avoid side-channel attacks, we need to apply
+the per-destination check first.
+
+This patch makes the following changes:
+
+1) icmp_global_allow() checks if the host-wide limit is reached,
+   but credits are not yet consumed; consumption is deferred to 3).
+
+2) The per-destination limit is checked/updated.
+   This might add a new node to the inetpeer tree.
+
+3) icmp_global_consume() consumes tokens if prior operations succeeded.
+
+This means that the host-wide rate limit is still effective
+in keeping the inetpeer tree small, even under DDoS.
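+
+A condensed user-space model of the resulting order (stub predicates
+with hypothetical names; the real checks are in the diff below):
+
+	#include <stdbool.h>
+
+	/* 1) check the host-wide budget, do not consume credits yet */
+	static bool global_allow(bool *apply_ratelimit)
+	{ *apply_ratelimit = true; return true; }
+	/* 2) per-destination (inetpeer based) limit */
+	static bool per_dest_allow(void) { return true; }
+	/* 3) consume global credits only once both checks passed */
+	static void global_consume(void) { }
+
+	static bool may_send_icmp(void)
+	{
+		bool apply_ratelimit = false;
+
+		if (!global_allow(&apply_ratelimit))
+			return false;		/* host-wide budget exhausted */
+		if (apply_ratelimit && !per_dest_allow())
+			return false;		/* per-destination limit hit */
+		if (apply_ratelimit)
+			global_consume();
+		return true;
+	}
+
+	int main(void) { return !may_send_icmp(); }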
+
+As a bonus, I removed icmp_global.lock as the fast path
+can use a lock-free operation.
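+
+The lock-free fast path can be modeled in user space with C11 atomics;
+a sketch mirroring the new icmp_global_allow()/icmp_global_consume()
+below (not the kernel source itself):
+
+	#include <stdatomic.h>
+	#include <stdbool.h>
+	#include <stdlib.h>
+
+	#define HZ 100
+	static int msgs_per_sec = 1000, msgs_burst = 50;
+
+	static struct {
+		atomic_int credit;
+		atomic_uint stamp;		/* time of last refill */
+	} bucket;
+
+	static bool bucket_allow(unsigned int now)
+	{
+		unsigned int oldstamp, delta;
+		int incr, old, new;
+
+		if (atomic_load(&bucket.credit) > 0)
+			return true;		/* fast path: no lock taken */
+
+		oldstamp = atomic_load(&bucket.stamp);
+		delta = now - oldstamp;
+		if (delta > HZ)
+			delta = HZ;
+		if (delta < HZ / 50)
+			return false;		/* refilled too recently */
+		incr = msgs_per_sec * delta / HZ;
+		if (!incr)
+			return false;
+		/* only the caller that wins the stamp update refills */
+		if (atomic_compare_exchange_strong(&bucket.stamp, &oldstamp, now)) {
+			old = atomic_load(&bucket.credit);
+			do {
+				new = old + incr > msgs_burst ? msgs_burst : old + incr;
+			} while (!atomic_compare_exchange_weak(&bucket.credit, &old, new));
+		}
+		return true;
+	}
+
+	static void bucket_consume(void)
+	{
+		int credits = rand() % 3;	/* kernel: get_random_u32_below(3) */
+
+		if (credits)			/* may drive credit negative */
+			atomic_fetch_sub(&bucket.credit, credits);
+	}
+
+	int main(void)
+	{
+		if (bucket_allow(1))
+			bucket_consume();
+		return 0;
+	}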
+
+Fixes: c0303efeab73 ("net: reduce cycles spend on ICMP replies that gets rate limited")
+Fixes: 4cdf507d5452 ("icmp: add a global rate limitation")
+Reported-by: Keyu Man <keyu.man@email.ucr.edu>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Cc: Jesper Dangaard Brouer <hawk@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://patch.msgid.link/20240829144641.3880376-2-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/ip.h |   2 +
+ net/ipv4/icmp.c  | 103 +++++++++++++++++++++++++++++--------------------------
+ net/ipv6/icmp.c  |  28 +++++++++-----
+ 3 files changed, 76 insertions(+), 57 deletions(-)
+
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -781,6 +781,8 @@ static inline void ip_cmsg_recv(struct m
+ }
+
+ bool icmp_global_allow(void);
++void icmp_global_consume(void);
++
+ extern int sysctl_icmp_msgs_per_sec;
+ extern int sysctl_icmp_msgs_burst;
+
+--- a/net/ipv4/icmp.c
++++ b/net/ipv4/icmp.c
+@@ -222,57 +222,59 @@ int sysctl_icmp_msgs_per_sec __read_most
+ int sysctl_icmp_msgs_burst __read_mostly = 50;
+
+ static struct {
+- spinlock_t lock;
+- u32 credit;
++ atomic_t credit;
+ u32 stamp;
+-} icmp_global = {
+- .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+-};
++} icmp_global;
+
+ /**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+- * Note: called with BH disabled
++ * Works in tandem with icmp_global_consume().
+ */
+ bool icmp_global_allow(void)
+ {
+- u32 credit, delta, incr = 0, now = (u32)jiffies;
+- bool rc = false;
++ u32 delta, now, oldstamp;
++ int incr, new, old;
+
+- /* Check if token bucket is empty and cannot be refilled
+- * without taking the spinlock. The READ_ONCE() are paired
+- * with the following WRITE_ONCE() in this same function.
++ /* Note: many cpus could find this condition true.
++ * Then later icmp_global_consume() could consume more credits,
++ * this is an acceptable race.
+ */
+- if (!READ_ONCE(icmp_global.credit)) {
+- delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
+- if (delta < HZ / 50)
+- return false;
+- }
++ if (atomic_read(&icmp_global.credit) > 0)
++ return true;
+
+- spin_lock(&icmp_global.lock);
+- delta = min_t(u32, now - icmp_global.stamp, HZ);
+- if (delta >= HZ / 50) {
+- incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
+- if (incr)
+- WRITE_ONCE(icmp_global.stamp, now);
+- }
+- credit = min_t(u32, icmp_global.credit + incr,
+- READ_ONCE(sysctl_icmp_msgs_burst));
+- if (credit) {
+- /* We want to use a credit of one in average, but need to randomize
+- * it for security reasons.
+- */
+- credit = max_t(int, credit - prandom_u32_max(3), 0);
+- rc = true;
++ now = jiffies;
++ oldstamp = READ_ONCE(icmp_global.stamp);
++ delta = min_t(u32, now - oldstamp, HZ);
++ if (delta < HZ / 50)
++ return false;
++
++ incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
++ if (!incr)
++ return false;
++
++ if (cmpxchg(&icmp_global.stamp, oldstamp, now) == oldstamp) {
++ old = atomic_read(&icmp_global.credit);
++ do {
++ new = min(old + incr, READ_ONCE(sysctl_icmp_msgs_burst));
++ } while (!atomic_try_cmpxchg(&icmp_global.credit, &old, new));
+ }
+- WRITE_ONCE(icmp_global.credit, credit);
+- spin_unlock(&icmp_global.lock);
+- return rc;
++ return true;
+ }
+ EXPORT_SYMBOL(icmp_global_allow);
+
++void icmp_global_consume(void)
++{
++ int credits = get_random_u32_below(3);
++
++ /* Note: this might make icmp_global.credit negative. */
++ if (credits)
++ atomic_sub(credits, &icmp_global.credit);
++}
++EXPORT_SYMBOL(icmp_global_consume);
++
+ static bool icmpv4_mask_allow(struct net *net, int type, int code)
+ {
+ if (type > NR_ICMP_TYPES)
+@@ -289,14 +291,16 @@ static bool icmpv4_mask_allow(struct net
+ return false;
+ }
+
+-static bool icmpv4_global_allow(struct net *net, int type, int code)
++static bool icmpv4_global_allow(struct net *net, int type, int code,
++ bool *apply_ratelimit)
+ {
+ if (icmpv4_mask_allow(net, type, code))
+ return true;
+
+- if (icmp_global_allow())
++ if (icmp_global_allow()) {
++ *apply_ratelimit = true;
+ return true;
+-
++ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+ }
+@@ -306,15 +310,16 @@ static bool icmpv4_global_allow(struct n
+ */
+
+ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+- struct flowi4 *fl4, int type, int code)
++ struct flowi4 *fl4, int type, int code,
++ bool apply_ratelimit)
+ {
+ struct dst_entry *dst = &rt->dst;
+ struct inet_peer *peer;
+ bool rc = true;
+ int vif;
+
+- if (icmpv4_mask_allow(net, type, code))
+- goto out;
++ if (!apply_ratelimit)
++ return true;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+@@ -329,6 +334,8 @@ static bool icmpv4_xrlim_allow(struct ne
+ out:
+ if (!rc)
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
++ else
++ icmp_global_consume();
+ return rc;
+ }
+
+@@ -400,6 +407,7 @@ static void icmp_reply(struct icmp_bxm *
+ struct ipcm_cookie ipc;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
++ bool apply_ratelimit = false;
+ struct flowi4 fl4;
+ struct sock *sk;
+ struct inet_sock *inet;
+@@ -411,11 +419,11 @@ static void icmp_reply(struct icmp_bxm *
+ if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
+ return;
+
+- /* Needed by both icmp_global_allow and icmp_xmit_lock */
++ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+- /* global icmp_msgs_per_sec */
+- if (!icmpv4_global_allow(net, type, code))
++ /* is global icmp_msgs_per_sec exhausted ? */
++ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit))
+ goto out_bh_enable;
+
+ sk = icmp_xmit_lock(net);
+@@ -448,7 +456,7 @@ static void icmp_reply(struct icmp_bxm *
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+- if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
++ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
+ ip_rt_put(rt);
+ out_unlock:
+@@ -592,6 +600,7 @@ void __icmp_send(struct sk_buff *skb_in,
+ int room;
+ struct icmp_bxm icmp_param;
+ struct rtable *rt = skb_rtable(skb_in);
++ bool apply_ratelimit = false;
+ struct ipcm_cookie ipc;
+ struct flowi4 fl4;
+ __be32 saddr;
+@@ -673,7 +682,7 @@ void __icmp_send(struct sk_buff *skb_in,
+ }
+ }
+
+- /* Needed by both icmp_global_allow and icmp_xmit_lock */
++ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+@@ -681,7 +690,7 @@ void __icmp_send(struct sk_buff *skb_in,
+ * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
+ */
+ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+- !icmpv4_global_allow(net, type, code))
++ !icmpv4_global_allow(net, type, code, &apply_ratelimit))
+ goto out_bh_enable;
+
+ sk = icmp_xmit_lock(net);
+@@ -740,7 +749,7 @@ void __icmp_send(struct sk_buff *skb_in,
+ goto out_unlock;
+
+ /* peer icmp_ratelimit */
+- if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
++ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+--- a/net/ipv6/icmp.c
++++ b/net/ipv6/icmp.c
+@@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net
+ return false;
+ }
+
+-static bool icmpv6_global_allow(struct net *net, int type)
++static bool icmpv6_global_allow(struct net *net, int type,
++ bool *apply_ratelimit)
+ {
+ if (icmpv6_mask_allow(net, type))
+ return true;
+
+- if (icmp_global_allow())
++ if (icmp_global_allow()) {
++ *apply_ratelimit = true;
+ return true;
+-
++ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+ }
+@@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct n
+ * Check the ICMP output rate limit
+ */
+ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+- struct flowi6 *fl6)
++ struct flowi6 *fl6, bool apply_ratelimit)
+ {
+ struct net *net = sock_net(sk);
+ struct dst_entry *dst;
+ bool res = false;
+
+- if (icmpv6_mask_allow(net, type))
++ if (!apply_ratelimit)
+ return true;
+
+ /*
+@@ -228,6 +230,8 @@ static bool icmpv6_xrlim_allow(struct so
+ if (!res)
+ __ICMP6_INC_STATS(net, ip6_dst_idev(dst),
+ ICMP6_MIB_RATELIMITHOST);
++ else
++ icmp_global_consume();
+ dst_release(dst);
+ return res;
+ }
+@@ -454,6 +458,7 @@ void icmp6_send(struct sk_buff *skb, u8
+ struct net *net;
+ struct ipv6_pinfo *np;
+ const struct in6_addr *saddr = NULL;
++ bool apply_ratelimit = false;
+ struct dst_entry *dst;
+ struct icmp6hdr tmp_hdr;
+ struct flowi6 fl6;
+@@ -535,11 +540,12 @@ void icmp6_send(struct sk_buff *skb, u8
+ return;
+ }
+
+- /* Needed by both icmp_global_allow and icmpv6_xmit_lock */
++ /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+- if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
++ if (!(skb->dev->flags & IFF_LOOPBACK) &&
++ !icmpv6_global_allow(net, type, &apply_ratelimit))
+ goto out_bh_enable;
+
+ mip6_addr_swap(skb, parm);
+@@ -577,7 +583,7 @@ void icmp6_send(struct sk_buff *skb, u8
+
+ np = inet6_sk(sk);
+
+- if (!icmpv6_xrlim_allow(sk, type, &fl6))
++ if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
+ goto out;
+
+ tmp_hdr.icmp6_type = type;
+@@ -719,6 +725,7 @@ static void icmpv6_echo_reply(struct sk_
+ struct ipv6_pinfo *np;
+ const struct in6_addr *saddr = NULL;
+ struct icmp6hdr *icmph = icmp6_hdr(skb);
++ bool apply_ratelimit = false;
+ struct icmp6hdr tmp_hdr;
+ struct flowi6 fl6;
+ struct icmpv6_msg msg;
+@@ -782,8 +789,9 @@ static void icmpv6_echo_reply(struct sk_
+ goto out;
+
+ /* Check the ratelimit */
+- if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) ||
+- !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6))
++ if ((!(skb->dev->flags & IFF_LOOPBACK) &&
++ !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) ||
++ !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit))
+ goto out_dst_release;
+
+ idev = __in6_dev_get(skb->dev);