]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
inet: add ip_local_port_step_width sysctl to improve port usage distribution
author Fernando Fernandez Mancera <fmancera@suse.de>
Mon, 9 Mar 2026 02:39:45 +0000 (03:39 +0100)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 11 Mar 2026 01:59:39 +0000 (18:59 -0700)
With the current port selection algorithm, ports that follow a reserved
port range or a port that has been in use for a long time are selected
more often than others [1]. This causes an uneven port usage
distribution. This combines with cloud environments blocking connections
between the application server and the database server if there was a
previous connection with the same source port, leading to connectivity
problems between applications on cloud environments.

The real issue here is that these firewalls cannot cope with
standards-compliant port reuse. This is a workaround for such situations
and an improvement on the distribution of ports selected.

The proposed solution is to implement a variant of RFC 6056 Algorithm 5.
The step size is selected randomly on every connect() call, ensuring it
is coprime with the size of the range of ports we want to scan. This
way, we can ensure that all ports within the range are scanned before
returning an error. To enable this algorithm, the user must configure
the new sysctl option "net.ipv4.ip_local_port_step_width".

In addition, the generated graphs show that the distribution of source
ports is more even with the proposed approach. [2]

[1] https://0xffsoftware.com/port_graph_current_alg.html

[2] https://0xffsoftware.com/port_graph_random_step_alg.html

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Link: https://patch.msgid.link/20260309023946.5473-2-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Documentation/networking/ip-sysctl.rst
Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
include/net/netns/ipv4.h
net/ipv4/inet_hashtables.c
net/ipv4/sysctl_net_ipv4.c

index 265158534cda53fb55f3a9f9a000221e4124b9b2..2e3a746fcc6da93799b0a4aa39952218dc4b4aa8 100644 (file)
@@ -1630,6 +1630,22 @@ ip_local_reserved_ports - list of comma separated ranges
 
        Default: Empty
 
+ip_local_port_step_width - INTEGER
+        Defines the numerical maximum increment between successive port
+        allocations within the ephemeral port range when an unavailable port is
+        reached. This can be used to mitigate accumulated nodes in port
+        distribution when reserved ports have been configured. Please note that
+        port collisions may be more frequent in a system with a very high load.
+
+        It is recommended to set this value strictly larger than the largest
+        contiguous block of ports configured in ip_local_reserved_ports. For
+        large reserved port ranges, setting this to 3x or 4x the size of the
+        largest block is advised. Using a value equal to or greater than the local
+        port range size completely solves the uneven port distribution problem,
+        but it can degrade performance under port exhaustion situations.
+
+        Default: 0 (disabled)
+
 ip_unprivileged_port_start - INTEGER
        This is a per-namespace sysctl.  It defines the first
        unprivileged port in the network namespace.  Privileged ports
index beaf1880a19bf4cfa578e162c571e09f7a9dffbe..cf284263e69b7ba86cd5aff869ceff6a18a6f87b 100644 (file)
@@ -52,6 +52,7 @@ u8                              sysctl_ip_fwd_update_priority
 u8                              sysctl_ip_nonlocal_bind
 u8                              sysctl_ip_autobind_reuse
 u8                              sysctl_ip_dynaddr
+u32                             sysctl_ip_local_port_step_width
 u8                              sysctl_ip_early_demux                                            read_mostly         ip(6)_rcv_finish_core
 u8                              sysctl_raw_l3mdev_accept
 u8                              sysctl_tcp_early_demux                                           read_mostly         ip(6)_rcv_finish_core
index 38624beff9b34161327d7549fcd0f041c9c91679..80ccd4dda8e0fcdde0b45956eeedd9b2242ab863 100644 (file)
@@ -166,6 +166,7 @@ struct netns_ipv4 {
        u8 sysctl_ip_autobind_reuse;
        /* Shall we try to damage output packets if routing dev changes? */
        u8 sysctl_ip_dynaddr;
+       u32 sysctl_ip_local_port_step_width;
 #ifdef CONFIG_NET_L3_MASTER_DEV
        u8 sysctl_raw_l3mdev_accept;
 #endif
index ac7b67c603b5791a65b772478acab2dea2bee7e3..13310c72b0bf42fa87879d1584394779625d0fab 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
 #include <linux/memblock.h>
+#include <linux/gcd.h>
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
@@ -1057,12 +1058,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct net *net = sock_net(sk);
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
+       int step, scan_step, l3mdev;
+       u32 index, max_rand_step;
        bool tb_created = false;
        u32 remaining, offset;
        int ret, i, low, high;
        bool local_ports;
-       int step, l3mdev;
-       u32 index;
 
        if (port) {
                local_bh_disable();
@@ -1076,6 +1077,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 
        local_ports = inet_sk_get_local_port_range(sk, &low, &high);
        step = local_ports ? 1 : 2;
+       scan_step = step;
+       max_rand_step = READ_ONCE(net->ipv4.sysctl_ip_local_port_step_width);
 
        high++; /* [32768, 60999] -> [32768, 61000[ */
        remaining = high - low;
@@ -1094,9 +1097,28 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
         */
        if (!local_ports)
                offset &= ~1U;
+
+       if (max_rand_step && remaining > 1) {
+               u32 range = remaining / step;
+               u32 upper_bound;
+
+               upper_bound = min(range, max_rand_step);
+               scan_step = get_random_u32_inclusive(1, upper_bound);
+               while (gcd(scan_step, range) != 1) {
+                       scan_step++;
+                       /* if both scan_step and range are even gcd won't be 1 */
+                       if (!(scan_step & 1) && !(range & 1))
+                               scan_step++;
+                       if (unlikely(scan_step > upper_bound)) {
+                               scan_step = 1;
+                               break;
+                       }
+               }
+               scan_step *= step;
+       }
 other_parity_scan:
        port = low + offset;
-       for (i = 0; i < remaining; i += step, port += step) {
+       for (i = 0; i < remaining; i += step, port += scan_step) {
                if (unlikely(port >= high))
                        port -= remaining;
                if (inet_is_local_reserved_port(net, port))
index 5654cc9c8a0b9e0cdf0e9a15ebe9948eaa6713b7..d8bdb1bdbff17ab52fe969b93ce13673e88c05a4 100644 (file)
@@ -823,6 +823,13 @@ static struct ctl_table ipv4_net_table[] = {
                .mode           = 0644,
                .proc_handler   = ipv4_local_port_range,
        },
+       {
+               .procname       = "ip_local_port_step_width",
+               .maxlen         = sizeof(u32),
+               .data           = &init_net.ipv4.sysctl_ip_local_port_step_width,
+               .mode           = 0644,
+               .proc_handler   = proc_douintvec,
+       },
        {
                .procname       = "ip_local_reserved_ports",
                .data           = &init_net.ipv4.sysctl_local_reserved_ports,