]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Implement IP_LOCAL_PORT_RANGE socket option for Linux
authorOndřej Surý <ondrej@isc.org>
Thu, 24 Jul 2025 09:43:14 +0000 (11:43 +0200)
committerOndřej Surý <ondrej@isc.org>
Fri, 20 Feb 2026 13:06:23 +0000 (14:06 +0100)
For Linux >= 6.8:

Since 2023, Linux has introduced a change to the IP_LOCAL_PORT_RANGE
socket option that eliminates the need for the random window
shifting (implemented as a fallback in the next commit).

By setting the IP_LOCAL_PORT_RANGE option, we tell the kernel to use a
better approach to source port selection.

For Linux < 6.8:

This implements port selection by randomly shifting a range, leveraging
the IP_LOCAL_PORT_RANGE socket option.  The network manager is initialized
with the ephemeral port range (on startup and on reconfig) and then, for
every outgoing TCP connection, we define a custom port range (1000
ports) and randomly shift that custom range within the system range.

This helps the kernel to reduce the search space to the custom window
between <random_offset, random_offset + 1000>.

Reference:
https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel

bin/named/server.c
lib/isc/include/isc/net.h
lib/isc/include/isc/netmgr.h
lib/isc/include/isc/os.h
lib/isc/managers.c
lib/isc/net.c
lib/isc/netmgr/netmgr-int.h
lib/isc/netmgr/netmgr.c
lib/isc/netmgr/socket.c
lib/isc/netmgr/tcp.c
lib/isc/os.c

index 006b90c33c1b48f8c7d6855dac01914dc0ad81c5..a881560a289069a05b130a7a17fb040c5f2b07ed 100644 (file)
@@ -7704,7 +7704,7 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
        dns_kasplist_t tmpkasplist, kasplist;
        dns_keystorelist_t tmpkeystorelist, keystorelist;
        dns_viewlist_t viewlist;
-       in_port_t listen_port, udpport_low, udpport_high;
+       in_port_t listen_port, port_low, port_high;
        int i, backlog;
        isc_interval_t interval;
        isc_logconfig_t *logc = NULL;
@@ -8048,39 +8048,26 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
        isc_portset_create(isc_g_mctx, &v4portset);
        isc_portset_create(isc_g_mctx, &v6portset);
 
-       result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
-       if (result != ISC_R_SUCCESS) {
-               isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
-                             ISC_LOG_ERROR,
-                             "get the default UDP/IPv4 port range: %s",
-                             isc_result_totext(result));
-               goto cleanup_portsets;
-       }
-
-       isc_portset_addrange(v4portset, udpport_low, udpport_high);
+       isc_net_getudpportrange(AF_INET, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET, port_low, port_high);
+       isc_portset_addrange(v4portset, port_low, port_high);
        if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) {
                isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
                              ISC_LOG_INFO,
                              "using default UDP/IPv4 port range: "
                              "[%d, %d]",
-                             udpport_low, udpport_high);
+                             port_low, port_high);
        }
 
-       result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
-       if (result != ISC_R_SUCCESS) {
-               isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
-                             ISC_LOG_ERROR,
-                             "get the default UDP/IPv6 port range: %s",
-                             isc_result_totext(result));
-               goto cleanup_portsets;
-       }
-       isc_portset_addrange(v6portset, udpport_low, udpport_high);
+       isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET6, port_low, port_high);
+       isc_portset_addrange(v6portset, port_low, port_high);
        if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) {
                isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
                              ISC_LOG_INFO,
                              "using default UDP/IPv6 port range: "
                              "[%d, %d]",
-                             udpport_low, udpport_high);
+                             port_low, port_high);
        }
 
        dns_dispatchmgr_setavailports(named_g_dispatchmgr, v4portset,
index 4c83db2a1b3958f8cd70f641f7aa516816bc1f81..f4062db18e6bb9915f034d7f39826e4ee48c1a1c 100644 (file)
@@ -216,7 +216,7 @@ isc_net_enableipv4(void);
 void
 isc_net_enableipv6(void);
 
-isc_result_t
+void
 isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
 /*%<
  * Returns system's default range of ephemeral UDP ports, if defined.
index c62231b6dd574ad89ed24699e18725119c6b29f9..99aed3dd91611551aeaf42fae541331d19991063 100644 (file)
@@ -946,3 +946,9 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock);
 /*%<
  * Return the local address of 'sock'.
  */
+
+void
+isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high);
+/*%<
+ * Set the ephemeral port range <low, high> for 'af' family.
+ */
index 85b5d80746af008887716265ac0fb518d2174a6b..ca17208db039710b3ddab78202f25175257678f9 100644 (file)
@@ -45,3 +45,10 @@ isc_os_umask(void);
 /*%<
  * Return umask of the current process as initialized at the program start
  */
+
+void
+isc_os_kernel(char **name, int *major, int *minor, int *patch);
+/*%<
+ * Store the running kernel name and version in name, major, minor, patch.
+ * A version component that is not available is set to -1.
+ */
index a01a8bac7dc7a794674648e0122fc47e751b2e75..6107620c1389bcb9b988b01f9db925d8cad7265e 100644 (file)
 
 void
 isc_managers_create(uint32_t workers) {
+       in_port_t port_low, port_high;
+
        isc_loopmgr_create(isc_g_mctx, workers);
        isc_netmgr_create(isc_g_mctx);
        isc_rwlock_setworkers(workers);
+
+       isc_net_getudpportrange(AF_INET, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET, port_low, port_high);
+
+       isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET6, port_low, port_high);
 }
 
 void
index d30072b7f09ff8a0b7069b1aaa9c4c81e7900453..5ac2866fe54947f4b77ecdc517994b230638a29e 100644 (file)
@@ -175,7 +175,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
 #endif /* HAVE_SYSCTLBYNAME */
 #endif /* USE_SYSCTL_PORTRANGE */
 
-isc_result_t
+void
 isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
        int result = ISC_R_FAILURE;
 #if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux)
@@ -214,8 +214,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
                *low = ISC_NET_PORTRANGELOW;
                *high = ISC_NET_PORTRANGEHIGH;
        }
-
-       return ISC_R_SUCCESS; /* we currently never fail in this function */
 }
 
 void
index 9ea93e089f6168316ff74169ab3845ef3ac0ef7f..cce8b690295a035e408b4ee2be928e9260448055 100644 (file)
@@ -358,6 +358,12 @@ typedef struct isc__netmgr {
        atomic_int_fast32_t send_udp_buffer_size;
        atomic_int_fast32_t recv_tcp_buffer_size;
        atomic_int_fast32_t send_tcp_buffer_size;
+
+       _Atomic(in_port_t) port_low4;
+       _Atomic(in_port_t) port_high4;
+       _Atomic(in_port_t) port_low6;
+       _Atomic(in_port_t) port_high6;
+
 } isc__netmgr_t;
 
 extern isc__netmgr_t *isc__netmgr;
@@ -1387,9 +1393,11 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family);
  */
 
 isc_result_t
-isc__nm_tcp_bind_no_port(uv_tcp_t *handle);
+isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
+                             sa_family_t sa_family ISC_ATTR_UNUSED);
 /*%<
- * Set IP_BIND_ADDRESS_NO_PORT on the socket (Linux only).
+ * Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket
+ * (Linux only).
  */
 
 void
index b3fcf33fb22f0abb017469f68f9be9d12405b63a..8dc4fcd4f27994b9b124f9d70d85167ff81b794e 100644 (file)
@@ -155,6 +155,7 @@ netmgr_teardown(void *arg ISC_ATTR_UNUSED) {
 void
 isc_netmgr_create(isc_mem_t *mctx) {
        isc__netmgr_t *netmgr = NULL;
+       in_port_t port_low, port_high;
 
 #ifdef MAXIMAL_UV_VERSION
        if (uv_version() > MAXIMAL_UV_VERSION) {
@@ -185,6 +186,11 @@ isc_netmgr_create(isc_mem_t *mctx) {
        atomic_init(&netmgr->send_tcp_buffer_size, 0);
        atomic_init(&netmgr->recv_udp_buffer_size, 0);
        atomic_init(&netmgr->send_udp_buffer_size, 0);
+       atomic_init(&netmgr->port_low4, 0);
+       atomic_init(&netmgr->port_high4, 65535);
+       atomic_init(&netmgr->port_low6, 0);
+       atomic_init(&netmgr->port_high6, 65535);
+
 #if HAVE_SO_REUSEPORT_LB
        netmgr->load_balance_sockets = true;
 #else
@@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx) {
        }
 
        isc__netmgr = netmgr;
+
+       /*
+        * Set the initial port range for IP_LOCAL_PORT_RANGE.
+        */
+       isc_net_getudpportrange(AF_INET, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET, port_low, port_high);
+
+       isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
+       isc_netmgr_portrange(AF_INET6, port_low, port_high);
 }
 
 /*
@@ -2898,6 +2913,23 @@ isc__networker_get(uint32_t tid) {
        return &isc__netmgr->workers[tid];
 }
 
+void
+isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high) {
+       REQUIRE(VALID_NM(isc__netmgr));
+       switch (af) {
+       case AF_INET:
+               atomic_store_relaxed(&isc__netmgr->port_low4, low);
+               atomic_store_relaxed(&isc__netmgr->port_high4, high);
+               break;
+       case AF_INET6:
+               atomic_store_relaxed(&isc__netmgr->port_low6, low);
+               atomic_store_relaxed(&isc__netmgr->port_high6, high);
+               break;
+       default:
+               INSIST(0);
+       }
+}
+
 #if ISC_NETMGR_TRACE
 /*
  * Dump all active sockets in netmgr. We output to stderr
index d75beb97b6a2bdffa2aaa48914b21d4a5751d5ee..5f4b86090f4e773fc469df6b3de003c2cc1d0a75 100644 (file)
  * information regarding copyright ownership.
  */
 
+#include <netinet/in.h>
+
 #include <isc/errno.h>
+#include <isc/result.h>
 #include <isc/uv.h>
 
 #include "netmgr-int.h"
@@ -370,17 +373,81 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
        return ISC_R_SUCCESS;
 }
 
+/*
+ * See
+ * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
+ * for the rationale.
+ */
+#define PORT_RANGE 1000
+
 isc_result_t
-isc__nm_tcp_bind_no_port(uv_tcp_t *handle ISC_ATTR_UNUSED) {
+isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
+                             sa_family_t af ISC_ATTR_UNUSED) {
 #ifdef IP_BIND_ADDRESS_NO_PORT
-       uv_os_sock_t fd = -1;
-
-       int r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
-       if (r < 0) {
+       if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
                return ISC_R_FAILURE;
        }
+#endif
 
-       if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
+#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
+       /*
+        * The option takes an uint32_t value with the high 16 bits
+        * set to the upper range bound, and the low 16 bits set to
+        * the lower range bound.  Range bounds are inclusive.  The
+        * 16-bit values should be in host byte order.
+        */
+       uint32_t port_range;
+       int major, minor;
+       isc_os_kernel(NULL, &major, &minor, NULL);
+
+       in_port_t port_low, port_high;
+       switch (af) {
+       case AF_INET:
+               port_low = isc__netmgr->port_low4;
+               port_high = isc__netmgr->port_high4;
+               break;
+       case AF_INET6:
+               port_low = isc__netmgr->port_low6;
+               port_high = isc__netmgr->port_high6;
+               break;
+       default:
+               INSIST(0);
+       }
+
+       /*
+        * Linux 6.8 implemented the following patch:
+        *
+        * If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
+        * port selection no longer favors even ports.
+        *
+        * This means that connect() can find a suitable source port
+        * faster, and applications can use a different split between
+        * connect() and bind() users.
+        */
+       if (major < 6 || (major == 6 && minor < 8)) {
+               /*
+                * On Linux < 6.8, use IP_LOCAL_PORT_RANGE to
+                * partition ephemeral port range randomly to help
+                * with the port selection.
+                */
+               if (port_high - port_low <= PORT_RANGE) {
+                       return ISC_R_RANGE;
+               }
+
+               /*
+                * port_low <= N < port_high - PORT_RANGE
+                */
+               port_high -= PORT_RANGE;
+               port_low += isc_random_uniform(port_high - port_low);
+               port_high = port_low + PORT_RANGE;
+       }
+       INSIST(port_low > 0);
+       INSIST(port_low < port_high);
+
+       port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
+       if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
+                      sizeof(port_range)) == -1)
+       {
                return ISC_R_FAILURE;
        }
 #endif
index 7dd7184cfeaf40d0cae26bacce7299bc97b12b4b..7a3a357fe7fa02e4645de3a88b4f93615b7515f3 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <libgen.h>
+#include <string.h>
 #include <unistd.h>
 
 #include <isc/async.h>
@@ -141,8 +142,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
        }
        isc__nm_incstats(sock, STATID_OPEN);
 
-       isc__nm_tcp_bind_no_port(&sock->uv_handle.tcp);
-
        if (req->local.length != 0) {
                r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0);
                if (r != 0) {
@@ -291,6 +290,15 @@ isc_nm_tcpconnect(isc_sockaddr_t *local, isc_sockaddr_t *peer,
 
        (void)isc__nm_socket_min_mtu(sock->fd, sa_family);
        (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);
+       result = isc__nm_socket_max_port_range(sock->fd, sa_family);
+       if (result != ISC_R_SUCCESS) {
+               isc__nmsocket_log(sock, ISC_LOG_DEBUG(99),
+                                 "setting up IP_BIND_ADDRESS_NO_PORT or "
+                                 "IP_LOCAL_PORT_RANGE failed: %s\n",
+                                 result == ISC_R_RANGE
+                                         ? isc_result_totext(result)
+                                         : strerror(errno));
+       }
 
        sock->active = true;
 
index 8c3ced67c4bbc398f6cb289669b1791051bb7e82..741558d753e1ee809b33b92b293a8c7f2e402acc 100644 (file)
  * information regarding copyright ownership.
  */
 
+#include <ctype.h>
 #include <inttypes.h>
 #include <sys/stat.h>
+#include <sys/utsname.h>
 
 #include <isc/os.h>
+#include <isc/string.h>
 #include <isc/types.h>
 #include <isc/util.h>
 #include <isc/uv.h>
@@ -25,6 +28,8 @@
 static unsigned int isc__os_ncpus = 0;
 static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE;
 static mode_t isc__os_umask = 0;
+static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1;
+static char kernel_name[64];
 
 /*
  * The affinity support for non-Linux is in the review in the upstream
@@ -177,6 +182,19 @@ umask_initialize(void) {
        (void)umask(isc__os_umask);
 }
 
+static void
+kernel_initialize(void) {
+       struct utsname buffer;
+
+       if (uname(&buffer) == -1) {
+               return;
+       }
+
+       (void)sscanf(buffer.release, "%d.%d.%d", &kernel_major, &kernel_minor,
+                    &kernel_patch);
+       (void)strlcpy(kernel_name, buffer.sysname, sizeof(kernel_name));
+}
+
 unsigned int
 isc_os_ncpus(void) {
        return isc__os_ncpus;
@@ -192,10 +210,19 @@ isc_os_umask(void) {
        return isc__os_umask;
 }
 
+void
+isc_os_kernel(char **name, int *major, int *minor, int *patch) {
+       SET_IF_NOT_NULL(name, kernel_name)
+       SET_IF_NOT_NULL(major, kernel_major);
+       SET_IF_NOT_NULL(minor, kernel_minor);
+       SET_IF_NOT_NULL(patch, kernel_patch);
+}
+
 void
 isc__os_initialize(void) {
        umask_initialize();
        ncpus_initialize();
+       kernel_initialize();
 #if defined(_SC_LEVEL1_DCACHE_LINESIZE)
        long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
        if (s > 0 && (unsigned long)s > isc__os_cacheline) {