]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Implement IP_LOCAL_PORT_RANGE socket option for Linux
authorOndřej Surý <ondrej@isc.org>
Thu, 24 Jul 2025 09:43:14 +0000 (11:43 +0200)
committerOndřej Surý <ondrej@isc.org>
Fri, 20 Feb 2026 16:02:45 +0000 (17:02 +0100)
For Linux >= 6.8:

Since 2023, Linux has included a change to the IP_LOCAL_PORT_RANGE
socket option that eliminates the need for random window
shifting (implemented as a fallback in the next commit).

By setting the IP_LOCAL_PORT_RANGE option, we tell the kernel to use a
better approach to source port selection.

For Linux < 6.8:

This implements port selection by randomly shifting a port range,
leveraging the IP_LOCAL_PORT_RANGE socket option.  The network manager is
initialized with the ephemeral port range (on startup and on reconfig);
then, for every outgoing TCP connection, we define a custom port range
(1000 ports) and randomly shift that range within the system range.

This helps the kernel reduce the search space to the custom window
<random_offset, random_offset + 1000>.

Reference:
https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
(cherry picked from commit 04c81b55d2debfc9a401dca2c4ac4e4cffd9cce1)

bin/named/server.c
lib/isc/include/isc/netmgr.h
lib/isc/include/isc/os.h
lib/isc/netmgr/netmgr-int.h
lib/isc/netmgr/netmgr.c
lib/isc/netmgr/socket.c
lib/isc/netmgr/tcp.c
lib/isc/os.c

index 6e919d872aae99f9e8086065bbc7d725418b606d..3ea6f5d4058666da5d59769130f55bb443b5dbda 100644 (file)
@@ -8419,7 +8419,7 @@ load_configuration(const char *filename, named_server_t *server,
        dns_view_t *view_next = NULL;
        dns_viewlist_t tmpviewlist;
        dns_viewlist_t viewlist, builtin_viewlist;
-       in_port_t listen_port, udpport_low, udpport_high;
+       in_port_t listen_port, port_low, port_high;
        int i, backlog;
        isc_interval_t interval;
        isc_logconfig_t *logc = NULL;
@@ -8849,28 +8849,18 @@ load_configuration(const char *filename, named_server_t *server,
        if (usev4ports != NULL) {
                portset_fromconf(v4portset, usev4ports, true);
        } else {
-               result = isc_net_getudpportrange(AF_INET, &udpport_low,
-                                                &udpport_high);
-               if (result != ISC_R_SUCCESS) {
-                       isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
-                                     NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR,
-                                     "get the default UDP/IPv4 port range: %s",
-                                     isc_result_totext(result));
-                       goto cleanup_v6portset;
-               }
-
-               if (udpport_low == udpport_high) {
-                       isc_portset_add(v4portset, udpport_low);
+               isc_net_getudpportrange(AF_INET, &port_low, &port_high);
+               if (port_low == port_high) {
+                       isc_portset_add(v4portset, port_low);
                } else {
-                       isc_portset_addrange(v4portset, udpport_low,
-                                            udpport_high);
+                       isc_portset_addrange(v4portset, port_low, port_high);
                }
                if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) {
                        isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
                                      NAMED_LOGMODULE_SERVER, ISC_LOG_INFO,
                                      "using default UDP/IPv4 port range: "
                                      "[%d, %d]",
-                                     udpport_low, udpport_high);
+                                     port_low, port_high);
                }
        }
        (void)named_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports);
@@ -8882,27 +8872,18 @@ load_configuration(const char *filename, named_server_t *server,
        if (usev6ports != NULL) {
                portset_fromconf(v6portset, usev6ports, true);
        } else {
-               result = isc_net_getudpportrange(AF_INET6, &udpport_low,
-                                                &udpport_high);
-               if (result != ISC_R_SUCCESS) {
-                       isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
-                                     NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR,
-                                     "get the default UDP/IPv6 port range: %s",
-                                     isc_result_totext(result));
-                       goto cleanup_v6portset;
-               }
-               if (udpport_low == udpport_high) {
-                       isc_portset_add(v6portset, udpport_low);
+               isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
+               if (port_low == port_high) {
+                       isc_portset_add(v6portset, port_low);
                } else {
-                       isc_portset_addrange(v6portset, udpport_low,
-                                            udpport_high);
+                       isc_portset_addrange(v6portset, port_low, port_high);
                }
                if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) {
                        isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
                                      NAMED_LOGMODULE_SERVER, ISC_LOG_INFO,
                                      "using default UDP/IPv6 port range: "
                                      "[%d, %d]",
-                                     udpport_low, udpport_high);
+                                     port_low, port_high);
                }
        }
        (void)named_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports);
index f6d656fff8a5589ecd435cc13492b9e3635d349c..1f574ea4037bfdfa79e811070e33fead89d189a5 100644 (file)
@@ -897,3 +897,10 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock);
 /*%<
  * Return the local address of 'sock'.
  */
+
+void
+isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low,
+                    in_port_t high);
+/*%<
+ * Set the ephemeral port range <low, high> for 'af' family.
+ */
index 32770b992f9597917a1ebf7e0ae2d6b67320b8d4..60b32d1c06231458f119d1255f59290b2873e5e9 100644 (file)
@@ -49,4 +49,11 @@ isc_os_umask(void);
  * Return umask of the current process as initialized at the program start
  */
 
+void
+isc_os_kernel(char **name, int *major, int *minor, int *patch);
+/*%<
+ * Store the running kernel name and version in 'name', 'major', 'minor'
+ * and 'patch'.  A version component that is not available is set to -1.
+ */
+
 ISC_LANG_ENDDECLS
index 85c96a8dea7c34999da2380f79f6899e6b1f261e..8b36afa2e9b8292cf4f1852050139f2d64f6c720 100644 (file)
@@ -366,6 +366,11 @@ struct isc_nm {
        atomic_int_fast32_t send_udp_buffer_size;
        atomic_int_fast32_t recv_tcp_buffer_size;
        atomic_int_fast32_t send_tcp_buffer_size;
+
+       _Atomic(in_port_t) port_low4;
+       _Atomic(in_port_t) port_high4;
+       _Atomic(in_port_t) port_low6;
+       _Atomic(in_port_t) port_high6;
 };
 
 /*%
@@ -1374,9 +1379,12 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family);
  */
 
 isc_result_t
-isc__nm_tcp_bind_no_port(uv_tcp_t *handle);
+isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
+                             sa_family_t sa_family ISC_ATTR_UNUSED,
+                             in_port_t port_low, in_port_t port_high);
 /*%<
- * Set IP_BIND_ADDRESS_NO_PORT on the socket (Linux only).
+ * Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket
+ * (Linux only).
  */
 
 void
index f8c3643e79ba1f1642aa47de6471166d1ed076fc..70471465c7cd38f534d7e2cc8fcb5a79ec80020a 100644 (file)
@@ -155,6 +155,7 @@ netmgr_teardown(void *arg) {
 void
 isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
        isc_nm_t *netmgr = NULL;
+       in_port_t port_low, port_high;
 
 #ifdef MAXIMAL_UV_VERSION
        if (uv_version() > MAXIMAL_UV_VERSION) {
@@ -186,6 +187,11 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
        atomic_init(&netmgr->send_tcp_buffer_size, 0);
        atomic_init(&netmgr->recv_udp_buffer_size, 0);
        atomic_init(&netmgr->send_udp_buffer_size, 0);
+       atomic_init(&netmgr->port_low4, 0);
+       atomic_init(&netmgr->port_high4, 65535);
+       atomic_init(&netmgr->port_low6, 0);
+       atomic_init(&netmgr->port_high6, 65535);
+
 #if HAVE_SO_REUSEPORT_LB
        netmgr->load_balance_sockets = true;
 #else
@@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
        }
 
        *netmgrp = netmgr;
+
+       /*
+        * Set the initial port range for IP_LOCAL_PORT_RANGE.
+        */
+       isc_net_getudpportrange(AF_INET, &port_low, &port_high);
+       isc_netmgr_portrange(netmgr, AF_INET, port_low, port_high);
+
+       isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
+       isc_netmgr_portrange(netmgr, AF_INET6, port_low, port_high);
 }
 
 /*
@@ -2838,6 +2853,24 @@ isc_nm_proxyheader_info_init_complete(isc_nm_proxyheader_info_t *restrict info,
                                             .complete_header = *header_data };
 }
 
+void
+isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low,
+                    in_port_t high) {
+       REQUIRE(VALID_NM(netmgr));
+       switch (af) {
+       case AF_INET:
+               atomic_store_relaxed(&netmgr->port_low4, low);
+               atomic_store_relaxed(&netmgr->port_high4, high);
+               break;
+       case AF_INET6:
+               atomic_store_relaxed(&netmgr->port_low6, low);
+               atomic_store_relaxed(&netmgr->port_high6, high);
+               break;
+       default:
+               UNREACHABLE();
+       }
+}
+
 #if ISC_NETMGR_TRACE
 /*
  * Dump all active sockets in netmgr. We output to stderr
index d75beb97b6a2bdffa2aaa48914b21d4a5751d5ee..6f546edc9462c62ffc96524c649aa7396d5a1fb8 100644 (file)
  * information regarding copyright ownership.
  */
 
+#include <netinet/in.h>
+
 #include <isc/errno.h>
+#include <isc/result.h>
 #include <isc/uv.h>
 
 #include "netmgr-int.h"
@@ -370,17 +373,69 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
        return ISC_R_SUCCESS;
 }
 
+/*
+ * See
+ * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
+ * for rationale.
+ */
+#define PORT_RANGE 1000
+
 isc_result_t
-isc__nm_tcp_bind_no_port(uv_tcp_t *handle ISC_ATTR_UNUSED) {
+isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
+                             sa_family_t sa_family ISC_ATTR_UNUSED,
+                             in_port_t port_low ISC_ATTR_UNUSED,
+                             in_port_t port_high ISC_ATTR_UNUSED) {
 #ifdef IP_BIND_ADDRESS_NO_PORT
-       uv_os_sock_t fd = -1;
-
-       int r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
-       if (r < 0) {
+       if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
                return ISC_R_FAILURE;
        }
+#endif
 
-       if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
+#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
+       /*
+        * The option takes an uint32_t value with the high 16 bits
+        * set to the upper range bound, and the low 16 bits set to
+        * the lower range bound.  Range bounds are inclusive.  The
+        * 16-bit values should be in host byte order.
+        */
+       uint32_t port_range;
+       int major, minor;
+       isc_os_kernel(NULL, &major, &minor, NULL);
+
+       /*
+        * Linux 6.8 implemented the following patch:
+        *
+        * If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
+        * port selection no longer favors even ports.
+        *
+        * This means that connect() can find a suitable source port
+        * faster, and applications can use a different split between
+        * connect() and bind() users.
+        */
+       if (major < 6 || (major == 6 && minor < 8)) {
+               /*
+                * On Linux < 6.8, use IP_LOCAL_PORT_RANGE to
+                * partition ephemeral port range randomly to help
+                * with the port selection.
+                */
+               if (port_high - port_low <= PORT_RANGE) {
+                       return ISC_R_RANGE;
+               }
+
+               /*
+                * port_low <= N < port_high - PORT_RANGE
+                */
+               port_high -= PORT_RANGE;
+               port_low += isc_random_uniform(port_high - port_low);
+               port_high = port_low + PORT_RANGE;
+       }
+       INSIST(port_low > 0);
+       INSIST(port_low < port_high);
+
+       port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
+       if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
+                      sizeof(port_range)) == -1)
+       {
                return ISC_R_FAILURE;
        }
 #endif
index b36a1eb218027eb90d6a2bf8d3da9dea5ab43b4b..7d70d546239086ec3da7d295c1df52782cf36a12 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <libgen.h>
+#include <string.h>
 #include <unistd.h>
 
 #include <isc/async.h>
@@ -109,8 +110,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
        }
        isc__nm_incstats(sock, STATID_OPEN);
 
-       isc__nm_tcp_bind_no_port(&sock->uv_handle.tcp);
-
        if (req->local.length != 0) {
                r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0);
                if (r != 0) {
@@ -227,6 +226,7 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer,
        sa_family_t sa_family;
        isc__networker_t *worker = NULL;
        uv_os_sock_t fd = -1;
+       in_port_t port_low, port_high;
 
        REQUIRE(VALID_NM(mgr));
        REQUIRE(local != NULL);
@@ -263,6 +263,18 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer,
 
        (void)isc__nm_socket_min_mtu(sock->fd, sa_family);
        (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);
+       port_low = (sa_family == AF_INET) ? mgr->port_low4 : mgr->port_low6;
+       port_high = (sa_family == AF_INET) ? mgr->port_high4 : mgr->port_high6;
+       result = isc__nm_socket_max_port_range(sock->fd, sa_family, port_low,
+                                              port_high);
+       if (result != ISC_R_SUCCESS) {
+               isc__nmsocket_log(sock, ISC_LOG_DEBUG(99),
+                                 "setting up IP_BIND_ADDRESS_NO_PORT or "
+                                 "IP_LOCAL_PORT_RANGE failed: %s\n",
+                                 result == ISC_R_RANGE
+                                         ? isc_result_totext(result)
+                                         : strerror(errno));
+       }
 
        sock->active = true;
 
index aa48c317912cd219db7fbcfd178dc46d741fb4f5..0c5f2c8037baf1d7588657478f47d23636fbcf9c 100644 (file)
  * information regarding copyright ownership.
  */
 
+#include <ctype.h>
 #include <inttypes.h>
 #include <sys/stat.h>
+#include <sys/utsname.h>
 
 #include <isc/os.h>
+#include <isc/string.h>
 #include <isc/types.h>
 #include <isc/util.h>
 
@@ -23,6 +26,8 @@
 static unsigned int isc__os_ncpus = 0;
 static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE;
 static mode_t isc__os_umask = 0;
+static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1;
+static char kernel_name[64];
 
 #ifdef HAVE_SYSCONF
 
@@ -159,6 +164,19 @@ umask_initialize(void) {
        (void)umask(isc__os_umask);
 }
 
+static void
+kernel_initialize(void) {
+       struct utsname buffer;
+
+       if (uname(&buffer) == -1) {
+               return;
+       }
+
+       (void)sscanf(buffer.release, "%d.%d.%d", &kernel_major, &kernel_minor,
+                    &kernel_patch);
+       (void)strlcpy(kernel_name, buffer.sysname, sizeof(kernel_name));
+}
+
 unsigned int
 isc_os_ncpus(void) {
        return isc__os_ncpus;
@@ -174,10 +192,19 @@ isc_os_umask(void) {
        return isc__os_umask;
 }
 
+void
+isc_os_kernel(char **name, int *major, int *minor, int *patch) {
+       SET_IF_NOT_NULL(name, kernel_name)
+       SET_IF_NOT_NULL(major, kernel_major);
+       SET_IF_NOT_NULL(minor, kernel_minor);
+       SET_IF_NOT_NULL(patch, kernel_patch);
+}
+
 void
 isc__os_initialize(void) {
        umask_initialize();
        ncpus_initialize();
+       kernel_initialize();
 #if defined(HAVE_SYSCONF) && defined(_SC_LEVEL1_DCACHE_LINESIZE)
        long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
        if (s > 0 && (unsigned long)s > isc__os_cacheline) {