From: Ondřej Surý Date: Thu, 24 Jul 2025 09:43:14 +0000 (+0200) Subject: Implement IP_LOCAL_PORT_RANGE socket option for Linux X-Git-Tag: v9.21.19~16^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=04c81b55d2debfc9a401dca2c4ac4e4cffd9cce1;p=thirdparty%2Fbind9.git Implement IP_LOCAL_PORT_RANGE socket option for Linux For Linux >= 6.8: Since 2023, Linux has introduced a change to the IP_LOCAL_PORT_RANGE socket option that eliminates the need for the random window shifting (implemented as a fallback in the next commit). By setting the IP_LOCAL_PORT_RANGE option, we tell the kernel to use a better approach to source port selection. For Linux < 6.8: This implements port selection by randomly shifting a range, leveraging the IP_LOCAL_PORT_RANGE socket option. The network manager is initialized with the ephemeral port range (on startup and on reconfig) and then, for every outgoing TCP connection, we define a custom port range (1000 ports) and randomly shift the custom range within the system range. This helps the kernel reduce the search space to the custom window between the randomly chosen lower bound and that lower bound plus 1000 ports. 
Reference: https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel --- diff --git a/bin/named/server.c b/bin/named/server.c index 006b90c33c1..a881560a289 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -7704,7 +7704,7 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys, dns_kasplist_t tmpkasplist, kasplist; dns_keystorelist_t tmpkeystorelist, keystorelist; dns_viewlist_t viewlist; - in_port_t listen_port, udpport_low, udpport_high; + in_port_t listen_port, port_low, port_high; int i, backlog; isc_interval_t interval; isc_logconfig_t *logc = NULL; @@ -8048,39 +8048,26 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys, isc_portset_create(isc_g_mctx, &v4portset); isc_portset_create(isc_g_mctx, &v6portset); - result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, - ISC_LOG_ERROR, - "get the default UDP/IPv4 port range: %s", - isc_result_totext(result)); - goto cleanup_portsets; - } - - isc_portset_addrange(v4portset, udpport_low, udpport_high); + isc_net_getudpportrange(AF_INET, &port_low, &port_high); + isc_netmgr_portrange(AF_INET, port_low, port_high); + isc_portset_addrange(v4portset, port_low, port_high); if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) { isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, ISC_LOG_INFO, "using default UDP/IPv4 port range: " "[%d, %d]", - udpport_low, udpport_high); + port_low, port_high); } - result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, - ISC_LOG_ERROR, - "get the default UDP/IPv6 port range: %s", - isc_result_totext(result)); - goto cleanup_portsets; - } - isc_portset_addrange(v6portset, udpport_low, udpport_high); + isc_net_getudpportrange(AF_INET6, &port_low, &port_high); + 
isc_netmgr_portrange(AF_INET6, port_low, port_high); + isc_portset_addrange(v6portset, port_low, port_high); if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) { isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, ISC_LOG_INFO, "using default UDP/IPv6 port range: " "[%d, %d]", - udpport_low, udpport_high); + port_low, port_high); } dns_dispatchmgr_setavailports(named_g_dispatchmgr, v4portset, diff --git a/lib/isc/include/isc/net.h b/lib/isc/include/isc/net.h index 4c83db2a1b3..f4062db18e6 100644 --- a/lib/isc/include/isc/net.h +++ b/lib/isc/include/isc/net.h @@ -216,7 +216,7 @@ isc_net_enableipv4(void); void isc_net_enableipv6(void); -isc_result_t +void isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high); /*%< * Returns system's default range of ephemeral UDP ports, if defined. diff --git a/lib/isc/include/isc/netmgr.h b/lib/isc/include/isc/netmgr.h index c62231b6dd5..99aed3dd916 100644 --- a/lib/isc/include/isc/netmgr.h +++ b/lib/isc/include/isc/netmgr.h @@ -946,3 +946,9 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock); /*%< * Return the local address of 'sock'. */ + +void +isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high); +/*%< + * Set the ephemeral port range for 'af' family. + */ diff --git a/lib/isc/include/isc/os.h b/lib/isc/include/isc/os.h index 85b5d80746a..ca17208db03 100644 --- a/lib/isc/include/isc/os.h +++ b/lib/isc/include/isc/os.h @@ -45,3 +45,10 @@ isc_os_umask(void); /*%< * Return umask of the current process as initialized at the program start */ + +void +isc_os_kernel(char **name, int *major, int *minor, int *patch); +/*%< + * Fill the running kernel version into major, minor and patch. + * If any of these are not available then -1 is returned. 
+ */ diff --git a/lib/isc/managers.c b/lib/isc/managers.c index a01a8bac7dc..6107620c138 100644 --- a/lib/isc/managers.c +++ b/lib/isc/managers.c @@ -19,9 +19,17 @@ void isc_managers_create(uint32_t workers) { + in_port_t port_low, port_high; + isc_loopmgr_create(isc_g_mctx, workers); isc_netmgr_create(isc_g_mctx); isc_rwlock_setworkers(workers); + + isc_net_getudpportrange(AF_INET, &port_low, &port_high); + isc_netmgr_portrange(AF_INET, port_low, port_high); + + isc_net_getudpportrange(AF_INET6, &port_low, &port_high); + isc_netmgr_portrange(AF_INET6, port_low, port_high); } void diff --git a/lib/isc/net.c b/lib/isc/net.c index d30072b7f09..5ac2866fe54 100644 --- a/lib/isc/net.c +++ b/lib/isc/net.c @@ -175,7 +175,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { #endif /* HAVE_SYSCTLBYNAME */ #endif /* USE_SYSCTL_PORTRANGE */ -isc_result_t +void isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { int result = ISC_R_FAILURE; #if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux) @@ -214,8 +214,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { *low = ISC_NET_PORTRANGELOW; *high = ISC_NET_PORTRANGEHIGH; } - - return ISC_R_SUCCESS; /* we currently never fail in this function */ } void diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index 9ea93e089f6..cce8b690295 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -358,6 +358,12 @@ typedef struct isc__netmgr { atomic_int_fast32_t send_udp_buffer_size; atomic_int_fast32_t recv_tcp_buffer_size; atomic_int_fast32_t send_tcp_buffer_size; + + _Atomic(in_port_t) port_low4; + _Atomic(in_port_t) port_high4; + _Atomic(in_port_t) port_low6; + _Atomic(in_port_t) port_high6; + } isc__netmgr_t; extern isc__netmgr_t *isc__netmgr; @@ -1387,9 +1393,11 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family); */ isc_result_t -isc__nm_tcp_bind_no_port(uv_tcp_t *handle); +isc__nm_socket_max_port_range(uv_os_sock_t fd 
ISC_ATTR_UNUSED, + sa_family_t sa_family ISC_ATTR_UNUSED); /*%< - * Set IP_BIND_ADDRESS_NO_PORT on the socket (Linux only). + * Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket + * (Linux only). */ void diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index b3fcf33fb22..8dc4fcd4f27 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -155,6 +155,7 @@ netmgr_teardown(void *arg ISC_ATTR_UNUSED) { void isc_netmgr_create(isc_mem_t *mctx) { isc__netmgr_t *netmgr = NULL; + in_port_t port_low, port_high; #ifdef MAXIMAL_UV_VERSION if (uv_version() > MAXIMAL_UV_VERSION) { @@ -185,6 +186,11 @@ isc_netmgr_create(isc_mem_t *mctx) { atomic_init(&netmgr->send_tcp_buffer_size, 0); atomic_init(&netmgr->recv_udp_buffer_size, 0); atomic_init(&netmgr->send_udp_buffer_size, 0); + atomic_init(&netmgr->port_low4, 0); + atomic_init(&netmgr->port_high4, 65535); + atomic_init(&netmgr->port_low6, 0); + atomic_init(&netmgr->port_high6, 65535); + #if HAVE_SO_REUSEPORT_LB netmgr->load_balance_sockets = true; #else @@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx) { } isc__netmgr = netmgr; + + /* + * Set the initial port range for IP_LOCAL_PORT_RANGE. 
+ */ + isc_net_getudpportrange(AF_INET, &port_low, &port_high); + isc_netmgr_portrange(AF_INET, port_low, port_high); + + isc_net_getudpportrange(AF_INET6, &port_low, &port_high); + isc_netmgr_portrange(AF_INET6, port_low, port_high); } /* @@ -2898,6 +2913,23 @@ isc__networker_get(uint32_t tid) { return &isc__netmgr->workers[tid]; } +void +isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high) { + REQUIRE(VALID_NM(isc__netmgr)); + switch (af) { + case AF_INET: + atomic_store_relaxed(&isc__netmgr->port_low4, low); + atomic_store_relaxed(&isc__netmgr->port_high4, high); + break; + case AF_INET6: + atomic_store_relaxed(&isc__netmgr->port_low6, low); + atomic_store_relaxed(&isc__netmgr->port_high6, high); + break; + default: + INSIST(0); + } +} + #if ISC_NETMGR_TRACE /* * Dump all active sockets in netmgr. We output to stderr diff --git a/lib/isc/netmgr/socket.c b/lib/isc/netmgr/socket.c index d75beb97b6a..5f4b86090f4 100644 --- a/lib/isc/netmgr/socket.c +++ b/lib/isc/netmgr/socket.c @@ -11,7 +11,10 @@ * information regarding copyright ownership. */ +#include + #include +#include #include #include "netmgr-int.h" @@ -370,17 +373,81 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { return ISC_R_SUCCESS; } +/* + * See + * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel + * for rationalle. 
+ */ +#define PORT_RANGE 1000 + isc_result_t -isc__nm_tcp_bind_no_port(uv_tcp_t *handle ISC_ATTR_UNUSED) { +isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED, + sa_family_t af ISC_ATTR_UNUSED) { #ifdef IP_BIND_ADDRESS_NO_PORT - uv_os_sock_t fd = -1; - - int r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd); - if (r < 0) { + if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) { return ISC_R_FAILURE; } +#endif - if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) { +#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__) + /* + * The option takes an uint32_t value with the high 16 bits + * set to the upper range bound, and the low 16 bits set to + * the lower range bound. Range bounds are inclusive. The + * 16-bit values should be in host byte order. + */ + uint32_t port_range; + int major, minor; + isc_os_kernel(NULL, &major, &minor, NULL); + + in_port_t port_low, port_high; + switch (af) { + case AF_INET: + port_low = isc__netmgr->port_low4; + port_high = isc__netmgr->port_high4; + break; + case AF_INET6: + port_low = isc__netmgr->port_low6; + port_high = isc__netmgr->port_high6; + break; + default: + INSIST(0); + } + + /* + * Linux 6.8 implemented a following patch: + * + * If IP_LOCAL_PORT_RANGE is set on a socket before accept(), + * port selection no longer favors even ports. + * + * This means that connect() can find a suitable source port + * faster, and applications can use a different split between + * connect() and bind() users. + */ + if (major < 6 || (major == 6 && minor < 8)) { + /* + * On Linux << 6.8, use IP_LOCAL_PORT_RANGE to + * partition ephemeral port range randomly to help + * with the port selection. 
+ */ + if (port_high - port_low <= PORT_RANGE) { + return ISC_R_RANGE; + } + + /* + * port_low <= N < port_high - PORT_RANGE + */ + port_high -= PORT_RANGE; + port_low += isc_random_uniform(port_high - port_low); + port_high = port_low + PORT_RANGE; + } + INSIST(port_low > 0); + INSIST(port_low < port_high); + + port_range = (uint32_t)port_low | ((uint32_t)port_high << 16); + if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range, + sizeof(port_range)) == -1) + { return ISC_R_FAILURE; } #endif diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 7dd7184cfea..7a3a357fe7f 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -12,6 +12,7 @@ */ #include +#include #include #include @@ -141,8 +142,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { } isc__nm_incstats(sock, STATID_OPEN); - isc__nm_tcp_bind_no_port(&sock->uv_handle.tcp); - if (req->local.length != 0) { r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0); if (r != 0) { @@ -291,6 +290,15 @@ isc_nm_tcpconnect(isc_sockaddr_t *local, isc_sockaddr_t *peer, (void)isc__nm_socket_min_mtu(sock->fd, sa_family); (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + result = isc__nm_socket_max_port_range(sock->fd, sa_family); + if (result != ISC_R_SUCCESS) { + isc__nmsocket_log(sock, ISC_LOG_DEBUG(99), + "setting up IP_BIND_ADDRESS_NO_PORT or " + "IP_LOCAL_PORT_RANGE failed: %s\n", + result == ISC_R_RANGE + ? isc_result_totext(result) + : strerror(errno)); + } sock->active = true; diff --git a/lib/isc/os.c b/lib/isc/os.c index 8c3ced67c4b..741558d753e 100644 --- a/lib/isc/os.c +++ b/lib/isc/os.c @@ -11,10 +11,13 @@ * information regarding copyright ownership. 
*/ +#include #include #include +#include #include +#include #include #include #include @@ -25,6 +28,8 @@ static unsigned int isc__os_ncpus = 0; static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE; static mode_t isc__os_umask = 0; +static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1; +static char kernel_name[64]; /* * The affinity support for non-Linux is in the review in the upstream @@ -177,6 +182,19 @@ umask_initialize(void) { (void)umask(isc__os_umask); } +static void +kernel_initialize(void) { + struct utsname buffer; + + if (uname(&buffer) == -1) { + return; + } + + (void)sscanf(buffer.release, "%d.%d.%d", &kernel_major, &kernel_minor, + &kernel_patch); + (void)strlcpy(kernel_name, buffer.sysname, sizeof(kernel_name)); +} + unsigned int isc_os_ncpus(void) { return isc__os_ncpus; @@ -192,10 +210,19 @@ isc_os_umask(void) { return isc__os_umask; } +void +isc_os_kernel(char **name, int *major, int *minor, int *patch) { + SET_IF_NOT_NULL(name, kernel_name) + SET_IF_NOT_NULL(major, kernel_major); + SET_IF_NOT_NULL(minor, kernel_minor); + SET_IF_NOT_NULL(patch, kernel_patch); +} + void isc__os_initialize(void) { umask_initialize(); ncpus_initialize(); + kernel_initialize(); #if defined(_SC_LEVEL1_DCACHE_LINESIZE) long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); if (s > 0 && (unsigned long)s > isc__os_cacheline) {