]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Add configuration option to set send/recv buffers on the nm sockets
authorOndřej Surý <ondrej@sury.org>
Wed, 2 Dec 2020 19:51:38 +0000 (20:51 +0100)
committerOndřej Surý <ondrej@sury.org>
Mon, 17 May 2021 06:47:09 +0000 (08:47 +0200)
This commit adds a new configuration option to set the receive and send
buffer sizes on the TCP and UDP netmgr sockets.  The default is `0`
which doesn't set any value and just uses the value set by the operating
system.

There's no magic value here - set it too small and the performance will
drop; set it too large and the buffers can fill up with queries that have
already timed out on the client side, so nobody is interested in the
answer, and this would just make the server clog up even more by making
it produce useless work.

`netstat -su` can be used on POSIX systems to monitor the receive
and send buffer errors.

17 files changed:
bin/named/config.c
bin/named/named.conf.rst
bin/named/server.c
doc/arm/reference.rst
doc/man/named.conf.5in
doc/misc/options
doc/misc/options.active
doc/misc/options.grammar.rst
lib/isc/include/isc/netmgr.h
lib/isc/netmgr/netmgr-int.h
lib/isc/netmgr/netmgr.c
lib/isc/netmgr/tcp.c
lib/isc/netmgr/tcpdns.c
lib/isc/netmgr/tlsdns.c
lib/isc/netmgr/udp.c
lib/isc/win32/libisc.def.in
lib/isccfg/namedconf.c

index 3e23ed0a7b9420afc1ee5f8a13ab69e53f9e83dc..6f56fbadbc83cc163300041c15fff4ee18f49f87 100644 (file)
@@ -123,6 +123,8 @@ options {\n\
        tcp-initial-timeout 300;\n\
        tcp-keepalive-timeout 300;\n\
        tcp-listen-queue 10;\n\
+       tcp-receive-buffer 0;\n\
+       tcp-send-buffer 0;\n\
 #      tkey-dhkey <none>\n\
 #      tkey-domain <none>\n\
 #      tkey-gssapi-credential <none>\n\
@@ -133,6 +135,8 @@ options {\n\
 #      treat-cr-as-space <obsolete>;\n\
        trust-anchor-telemetry yes;\n\
 #      use-id-pool <obsolete>;\n\
+       udp-receive-buffer 0;\n\
+       udp-send-buffer 0;\n\
 \n\
        /* view */\n\
        allow-new-zones no;\n\
index bb4003ac1ca36f7c4558b05405624099a4e9a0d0..39407490f60ff88ab2d9afc080b70b2fed42994e 100644 (file)
@@ -432,6 +432,8 @@ OPTIONS
        tcp-initial-timeout integer;
        tcp-keepalive-timeout integer;
        tcp-listen-queue integer;
+       tcp-receive-buffer integer;
+       tcp-send-buffer integer;
        tkey-dhkey quoted_string integer;
        tkey-domain quoted_string;
        tkey-gssapi-credential quoted_string;
@@ -448,6 +450,8 @@ OPTIONS
        transfers-per-ns integer;
        trust-anchor-telemetry boolean; // experimental
        try-tcp-refresh boolean;
+       udp-receive-buffer integer;
+       udp-send-buffer integer;
        update-check-ksk boolean;
        use-alt-transfer-source boolean;
        use-v4-udp-ports { portrange; ... };
index f8d65e3b327298475cce4e400b48e9863355c47e..1bd3bb505a8e5cf3873e7f6c2dbd8485ccccf147 100644 (file)
@@ -8503,6 +8503,10 @@ load_configuration(const char *filename, named_server_t *server,
        uint32_t reserved;
        uint32_t udpsize;
        uint32_t transfer_message_size;
+       uint32_t recv_tcp_buffer_size;
+       uint32_t send_tcp_buffer_size;
+       uint32_t recv_udp_buffer_size;
+       uint32_t send_udp_buffer_size;
        named_cache_t *nsc;
        named_cachelist_t cachelist, tmpcachelist;
        ns_altsecret_t *altsecret;
@@ -8774,6 +8778,9 @@ load_configuration(const char *filename, named_server_t *server,
                                             named_g_aclconfctx),
               "configuring statistics server(s)");
 
+       /*
+        * Configure the network manager
+        */
        obj = NULL;
        result = named_config_get(maps, "tcp-initial-timeout", &obj);
        INSIST(result == ISC_R_SUCCESS);
@@ -8843,6 +8850,44 @@ load_configuration(const char *filename, named_server_t *server,
        isc_nm_settimeouts(named_g_netmgr, initial, idle, keepalive,
                           advertised);
 
+#define CAP_IF_NOT_ZERO(v, min, max)        \
+       if (v > 0 && v < min) {             \
+               recv_tcp_buffer_size = min; \
+       } else if (v > max) {               \
+               recv_tcp_buffer_size = max; \
+       }
+
+       /* Set the kernel send and receive buffer sizes */
+       obj = NULL;
+       result = named_config_get(maps, "tcp-receive-buffer", &obj);
+       INSIST(result == ISC_R_SUCCESS);
+       recv_tcp_buffer_size = cfg_obj_asuint32(obj);
+       CAP_IF_NOT_ZERO(recv_tcp_buffer_size, 4096, INT32_MAX);
+
+       obj = NULL;
+       result = named_config_get(maps, "tcp-send-buffer", &obj);
+       INSIST(result == ISC_R_SUCCESS);
+       send_tcp_buffer_size = cfg_obj_asuint32(obj);
+       CAP_IF_NOT_ZERO(send_tcp_buffer_size, 4096, INT32_MAX);
+
+       obj = NULL;
+       result = named_config_get(maps, "udp-receive-buffer", &obj);
+       INSIST(result == ISC_R_SUCCESS);
+       recv_udp_buffer_size = cfg_obj_asuint32(obj);
+       CAP_IF_NOT_ZERO(recv_udp_buffer_size, 4096, INT32_MAX);
+
+       obj = NULL;
+       result = named_config_get(maps, "udp-send-buffer", &obj);
+       INSIST(result == ISC_R_SUCCESS);
+       send_udp_buffer_size = cfg_obj_asuint32(obj);
+       CAP_IF_NOT_ZERO(send_udp_buffer_size, 4096, INT32_MAX);
+
+       isc_nm_setnetbuffers(named_g_netmgr, recv_tcp_buffer_size,
+                            send_tcp_buffer_size, recv_udp_buffer_size,
+                            send_udp_buffer_size);
+
+#undef CAP_IF_NOT_ZERO
+
        /*
         * Configure sets of UDP query source ports.
         */
index 68748252e2790448f355acf4daf2d9aadac0f5c6..073ec17cd737e6e3d83e0645f4c59e0113c51fa8 100644 (file)
@@ -3585,6 +3585,24 @@ Tuning
    milliseconds to prefer IPv6 name servers. The default is ``50``
    milliseconds.
 
+``tcp-receive-buffer``; ``udp-receive-buffer``
+   These options control the operating system's network receive buffer sizes for
+   TCP and UDP respectively.  Buffering on the operating system level can
+   prevent packet drops during short spikes, but if the value is set too large
+   it could clog up a running server with outstanding queries that have already
+   timed out. The default is ``0`` which means to use the operating system
+   default value.  The operating system caps the maximum value that the user can
+   set here.
+
+``tcp-send-buffer``; ``udp-send-buffer``
+   These options control the operating system's network send buffer sizes for
+   TCP and UDP respectively.  Buffering on the operating system level can
+   prevent packet drops during short spikes, but if the value is set too large
+   it could clog up a running server with outstanding queries that have already
+   timed out. The default is ``0`` which means to use the operating system
+   default value.  The operating system caps the maximum value that the user can
+   set here.
+
 .. _builtin:
 
 Built-in Server Information Zones
index 35872a5ad123b71bd218b6d2b47b3dfcb289e3c0..5a311c30db48cbf4a52ed8374a253115ddb180a6 100644 (file)
@@ -499,6 +499,8 @@ options {
       tcp\-initial\-timeout integer;
       tcp\-keepalive\-timeout integer;
       tcp\-listen\-queue integer;
+      tcp\-receive\-buffer integer;
+      tcp\-send\-buffer integer;
       tkey\-dhkey quoted_string integer;
       tkey\-domain quoted_string;
       tkey\-gssapi\-credential quoted_string;
@@ -515,6 +517,8 @@ options {
       transfers\-per\-ns integer;
       trust\-anchor\-telemetry boolean; // experimental
       try\-tcp\-refresh boolean;
+      udp\-receive\-buffer integer;
+      udp\-send\-buffer integer;
       update\-check\-ksk boolean;
       use\-alt\-transfer\-source boolean;
       use\-v4\-udp\-ports { portrange; ... };
index 6a035d686e771d681366fa310d00b9a2e6595e77..e42b004c9e85a61ad76da4f7dd5cbf8c50b96357 100644 (file)
@@ -358,6 +358,8 @@ options {
         tcp-initial-timeout <integer>;
         tcp-keepalive-timeout <integer>;
         tcp-listen-queue <integer>;
+        tcp-receive-buffer <integer>;
+        tcp-send-buffer <integer>;
         tkey-dhkey <quoted_string> <integer>;
         tkey-domain <quoted_string>;
         tkey-gssapi-credential <quoted_string>;
@@ -374,6 +376,8 @@ options {
         transfers-per-ns <integer>;
         trust-anchor-telemetry <boolean>; // experimental
         try-tcp-refresh <boolean>;
+        udp-receive-buffer <integer>;
+        udp-send-buffer <integer>;
         update-check-ksk <boolean>;
         use-alt-transfer-source <boolean>;
         use-v4-udp-ports { <portrange>; ... };
index c8c56ea42735d21e308fd8ececf669dde97aa428..d5adf85a98b05eba21404ff564e7dd5d1a8656c9 100644 (file)
@@ -355,6 +355,8 @@ options {
         tcp-initial-timeout <integer>;
         tcp-keepalive-timeout <integer>;
         tcp-listen-queue <integer>;
+        tcp-receive-buffer <integer>;
+        tcp-send-buffer <integer>;
         tkey-dhkey <quoted_string> <integer>;
         tkey-domain <quoted_string>;
         tkey-gssapi-credential <quoted_string>;
@@ -371,6 +373,8 @@ options {
         transfers-per-ns <integer>;
         trust-anchor-telemetry <boolean>; // experimental
         try-tcp-refresh <boolean>;
+        udp-receive-buffer <integer>;
+        udp-send-buffer <integer>;
         update-check-ksk <boolean>;
         use-alt-transfer-source <boolean>;
         use-v4-udp-ports { <portrange>; ... };
index 5a4c4290f648cc6574916b5366a767ff54508cbb..9ee853c6a912973e5aa447a5e0a83df3041f7eb8 100644 (file)
        tcp-initial-timeout <integer>;
        tcp-keepalive-timeout <integer>;
        tcp-listen-queue <integer>;
+       tcp-receive-buffer <integer>;
+       tcp-send-buffer <integer>;
        tkey-dhkey <quoted_string> <integer>;
        tkey-domain <quoted_string>;
        tkey-gssapi-credential <quoted_string>;
        transfers-per-ns <integer>;
        trust-anchor-telemetry <boolean>; // experimental
        try-tcp-refresh <boolean>;
+       udp-receive-buffer <integer>;
+       udp-send-buffer <integer>;
        update-check-ksk <boolean>;
        use-alt-transfer-source <boolean>;
        use-v4-udp-ports { <portrange>; ... };
index 7ce897786b1e18a6e41f6eec11c24654fc52927e..89ef001a71660b34a146e69a6defdbb8e4bb4d6a 100644 (file)
@@ -424,6 +424,17 @@ isc_nm_settimeouts(isc_nm_t *mgr, uint32_t init, uint32_t idle,
  * \li 'mgr' is a valid netmgr.
  */
 
+void
+isc_nm_setnetbuffers(isc_nm_t *mgr, int32_t recv_tcp, int32_t send_tcp,
+                    int32_t recv_udp, int32_t send_udp);
+/*%<
+ * Stores the SO_RCVBUF and SO_SNDBUF sizes to apply to TCP and UDP sockets
+ * respectively; a size of 0 leaves the operating system default untouched.
+ *
+ * Requires:
+ * \li 'mgr' is a valid netmgr.
+ */
+
 void
 isc_nm_gettimeouts(isc_nm_t *mgr, uint32_t *initial, uint32_t *idle,
                   uint32_t *keepalive, uint32_t *advertised);
index f45116ae417d5266b00d5b2fd4e019979524d23d..f131400152add9069725e16b41c4afb1a6e9ebb7 100644 (file)
@@ -709,6 +709,14 @@ struct isc_nm {
        isc_barrier_t pausing;
        isc_barrier_t resuming;
 
+       /*
+        * Socket SO_RCVBUF and SO_SNDBUF values
+        */
+       atomic_int_fast32_t recv_udp_buffer_size;
+       atomic_int_fast32_t send_udp_buffer_size;
+       atomic_int_fast32_t recv_tcp_buffer_size;
+       atomic_int_fast32_t send_tcp_buffer_size;
+
 #ifdef NETMGR_TRACE
        ISC_LIST(isc_nmsocket_t) active_sockets;
 #endif
@@ -1769,6 +1777,12 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd);
  * Disables Nagle's algorithm on a TCP socket (sets TCP_NODELAY).
  */
 
+void
+isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle);
+/*%<
+ * Sets the pre-configured network buffer sizes on the handle.
+ */
+
 /*
  * typedef all the netievent types
  */
index d61c8c87f7fc5774f1d3b0fde6913984d6eed08d..ac2f38992a641113ac995343e8bb581fccffaa0f 100644 (file)
@@ -615,6 +615,17 @@ isc_nm_settimeouts(isc_nm_t *mgr, uint32_t init, uint32_t idle,
        atomic_store(&mgr->advertised, advertised);
 }
 
+void
+isc_nm_setnetbuffers(isc_nm_t *mgr, int32_t recv_tcp, int32_t send_tcp,
+                    int32_t recv_udp, int32_t send_udp) {
+       REQUIRE(VALID_NM(mgr));
+       /* Record the four sizes in the netmgr; no socket option is set here. */
+       atomic_store(&mgr->recv_tcp_buffer_size, recv_tcp);
+       atomic_store(&mgr->send_tcp_buffer_size, send_tcp);
+       atomic_store(&mgr->recv_udp_buffer_size, recv_udp);
+       atomic_store(&mgr->send_udp_buffer_size, send_udp);
+}
+
 void
 isc_nm_gettimeouts(isc_nm_t *mgr, uint32_t *initial, uint32_t *idle,
                   uint32_t *keepalive, uint32_t *advertised) {
@@ -3141,6 +3152,40 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd) {
 #endif
 }
 
+void
+isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle) {
+       int32_t recv_buffer_size = 0;
+       int32_t send_buffer_size = 0;
+       /* Pick the stored receive/send sizes that match the handle type. */
+       switch (handle->type) {
+       case UV_TCP:
+               recv_buffer_size =
+                       atomic_load_relaxed(&nm->recv_tcp_buffer_size);
+               send_buffer_size =
+                       atomic_load_relaxed(&nm->send_tcp_buffer_size);
+               break;
+       case UV_UDP:
+               recv_buffer_size =
+                       atomic_load_relaxed(&nm->recv_udp_buffer_size);
+               send_buffer_size =
+                       atomic_load_relaxed(&nm->send_udp_buffer_size);
+               break;
+       default:
+               INSIST(0); /* only UV_TCP and UV_UDP handles are handled */
+               ISC_UNREACHABLE();
+       }
+       /* A size that is not positive is skipped, leaving the handle as-is. */
+       if (recv_buffer_size > 0) {
+               int r = uv_recv_buffer_size(handle, &recv_buffer_size);
+               INSIST(r == 0);
+       }
+       /* Same for the send side; a non-zero libuv result trips the INSIST. */
+       if (send_buffer_size > 0) {
+               int r = uv_send_buffer_size(handle, &send_buffer_size);
+               INSIST(r == 0);
+       }
+}
+
 #ifdef NETMGR_TRACE
 /*
  * Dump all active sockets in netmgr. We output to stderr
index c7030eccccfc67894911a9e2f875acb0efffc8e8..7ea14d29f3a554e992885674211c7c9c99c53d1f 100644 (file)
@@ -158,6 +158,8 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
                }
        }
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        uv_handle_set_data(&req->uv_req.handle, req);
        r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp,
                           &req->peer.type.sa, tcp_connect_cb);
@@ -571,6 +573,8 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
        }
 #endif
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        /*
         * The callback will run in the same thread uv_listen() was called
         * from, so a race with tcp_connection_cb() isn't possible.
index 0d9bd85f99cb755403b886b4ea582755c2f3c1ff..fe0cfccf04acc5bdd29cfc7ecf3470f0db966208 100644 (file)
@@ -132,6 +132,8 @@ tcpdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
                }
        }
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        uv_handle_set_data(&req->uv_req.handle, req);
        r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp,
                           &req->peer.type.sa, tcpdns_connect_cb);
@@ -540,6 +542,8 @@ isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) {
        }
 #endif
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        /*
         * The callback will run in the same thread uv_listen() was called
         * from, so a race with tcpdns_connection_cb() isn't possible.
index 212d81539c1f8a5f14a347e88d38d4ff070f411f..afe6a3104361236b5c4e6ac93d97d00b58d55546 100644 (file)
@@ -149,6 +149,8 @@ tlsdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
                }
        }
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        uv_handle_set_data(&req->uv_req.handle, req);
        r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp,
                           &req->peer.type.sa, tlsdns_connect_cb);
@@ -610,6 +612,8 @@ isc__nm_async_tlsdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) {
        }
 #endif
 
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        /*
         * The callback will run in the same thread uv_listen() was
         * called from, so a race with tlsdns_connection_cb() isn't
index b8a40d99f565f06af97e8b87760286537aebff9c..2c944b7d82724fd40eb9ea43dc1c17e678e169bc 100644 (file)
@@ -274,14 +274,8 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
        }
 #endif
 
-#ifdef ISC_RECV_BUFFER_SIZE
-       uv_recv_buffer_size(&sock->uv_handle.handle,
-                           &(int){ ISC_RECV_BUFFER_SIZE });
-#endif
-#ifdef ISC_SEND_BUFFER_SIZE
-       uv_send_buffer_size(&sock->uv_handle.handle,
-                           &(int){ ISC_SEND_BUFFER_SIZE });
-#endif
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
+
        r = uv_udp_recv_start(&sock->uv_handle.udp, isc__nm_alloc_cb,
                              udp_recv_cb);
        if (r != 0) {
@@ -647,14 +641,7 @@ udp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
                goto done;
        }
 
-#ifdef ISC_RECV_BUFFER_SIZE
-       uv_recv_buffer_size(&sock->uv_handle.handle,
-                           &(int){ ISC_RECV_BUFFER_SIZE });
-#endif
-#ifdef ISC_SEND_BUFFER_SIZE
-       uv_send_buffer_size(&sock->uv_handle.handle,
-                           &(int){ ISC_SEND_BUFFER_SIZE });
-#endif
+       isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle);
 
        /*
         * On FreeBSD the UDP connect() call sometimes results in a
index db582009ad02cb1f3eacf3e755f4a4fb21929907..d384592646f3e3e8b18cf0eff4227744c4d47c7c 100644 (file)
@@ -471,6 +471,7 @@ isc_nm_stoplistening
 isc_nm_tcpconnect
 isc_nm_tcpdnsconnect
 isc_nm_gettimeouts
+isc_nm_setnetbuffers
 isc_nm_settimeouts
 isc_nm_tcpdns_keepalive
 isc_nm_tcpdns_sequential
index 59c2e57e9f087cfdb86961b83d5e80decb3dee9f..cf2b67deb7f38e224e04b2db55ccc31dc8ca2dd9 100644 (file)
@@ -1268,6 +1268,8 @@ static cfg_clausedef_t options_clauses[] = {
        { "tcp-initial-timeout", &cfg_type_uint32, 0 },
        { "tcp-keepalive-timeout", &cfg_type_uint32, 0 },
        { "tcp-listen-queue", &cfg_type_uint32, 0 },
+       { "tcp-receive-buffer", &cfg_type_uint32, 0 },
+       { "tcp-send-buffer", &cfg_type_uint32, 0 },
        { "tkey-dhkey", &cfg_type_tkey_dhkey, 0 },
        { "tkey-domain", &cfg_type_qstring, 0 },
        { "tkey-gssapi-credential", &cfg_type_qstring, 0 },
@@ -1277,6 +1279,8 @@ static cfg_clausedef_t options_clauses[] = {
        { "transfers-out", &cfg_type_uint32, 0 },
        { "transfers-per-ns", &cfg_type_uint32, 0 },
        { "treat-cr-as-space", NULL, CFG_CLAUSEFLAG_ANCIENT },
+       { "udp-receive-buffer", &cfg_type_uint32, 0 },
+       { "udp-send-buffer", &cfg_type_uint32, 0 },
        { "use-id-pool", NULL, CFG_CLAUSEFLAG_ANCIENT },
        { "use-ixfr", NULL, CFG_CLAUSEFLAG_ANCIENT },
        { "use-v4-udp-ports", &cfg_type_bracketed_portlist, 0 },