From: Willy Tarreau Date: Tue, 29 Apr 2025 09:43:46 +0000 (+0200) Subject: MINOR: tcp: add support for setting TCP_NOTSENT_LOWAT on both sides X-Git-Tag: v3.2-dev13~48 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2cdb3cb91e229a23d976fea3268b85a42427b20c;p=thirdparty%2Fhaproxy.git MINOR: tcp: add support for setting TCP_NOTSENT_LOWAT on both sides TCP_NOTSENT_LOWAT is very convenient as it indicates when to report EAGAIN on the sending side. It takes a margin on top of the estimated window, meaning that it is no longer necessary to store too much data in socket buffers. Instead there's just enough to fill the send window and a little bit of margin to cover the scheduling time to restart sending. Experiments on a 100ms network have shown a 10-fold reduction in the memory used by socket buffers by just setting this value to tune.bufsize, without noticing any performance degradation. Theoretically the responsiveness on multiplexed protocols such as H2 should also be improved. --- diff --git a/doc/configuration.txt b/doc/configuration.txt index 325b9186f..79a3aa2bc 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -4278,6 +4278,22 @@ tune.memory.hot-size disable the per-thread CPU caches, using a very small value could work, but it is better to use "-dMno-cache" on the command-line. +tune.notsent-lowat.client +tune.notsent-lowat.server + Adjusts the kernel's per-socket buffering so as to report that the sending + side of a socket is full once the amount of buffered data equals this value + plus the measured window size. The principle is to leave only the strict minimum + needed amount of bytes in socket buffers, plus a small margin corresponding + to what would be sent by the time haproxy tries to send again. Setting this + to a low value (typically around tune.bufsize) makes it possible to significantly reduce + the memory consumption in system buffers, and reduce the application level + latency incurred by flushing buffered data. 
This generally represents a more + effective and more accurate setting than tune.sndbuf.client and + tune.sndbuf.server for systems supporting it. This applies per connection + (connection from a client or connection to a server depending on the setting) + and is only used by TCP connections. The default is zero, which means + unlimited. This is only available on Linux. + tune.pattern.cache-size Sets the size of the pattern lookup cache to entries. This is an LRU cache which reminds previous lookups and their results. It is used by ACLs @@ -4633,7 +4649,9 @@ tune.sndbuf.server of received data. Lower values will significantly increase CPU usage though. Another use case is to prevent write timeouts with extremely slow clients due to the kernel waiting for a large part of the buffer to be read before - notifying HAProxy again. + notifying HAProxy again. See also tune.notsent-lowat.client and + tune.notsent-lowat.server for more effective settings to more finely control + memory usage and responsiveness on Linux without hurting performance. tune.ssl.cachesize Sets the size of the global SSL session cache, in a number of blocks. 
A block diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h index a31112fa8..365482372 100644 --- a/include/haproxy/global-t.h +++ b/include/haproxy/global-t.h @@ -184,6 +184,8 @@ struct global { uint client_rcvbuf; /* set client rcvbuf to this value if not null */ uint server_sndbuf; /* set server sndbuf to this value if not null */ uint server_rcvbuf; /* set server rcvbuf to this value if not null */ + uint client_notsent_lowat; /* set client tcp_notsent_lowat to this value if not null */ + uint server_notsent_lowat; /* set server tcp_notsent_lowat to this value if not null */ uint frontend_sndbuf; /* set frontend dgram sndbuf to this value if not null */ uint frontend_rcvbuf; /* set frontend dgram rcvbuf to this value if not null */ uint backend_sndbuf; /* set backend dgram sndbuf to this value if not null */ diff --git a/src/cfgparse-global.c b/src/cfgparse-global.c index caf89df75..370da3e94 100644 --- a/src/cfgparse-global.c +++ b/src/cfgparse-global.c @@ -1282,6 +1282,46 @@ static int cfg_parse_global_tune_opts(char **args, int section_type, return 0; } + else if (strcmp(args[0], "tune.notsent-lowat.client") == 0) { +#if defined(TCP_NOTSENT_LOWAT) + if (global.tune.client_notsent_lowat != 0) { + memprintf(err, "'%s' already specified. Continuing.", args[0]); + return 1; + } + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument.", args[0]); + return -1; + } + res = parse_size_err(args[1], &global.tune.client_notsent_lowat); + if (res != NULL) + goto size_err; + + return 0; +#else + memprintf(err, "'%s' is not supported on this system.", args[0]); + return -1; +#endif + } + else if (strcmp(args[0], "tune.notsent-lowat.server") == 0) { +#if defined(TCP_NOTSENT_LOWAT) + if (global.tune.server_notsent_lowat != 0) { + memprintf(err, "'%s' already specified. 
Continuing.", args[0]); + return 1; + } + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument.", args[0]); + return -1; + } + res = parse_size_err(args[1], &global.tune.server_notsent_lowat); + if (res != NULL) + goto size_err; + + return 0; +#else + memprintf(err, "'%s' is not supported on this system.", args[0]); + return -1; +#endif + } else if (strcmp(args[0], "tune.pipesize") == 0) { if (*(args[1]) == 0) { memprintf(err, "'%s' expects an integer argument.", args[0]); @@ -1726,6 +1766,8 @@ static struct cfg_kw_list cfg_kws = {ILH, { { CFG_GLOBAL, "tune.rcvbuf.server", cfg_parse_global_tune_opts }, { CFG_GLOBAL, "tune.sndbuf.client", cfg_parse_global_tune_opts }, { CFG_GLOBAL, "tune.sndbuf.server", cfg_parse_global_tune_opts }, + { CFG_GLOBAL, "tune.notsent-lowat.client", cfg_parse_global_tune_opts }, + { CFG_GLOBAL, "tune.notsent-lowat.server", cfg_parse_global_tune_opts }, { CFG_GLOBAL, "tune.pipesize", cfg_parse_global_tune_opts }, { CFG_GLOBAL, "tune.http.cookielen", cfg_parse_global_tune_opts }, { CFG_GLOBAL, "tune.http.logurilen", cfg_parse_global_tune_opts }, diff --git a/src/proto_tcp.c b/src/proto_tcp.c index 39de465ef..51b9e9950 100644 --- a/src/proto_tcp.c +++ b/src/proto_tcp.c @@ -545,6 +545,11 @@ int tcp_connect_server(struct connection *conn, int flags) if (global.tune.server_sndbuf) setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf)); +#if defined(TCP_NOTSENT_LOWAT) + if (global.tune.server_notsent_lowat) + setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &global.tune.server_notsent_lowat, sizeof(global.tune.server_notsent_lowat)); +#endif + if (global.tune.server_rcvbuf) setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf)); diff --git a/src/session.c b/src/session.c index ac82aa8ea..b862628de 100644 --- a/src/session.c +++ b/src/session.c @@ -302,6 +302,11 @@ int session_accept_fd(struct connection *cli_conn) if 
(global.tune.client_sndbuf) setsockopt(cfd, SOL_SOCKET, SO_SNDBUF, &global.tune.client_sndbuf, sizeof(global.tune.client_sndbuf)); +#if defined(TCP_NOTSENT_LOWAT) + if (global.tune.client_notsent_lowat && (l->rx.addr.ss_family == AF_INET || l->rx.addr.ss_family == AF_INET6)) + setsockopt(cfd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &global.tune.client_notsent_lowat, sizeof(global.tune.client_notsent_lowat)); +#endif + if (global.tune.client_rcvbuf) setsockopt(cfd, SOL_SOCKET, SO_RCVBUF, &global.tune.client_rcvbuf, sizeof(global.tune.client_rcvbuf));