From: Vladimír Čunát Date: Wed, 21 Oct 2020 16:07:43 +0000 (+0200) Subject: XDP: add lua interfaces X-Git-Tag: v5.2.0~1^2~9 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=891bec649d00b3bcab17eda3e4b5d15ac307aa2a;p=thirdparty%2Fknot-resolver.git XDP: add lua interfaces --- diff --git a/daemon/bindings/net.c b/daemon/bindings/net.c index 511f55875..c99522a43 100644 --- a/daemon/bindings/net.c +++ b/daemon/bindings/net.c @@ -26,6 +26,8 @@ static int net_list_add(const char *key, void *val, void *ext) lua_pushliteral(L, "doh2"); } else if (ep->flags.tls) { lua_pushliteral(L, "tls"); + } else if (ep->flags.xdp) { + lua_pushliteral(L, "xdp"); } else { lua_pushliteral(L, "dns"); } @@ -40,6 +42,9 @@ static int net_list_add(const char *key, void *val, void *ext) case AF_INET6: lua_pushliteral(L, "inet6"); break; + case AF_XDP: + lua_pushliteral(L, "inet4+inet6"); // both UDP ports at once + break; case AF_UNIX: lua_pushliteral(L, "unix"); break; @@ -50,17 +55,21 @@ static int net_list_add(const char *key, void *val, void *ext) lua_setfield(L, -2, "family"); lua_pushstring(L, key); - if (ep->family != AF_UNIX) { + if (ep->family == AF_INET || ep->family == AF_INET6) { lua_setfield(L, -2, "ip"); - } else { + lua_pushboolean(L, ep->flags.freebind); + lua_setfield(L, -2, "freebind"); + } else if (ep->family == AF_UNIX) { lua_setfield(L, -2, "path"); + } else if (ep->family == AF_XDP) { + lua_setfield(L, -2, "interface"); + lua_pushinteger(L, ep->nic_queue); + lua_setfield(L, -2, "nic_queue"); } if (ep->family != AF_UNIX) { lua_pushinteger(L, ep->port); lua_setfield(L, -2, "port"); - lua_pushboolean(L, ep->flags.freebind); - lua_setfield(L, -2, "freebind"); } if (ep->family == AF_UNIX) { @@ -97,8 +106,10 @@ static int net_list(lua_State *L) /** Listen on an address list represented by the top of lua stack. 
* \note flags.kind ownership is not transferred, and flags.sock_type doesn't make sense * \return success */ -static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags) +static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags, int16_t nic_queue) { + assert(flags.xdp || nic_queue == -1); + /* Case: table with 'addr' field; only follow that field directly. */ lua_getfield(L, -1, "addr"); if (!lua_isnil(L, -1)) { @@ -111,31 +122,55 @@ static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags) const char *str = lua_tostring(L, -1); if (str != NULL) { struct network *net = &the_worker->engine->net; + const bool is_unix = str[0] == '/'; int ret = 0; - if (!flags.kind && !flags.tls) { /* normal UDP */ + if (!flags.kind && !flags.tls) { /* normal UDP or XDP */ flags.sock_type = SOCK_DGRAM; - ret = network_listen(net, str, port, flags); + ret = network_listen(net, str, port, nic_queue, flags); } - if (!flags.kind && ret == 0) { /* common for TCP, DoT and DoH (v2) */ + if (!flags.kind && !flags.xdp && ret == 0) { /* common for TCP, DoT and DoH (v2) */ flags.sock_type = SOCK_STREAM; - ret = network_listen(net, str, port, flags); + ret = network_listen(net, str, port, nic_queue, flags); } if (flags.kind) { flags.kind = strdup(flags.kind); flags.sock_type = SOCK_STREAM; /* TODO: allow to override this? */ - ret = network_listen(net, str, port, flags); + ret = network_listen(net, str, (is_unix ? 
0 : port), nic_queue, flags); } - if (ret != 0) { - if (str[0] == '/') { - kr_log_error("[system] bind to '%s' (UNIX): %s\n", - str, kr_strerror(ret)); + if (ret == 0) return true; /* success */ + + if (is_unix) { + kr_log_error("[system] bind to '%s' (UNIX): %s\n", + str, kr_strerror(ret)); + } else if (flags.xdp) { + const char *err_str = knot_strerror(ret); + if (ret == KNOT_ELIMIT) { + if ((strcmp(str, "::") == 0 || strcmp(str, "0.0.0.0") == 0)) { + err_str = "wildcard addresses not supported with XDP"; + } else { + err_str = "address matched multiple network interfaces"; + } + } else if (ret == kr_error(ENODEV)) { + err_str = "invalid address or interface name"; + } + /* Notable OK strerror: KNOT_EPERM Operation not permitted */ + + if (nic_queue == -1) { + kr_log_error("[system] failed to initialize XDP for '%s@%d'" + " (nic_queue = <auto>): %s\n", + str, port, err_str); } else { - const char *stype = flags.sock_type == SOCK_DGRAM ? "UDP" : "TCP"; - kr_log_error("[system] bind to '%s@%d' (%s): %s\n", - str, port, stype, kr_strerror(ret)); + kr_log_error("[system] failed to initialize XDP for '%s@%d'" + " (nic_queue = %d): %s\n", + str, port, nic_queue, err_str); } + + } else { + const char *stype = flags.sock_type == SOCK_DGRAM ? "UDP" : "TCP"; + kr_log_error("[system] bind to '%s@%d' (%s): %s\n", + str, port, stype, kr_strerror(ret)); } - return ret == 0; + return false; /* failure */ } /* Last case: table where all entries are added recursively. 
*/ @@ -143,7 +178,7 @@ static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags) lua_error_p(L, "bad type for address"); lua_pushnil(L); while (lua_next(L, -2)) { - if (!net_listen_addrs(L, port, flags)) + if (!net_listen_addrs(L, port, flags, nic_queue)) return false; lua_pop(L, 1); } @@ -189,6 +224,7 @@ static int net_listen(lua_State *L) flags.http = flags.tls = true; } + int16_t nic_queue = -1; if (n > 2 && !lua_isnil(L, 3)) { if (!lua_istable(L, 3)) lua_error_p(L, "wrong type of third parameter (table expected)"); @@ -201,6 +237,7 @@ static int net_listen(lua_State *L) flags.tls = flags.http = false; } else if (k && strcasecmp(k, "xdp") == 0) { flags.tls = flags.http = false; + flags.xdp = true; } else if (k && strcasecmp(k, "tls") == 0) { flags.tls = true; flags.http = false; @@ -213,6 +250,17 @@ static int net_listen(lua_State *L) "kind=\"doh\" is an obsolete DoH implementation, use kind=\"doh2\" instead\n"); } } + + lua_getfield(L, 3, "nic_queue"); + if (lua_isnumber(L, -1)) { + if (flags.xdp) { + nic_queue = lua_tointeger(L, -1); + } else { + lua_error_p(L, "nic_queue only supported with kind = 'xdp'"); + } + } else if (!lua_isnil(L, -1)) { + lua_error_p(L, "wrong value of nic_queue (integer expected)"); + } } /* Memory management of `kind` string is difficult due to longjmp etc. @@ -226,7 +274,7 @@ static int net_listen(lua_State *L) /* Now focus on the first argument. 
*/ lua_settop(L, 1); - if (!net_listen_addrs(L, port, flags)) + if (!net_listen_addrs(L, port, flags, nic_queue)) lua_error_p(L, "net.listen() failed to bind"); lua_pushboolean(L, true); return 1; diff --git a/daemon/bindings/net_server.rst b/daemon/bindings/net_server.rst index ce847a32f..a785d5167 100644 --- a/daemon/bindings/net_server.rst +++ b/daemon/bindings/net_server.rst @@ -109,6 +109,16 @@ Following configuration functions are useful mainly for scripting or :ref:`runti [protocol] => tcp } } + [4] => { + [kind] => xdp + [transport] => { + [family] => inet4+inet6 + [interface] => eth2 + [nic_queue] => 0 + [port] => 53 + [protocol] => udp + } + } .. function:: net.interfaces() diff --git a/daemon/lua/kres-gen.lua b/daemon/lua/kres-gen.lua index 084f7bd0f..b5182a295 100644 --- a/daemon/lua/kres-gen.lua +++ b/daemon/lua/kres-gen.lua @@ -400,8 +400,9 @@ typedef struct { int sock_type; _Bool tls; _Bool http; - const char *kind; + _Bool xdp; _Bool freebind; + const char *kind; } endpoint_flags_t; typedef struct { char **at; diff --git a/daemon/lua/postconfig.lua b/daemon/lua/postconfig.lua index 49213cc0c..818a3fc6e 100644 --- a/daemon/lua/postconfig.lua +++ b/daemon/lua/postconfig.lua @@ -10,6 +10,7 @@ local function count_sockets() if socket.kind == 'control' then control_socks = control_socks + 1 elseif (socket.kind == 'dns' or + socket.kind == 'xdp' or socket.kind == 'tls' or socket.kind == 'doh' or socket.kind == 'doh2') then diff --git a/daemon/network.c b/daemon/network.c index bf63b1751..15b5e05ad 100644 --- a/daemon/network.c +++ b/daemon/network.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -123,7 +124,8 @@ static void endpoint_close_lua_cb(struct network *net, struct endpoint *ep) static void endpoint_close(struct network *net, struct endpoint *ep, bool force) { - bool control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0; + const bool is_control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0; + const 
bool is_xdp = ep->family == AF_XDP; if (ep->family == AF_UNIX) { /* The FS name would be left behind. */ /* Extract local address for this socket. */ @@ -138,7 +140,7 @@ static void endpoint_close(struct network *net, struct endpoint *ep, bool force) } } - if (ep->flags.kind && !control) { + if (ep->flags.kind && !is_control && !is_xdp) { assert(!ep->handle); /* Special lua-handled endpoint. */ if (ep->engaged) { @@ -151,7 +153,7 @@ static void endpoint_close(struct network *net, struct endpoint *ep, bool force) return; } - free_const(ep->flags.kind); /* needed if (control) */ + free_const(ep->flags.kind); /* needed if (is_control) */ assert(ep->handle); if (force) { /* Force close if event loop isn't running. */ if (ep->fd >= 0) { @@ -240,12 +242,18 @@ static int insert_endpoint(struct network *net, const char *addr, struct endpoin return kr_ok(); } -/** Open endpoint protocols. ep->flags were pre-set. */ -static int open_endpoint(struct network *net, struct endpoint *ep, - const struct sockaddr *sa, const char *log_addr) +/** Open endpoint protocols. ep->flags were pre-set. + * \p addr_str is only used for logging or for XDP "address". */ +static int open_endpoint(struct network *net, const char *addr_str, + struct endpoint *ep, const struct sockaddr *sa) { - bool control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0; - if ((sa != NULL) == (ep->fd != -1)) { + const bool is_control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0; + const bool is_xdp = ep->family == AF_XDP; + bool ok = is_xdp + ? 
sa == NULL && ep->fd == -1 && ep->nic_queue >= 0 + && ep->flags.sock_type == SOCK_DGRAM && !ep->flags.tls + : (sa != NULL) != (ep->fd != -1); + if (!ok) { assert(!EINVAL); return kr_error(EINVAL); } @@ -265,33 +273,46 @@ static int open_endpoint(struct network *net, struct endpoint *ep, ep->fd = io_bind(sa, ep->flags.sock_type, &ep->flags); if (ep->fd < 0) return ep->fd; } - if (ep->flags.kind && !control) { + if (ep->flags.kind && !is_control && !is_xdp) { /* This EP isn't to be managed internally after binding. */ - return endpoint_open_lua_cb(net, ep, log_addr); + return endpoint_open_lua_cb(net, ep, addr_str); } else { ep->engaged = true; - /* .engaged seems not really meaningful with .kind == NULL, but... */ + /* .engaged seems not really meaningful in this case, but... */ } - if (control) { + int ret; + if (is_control) { uv_pipe_t *ep_handle = malloc(sizeof(uv_pipe_t)); ep->handle = (uv_handle_t *)ep_handle; - if (!ep->handle) { - return kr_error(ENOMEM); - } - return io_listen_pipe(net->loop, ep_handle, ep->fd); + ret = !ep->handle ? ENOMEM + : io_listen_pipe(net->loop, ep_handle, ep->fd); + goto finish_ret; } if (ep->family == AF_UNIX) { /* Some parts of connection handling would need more work, * so let's support AF_UNIX only with .kind != NULL for now. */ kr_log_error("[system] AF_UNIX only supported with set { kind = '...' }\n"); - return kr_error(EAFNOSUPPORT); + ret = EAFNOSUPPORT; + goto finish_ret; /* uv_pipe_t *ep_handle = malloc(sizeof(uv_pipe_t)); */ } + if (is_xdp) { + #if ENABLE_XDP + uv_poll_t *ep_handle = malloc(sizeof(uv_poll_t)); + ep->handle = (uv_handle_t *)ep_handle; + ret = !ep->handle ? 
ENOMEM + : io_listen_xdp(net->loop, ep, addr_str); + #else + ret = ESOCKTNOSUPPORT; + #endif + goto finish_ret; + } /* else */ + if (ep->flags.sock_type == SOCK_DGRAM) { if (ep->flags.tls) { assert(!EINVAL); @@ -299,28 +320,33 @@ static int open_endpoint(struct network *net, struct endpoint *ep, } uv_udp_t *ep_handle = malloc(sizeof(uv_udp_t)); ep->handle = (uv_handle_t *)ep_handle; - if (!ep->handle) { - return kr_error(ENOMEM); - } - return io_listen_udp(net->loop, ep_handle, ep->fd); + ret = !ep->handle ? ENOMEM + : io_listen_udp(net->loop, ep_handle, ep->fd); + goto finish_ret; } /* else */ if (ep->flags.sock_type == SOCK_STREAM) { uv_tcp_t *ep_handle = malloc(sizeof(uv_tcp_t)); ep->handle = (uv_handle_t *)ep_handle; - if (!ep->handle) { - return kr_error(ENOMEM); - } - return io_listen_tcp(net->loop, ep_handle, ep->fd, + ret = !ep->handle ? ENOMEM + : io_listen_tcp(net->loop, ep_handle, ep->fd, net->tcp_backlog, ep->flags.tls, ep->flags.http); + goto finish_ret; } /* else */ assert(!EINVAL); return kr_error(EINVAL); +finish_ret: + if (!ret) return ret; + free(ep->handle); + ep->handle = NULL; + return kr_error(ret); } /** @internal Fetch a pointer to endpoint of given parameters (or NULL). - * Beware that there might be multiple matches, though that's not common. */ + * Beware that there might be multiple matches, though that's not common. + * The matching isn't really precise in the sense that it might not find + * an endpoint that would *collide* with the passed one. */ static struct endpoint * endpoint_get(struct network *net, const char *addr, uint16_t port, endpoint_flags_t flags) { @@ -337,12 +363,13 @@ static struct endpoint * endpoint_get(struct network *net, const char *addr, return NULL; } -/** \note pass either sa != NULL xor ep.fd != -1; +/** \note pass (either sa != NULL xor ep.fd != -1) or XDP case (neither sa nor ep.fd) + * \note in XDP case addr_str is interface name * \note ownership of ep.flags.* is taken on success. 
*/ static int create_endpoint(struct network *net, const char *addr_str, struct endpoint *ep, const struct sockaddr *sa) { - int ret = open_endpoint(net, ep, sa, addr_str); + int ret = open_endpoint(net, addr_str, ep, sa); if (ret == 0) { ret = insert_endpoint(net, addr_str, ep); } @@ -354,6 +381,10 @@ static int create_endpoint(struct network *net, const char *addr_str, int network_listen_fd(struct network *net, int fd, endpoint_flags_t flags) { + if (flags.xdp) { + assert(!EINVAL); + return kr_error(EINVAL); + } /* Extract fd's socket type. */ socklen_t len = sizeof(flags.sock_type); int ret = getsockopt(fd, SOL_SOCKET, SO_TYPE, &flags.sock_type, &len); @@ -410,29 +441,78 @@ int network_listen_fd(struct network *net, int fd, endpoint_flags_t flags) return create_endpoint(net, addr_str, &ep, NULL); } +/** Try selecting XDP queue automatically. */ +static int16_t nic_queue_auto(void) +{ + const char *inst_str = getenv("SYSTEMD_INSTANCE"); + if (!inst_str) + return 0; // should work OK for simple (single-kresd) deployments + char *endp; + errno = 0; // strtol() is special in this respect + long inst = strtol(inst_str, &endp, 10); + if (!errno && *endp == '\0' && inst > 0 && inst < UINT16_MAX) + return inst - 1; // 1-based vs. 0-based indexing conventions + return -1; +} + int network_listen(struct network *net, const char *addr, uint16_t port, - endpoint_flags_t flags) + int16_t nic_queue, endpoint_flags_t flags) { - if (net == NULL || addr == 0 || port == 0) { + if (net == NULL || addr == 0 || nic_queue < -1) { assert(!EINVAL); return kr_error(EINVAL); } - if (endpoint_get(net, addr, port, flags)) { - return kr_error(EADDRINUSE); /* Already listening */ + + if (flags.xdp && nic_queue < 0) { + nic_queue = nic_queue_auto(); + if (nic_queue < 0) { + return kr_error(EINVAL); + } } - /* Parse address. */ + // Try parsing the address. 
const struct sockaddr *sa = kr_straddr_socket(addr, port, NULL); - if (!sa) { + if (!sa && !flags.xdp) { // unusable address spec return kr_error(EINVAL); } - struct endpoint ep = { - .flags = flags, - .fd = -1, - .port = port, - .family = sa->sa_family, - }; + char ifname_buf[64] UNUSED; + if (sa && flags.xdp) { // auto-detection: address -> interface + #if ENABLE_XDP + int ret = knot_eth_name_from_addr((const struct sockaddr_storage *)sa, + ifname_buf, sizeof(ifname_buf)); + // even on success we don't want to pass `sa` on + free_const(sa); + sa = NULL; + if (ret) { + return kr_error(ret); + } + addr = ifname_buf; + #else + return kr_error(ESOCKTNOSUPPORT); + #endif + } + // XDP: if addr failed to parse as address, we assume it's an interface name. + + if (endpoint_get(net, addr, port, flags)) { + return kr_error(EADDRINUSE); // Already listening + } + + struct endpoint ep = { 0 }; + ep.flags = flags; + ep.fd = -1; + ep.port = port; + ep.family = flags.xdp ? AF_XDP : sa->sa_family; + ep.nic_queue = nic_queue; + int ret = create_endpoint(net, addr, &ep, sa); + + // Error reporting: more precision. + if (ret == KNOT_EINVAL && !sa && flags.xdp && ENABLE_XDP) { + if (!if_nametoindex(addr) && errno == ENODEV) { + ret = kr_error(ENODEV); + } + } + free_const(sa); return ret; } diff --git a/daemon/network.h b/daemon/network.h index 6a0e4a3e0..8cc8fcf41 100644 --- a/daemon/network.h +++ b/daemon/network.h @@ -13,16 +13,22 @@ #include #include +#include +#ifndef AF_XDP +#define AF_XDP 44 +#endif struct engine; +struct session; /** Ways to listen on a socket (which may exist already). 
*/ typedef struct { int sock_type; /**< SOCK_DGRAM or SOCK_STREAM */ bool tls; /**< only used together with .kind == NULL and SOCK_STREAM */ bool http; /**< DoH2, implies .tls (in current implementation) */ - const char *kind; /**< tag for other types: "control" or module-handled kinds */ + bool xdp; /**< XDP is special (not a normal socket, in particular) */ bool freebind; /**< used for binding to non-local address */ + const char *kind; /**< tag for other types: "control" or module-handled kinds */ } endpoint_flags_t; static inline bool endpoint_flags_eq(endpoint_flags_t f1, endpoint_flags_t f2) @@ -42,9 +48,11 @@ static inline bool endpoint_flags_eq(endpoint_flags_t f1, endpoint_flags_t f2) * ATM AF_UNIX is only supported with flags.kind != NULL */ struct endpoint { - uv_handle_t *handle; /**< uv_udp_t or uv_tcp_t; NULL in case flags.kind != NULL */ + /** uv_{udp,tcp,poll}_t (poll for XDP); + * NULL in case of endpoints that are to be handled by modules. */ + uv_handle_t *handle; int fd; /**< POSIX file-descriptor; always used. */ - int family; /**< AF_INET or AF_INET6 or AF_UNIX */ + int family; /**< AF_INET or AF_INET6 or AF_UNIX or AF_XDP */ uint16_t port; /**< TCP/UDP port. Meaningless with AF_UNIX. */ int16_t nic_queue; /**< -1 or queue number of the interface for AF_XDP use. */ bool engaged; /**< to some module or internally */ @@ -90,9 +98,12 @@ void network_deinit(struct network *net); * nothing is done and kr_error(EADDRINUSE) is returned. * \note there's no short-hand to listen both on UDP and TCP. * \note ownership of flags.* is taken on success. TODO: non-success? + * \param nic_queue == -1 for auto-selection or non-XDP. + * \note In XDP mode, addr may also be an interface name, so kr_error(ENODEV) + * is returned if some nonsense is passed */ int network_listen(struct network *net, const char *addr, uint16_t port, - endpoint_flags_t flags); + int16_t nic_queue, endpoint_flags_t flags); /** Start listening on an open file-descriptor. 
* \note flags.sock_type isn't meaningful here.