lua_pushliteral(L, "doh2");
} else if (ep->flags.tls) {
lua_pushliteral(L, "tls");
+ } else if (ep->flags.xdp) {
+ lua_pushliteral(L, "xdp");
} else {
lua_pushliteral(L, "dns");
}
case AF_INET6:
lua_pushliteral(L, "inet6");
break;
+ case AF_XDP:
+ lua_pushliteral(L, "inet4+inet6"); // both UDP ports at once
+ break;
case AF_UNIX:
lua_pushliteral(L, "unix");
break;
lua_setfield(L, -2, "family");
lua_pushstring(L, key);
- if (ep->family != AF_UNIX) {
+ if (ep->family == AF_INET || ep->family == AF_INET6) {
lua_setfield(L, -2, "ip");
- } else {
+ lua_pushboolean(L, ep->flags.freebind);
+ lua_setfield(L, -2, "freebind");
+ } else if (ep->family == AF_UNIX) {
lua_setfield(L, -2, "path");
+ } else if (ep->family == AF_XDP) {
+ lua_setfield(L, -2, "interface");
+ lua_pushinteger(L, ep->nic_queue);
+ lua_setfield(L, -2, "nic_queue");
}
if (ep->family != AF_UNIX) {
lua_pushinteger(L, ep->port);
lua_setfield(L, -2, "port");
- lua_pushboolean(L, ep->flags.freebind);
- lua_setfield(L, -2, "freebind");
}
if (ep->family == AF_UNIX) {
/** Listen on an address list represented by the top of lua stack.
* \note flags.kind ownership is not transferred, and flags.sock_type doesn't make sense
* \return success */
-static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags)
+static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags, int16_t nic_queue)
{
+ assert(flags.xdp || nic_queue == -1);
+
/* Case: table with 'addr' field; only follow that field directly. */
lua_getfield(L, -1, "addr");
if (!lua_isnil(L, -1)) {
const char *str = lua_tostring(L, -1);
if (str != NULL) {
struct network *net = &the_worker->engine->net;
+ const bool is_unix = str[0] == '/';
int ret = 0;
- if (!flags.kind && !flags.tls) { /* normal UDP */
+ if (!flags.kind && !flags.tls) { /* normal UDP or XDP */
flags.sock_type = SOCK_DGRAM;
- ret = network_listen(net, str, port, flags);
+ ret = network_listen(net, str, port, nic_queue, flags);
}
- if (!flags.kind && ret == 0) { /* common for TCP, DoT and DoH (v2) */
+ if (!flags.kind && !flags.xdp && ret == 0) { /* common for TCP, DoT and DoH (v2) */
flags.sock_type = SOCK_STREAM;
- ret = network_listen(net, str, port, flags);
+ ret = network_listen(net, str, port, nic_queue, flags);
}
if (flags.kind) {
flags.kind = strdup(flags.kind);
flags.sock_type = SOCK_STREAM; /* TODO: allow to override this? */
- ret = network_listen(net, str, port, flags);
+ ret = network_listen(net, str, (is_unix ? 0 : port), nic_queue, flags);
}
- if (ret != 0) {
- if (str[0] == '/') {
- kr_log_error("[system] bind to '%s' (UNIX): %s\n",
- str, kr_strerror(ret));
+ if (ret == 0) return true; /* success */
+
+ if (is_unix) {
+ kr_log_error("[system] bind to '%s' (UNIX): %s\n",
+ str, kr_strerror(ret));
+ } else if (flags.xdp) {
+ const char *err_str = knot_strerror(ret);
+ if (ret == KNOT_ELIMIT) {
+ if ((strcmp(str, "::") == 0 || strcmp(str, "0.0.0.0") == 0)) {
+ err_str = "wildcard addresses not supported with XDP";
+ } else {
+ err_str = "address matched multiple network interfaces";
+ }
+ } else if (ret == kr_error(ENODEV)) {
+ err_str = "invalid address or interface name";
+ }
+ /* Notable OK strerror: KNOT_EPERM Operation not permitted */
+
+ if (nic_queue == -1) {
+ kr_log_error("[system] failed to initialize XDP for '%s@%d'"
+ " (nic_queue = <auto>): %s\n",
+ str, port, err_str);
} else {
- const char *stype = flags.sock_type == SOCK_DGRAM ? "UDP" : "TCP";
- kr_log_error("[system] bind to '%s@%d' (%s): %s\n",
- str, port, stype, kr_strerror(ret));
+ kr_log_error("[system] failed to initialize XDP for '%s@%d'"
+ " (nic_queue = %d): %s\n",
+ str, port, nic_queue, err_str);
}
+
+ } else {
+ const char *stype = flags.sock_type == SOCK_DGRAM ? "UDP" : "TCP";
+ kr_log_error("[system] bind to '%s@%d' (%s): %s\n",
+ str, port, stype, kr_strerror(ret));
}
- return ret == 0;
+ return false; /* failure */
}
/* Last case: table where all entries are added recursively. */
lua_error_p(L, "bad type for address");
lua_pushnil(L);
while (lua_next(L, -2)) {
- if (!net_listen_addrs(L, port, flags))
+ if (!net_listen_addrs(L, port, flags, nic_queue))
return false;
lua_pop(L, 1);
}
flags.http = flags.tls = true;
}
+ int16_t nic_queue = -1;
if (n > 2 && !lua_isnil(L, 3)) {
if (!lua_istable(L, 3))
lua_error_p(L, "wrong type of third parameter (table expected)");
flags.tls = flags.http = false;
} else if (k && strcasecmp(k, "xdp") == 0) {
flags.tls = flags.http = false;
+ flags.xdp = true;
} else if (k && strcasecmp(k, "tls") == 0) {
flags.tls = true;
flags.http = false;
"kind=\"doh\" is an obsolete DoH implementation, use kind=\"doh2\" instead\n");
}
}
+
+ lua_getfield(L, 3, "nic_queue");
+ if (lua_isnumber(L, -1)) {
+ if (flags.xdp) {
+ nic_queue = lua_tointeger(L, -1);
+ } else {
+ lua_error_p(L, "nic_queue only supported with kind = 'xdp'");
+ }
+ } else if (!lua_isnil(L, -1)) {
+ lua_error_p(L, "wrong value of nic_queue (integer expected)");
+ }
}
/* Memory management of `kind` string is difficult due to longjmp etc.
/* Now focus on the first argument. */
lua_settop(L, 1);
- if (!net_listen_addrs(L, port, flags))
+ if (!net_listen_addrs(L, port, flags, nic_queue))
lua_error_p(L, "net.listen() failed to bind");
lua_pushboolean(L, true);
return 1;
[protocol] => tcp
}
}
+ [4] => {
+ [kind] => xdp
+ [transport] => {
+ [family] => inet4+inet6
+ [interface] => eth2
+ [nic_queue] => 0
+ [port] => 53
+ [protocol] => udp
+ }
+ }
.. function:: net.interfaces()
int sock_type;
_Bool tls;
_Bool http;
- const char *kind;
+ _Bool xdp;
_Bool freebind;
+ const char *kind;
} endpoint_flags_t;
typedef struct {
char **at;
if socket.kind == 'control' then
control_socks = control_socks + 1
elseif (socket.kind == 'dns' or
+ socket.kind == 'xdp' or
socket.kind == 'tls' or
socket.kind == 'doh' or
socket.kind == 'doh2') then
#include <assert.h>
#include <libgen.h>
+#include <net/if.h>
#include <sys/un.h>
#include <unistd.h>
static void endpoint_close(struct network *net, struct endpoint *ep, bool force)
{
- bool control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0;
+ const bool is_control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0;
+ const bool is_xdp = ep->family == AF_XDP;
if (ep->family == AF_UNIX) { /* The FS name would be left behind. */
/* Extract local address for this socket. */
}
}
- if (ep->flags.kind && !control) {
+ if (ep->flags.kind && !is_control && !is_xdp) {
assert(!ep->handle);
/* Special lua-handled endpoint. */
if (ep->engaged) {
return;
}
- free_const(ep->flags.kind); /* needed if (control) */
+ free_const(ep->flags.kind); /* needed if (is_control) */
assert(ep->handle);
if (force) { /* Force close if event loop isn't running. */
if (ep->fd >= 0) {
return kr_ok();
}
-/** Open endpoint protocols. ep->flags were pre-set. */
-static int open_endpoint(struct network *net, struct endpoint *ep,
- const struct sockaddr *sa, const char *log_addr)
+/** Open endpoint protocols. ep->flags were pre-set.
+ * \p addr_str is only used for logging or for XDP "address". */
+static int open_endpoint(struct network *net, const char *addr_str,
+ struct endpoint *ep, const struct sockaddr *sa)
{
- bool control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0;
- if ((sa != NULL) == (ep->fd != -1)) {
+ const bool is_control = ep->flags.kind && strcmp(ep->flags.kind, "control") == 0;
+ const bool is_xdp = ep->family == AF_XDP;
+ bool ok = is_xdp
+ ? sa == NULL && ep->fd == -1 && ep->nic_queue >= 0
+ && ep->flags.sock_type == SOCK_DGRAM && !ep->flags.tls
+ : (sa != NULL) != (ep->fd != -1);
+ if (!ok) {
assert(!EINVAL);
return kr_error(EINVAL);
}
ep->fd = io_bind(sa, ep->flags.sock_type, &ep->flags);
if (ep->fd < 0) return ep->fd;
}
- if (ep->flags.kind && !control) {
+ if (ep->flags.kind && !is_control && !is_xdp) {
/* This EP isn't to be managed internally after binding. */
- return endpoint_open_lua_cb(net, ep, log_addr);
+ return endpoint_open_lua_cb(net, ep, addr_str);
} else {
ep->engaged = true;
- /* .engaged seems not really meaningful with .kind == NULL, but... */
+ /* .engaged seems not really meaningful in this case, but... */
}
- if (control) {
+ int ret;
+ if (is_control) {
uv_pipe_t *ep_handle = malloc(sizeof(uv_pipe_t));
ep->handle = (uv_handle_t *)ep_handle;
- if (!ep->handle) {
- return kr_error(ENOMEM);
- }
- return io_listen_pipe(net->loop, ep_handle, ep->fd);
+ ret = !ep->handle ? ENOMEM
+ : io_listen_pipe(net->loop, ep_handle, ep->fd);
+ goto finish_ret;
}
if (ep->family == AF_UNIX) {
/* Some parts of connection handling would need more work,
* so let's support AF_UNIX only with .kind != NULL for now. */
kr_log_error("[system] AF_UNIX only supported with set { kind = '...' }\n");
- return kr_error(EAFNOSUPPORT);
+ ret = EAFNOSUPPORT;
+ goto finish_ret;
/*
uv_pipe_t *ep_handle = malloc(sizeof(uv_pipe_t));
*/
}
+ if (is_xdp) {
+ #if ENABLE_XDP
+ uv_poll_t *ep_handle = malloc(sizeof(uv_poll_t));
+ ep->handle = (uv_handle_t *)ep_handle;
+ ret = !ep->handle ? ENOMEM
+ : io_listen_xdp(net->loop, ep, addr_str);
+ #else
+ ret = ESOCKTNOSUPPORT;
+ #endif
+ goto finish_ret;
+ } /* else */
+
if (ep->flags.sock_type == SOCK_DGRAM) {
if (ep->flags.tls) {
assert(!EINVAL);
}
uv_udp_t *ep_handle = malloc(sizeof(uv_udp_t));
ep->handle = (uv_handle_t *)ep_handle;
- if (!ep->handle) {
- return kr_error(ENOMEM);
- }
- return io_listen_udp(net->loop, ep_handle, ep->fd);
+ ret = !ep->handle ? ENOMEM
+ : io_listen_udp(net->loop, ep_handle, ep->fd);
+ goto finish_ret;
} /* else */
if (ep->flags.sock_type == SOCK_STREAM) {
uv_tcp_t *ep_handle = malloc(sizeof(uv_tcp_t));
ep->handle = (uv_handle_t *)ep_handle;
- if (!ep->handle) {
- return kr_error(ENOMEM);
- }
- return io_listen_tcp(net->loop, ep_handle, ep->fd,
+ ret = !ep->handle ? ENOMEM
+ : io_listen_tcp(net->loop, ep_handle, ep->fd,
net->tcp_backlog, ep->flags.tls, ep->flags.http);
+ goto finish_ret;
} /* else */
assert(!EINVAL);
return kr_error(EINVAL);
+finish_ret:
+ if (!ret) return ret;
+ free(ep->handle);
+ ep->handle = NULL;
+ return kr_error(ret);
}
/** @internal Fetch a pointer to endpoint of given parameters (or NULL).
- * Beware that there might be multiple matches, though that's not common. */
+ * Beware that there might be multiple matches, though that's not common.
+ * The matching isn't really precise, in the sense that it might not find
+ * an endpoint that would *collide* with the passed one. */
static struct endpoint * endpoint_get(struct network *net, const char *addr,
uint16_t port, endpoint_flags_t flags)
{
return NULL;
}
-/** \note pass either sa != NULL xor ep.fd != -1;
+/** \note pass (either sa != NULL xor ep.fd != -1) or XDP case (neither sa nor ep.fd)
+ * \note in XDP case addr_str is interface name
* \note ownership of ep.flags.* is taken on success. */
static int create_endpoint(struct network *net, const char *addr_str,
struct endpoint *ep, const struct sockaddr *sa)
{
- int ret = open_endpoint(net, ep, sa, addr_str);
+ int ret = open_endpoint(net, addr_str, ep, sa);
if (ret == 0) {
ret = insert_endpoint(net, addr_str, ep);
}
int network_listen_fd(struct network *net, int fd, endpoint_flags_t flags)
{
+ if (flags.xdp) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
/* Extract fd's socket type. */
socklen_t len = sizeof(flags.sock_type);
int ret = getsockopt(fd, SOL_SOCKET, SO_TYPE, &flags.sock_type, &len);
return create_endpoint(net, addr_str, &ep, NULL);
}
+/** Try selecting XDP queue automatically.
+ *
+ * The choice is derived from the SYSTEMD_INSTANCE environment variable:
+ * when it is unset, queue 0 is used; when it holds a positive decimal
+ * number N, queue N - 1 is used.
+ * \return the queue index (>= 0), or -1 if the variable is set but is not
+ *         a positive decimal number whose queue index fits into int16_t. */
+static int16_t nic_queue_auto(void)
+{
+	const char *inst_str = getenv("SYSTEMD_INSTANCE");
+	if (!inst_str) {
+		return 0; // should work OK for simple (single-kresd) deployments
+	}
+	char *endp;
+	errno = 0; // strtol() is special in this respect
+	long inst = strtol(inst_str, &endp, 10);
+	// The bound is on the *result*: it has to fit into the int16_t return
+	// type, otherwise the conversion would be implementation-defined.
+	if (!errno && *endp == '\0' && inst > 0 && inst - 1 <= INT16_MAX) {
+		return (int16_t)(inst - 1); // 1-based vs. 0-based indexing conventions
+	}
+	return -1;
+}
+
int network_listen(struct network *net, const char *addr, uint16_t port,
- endpoint_flags_t flags)
+ int16_t nic_queue, endpoint_flags_t flags)
{
- if (net == NULL || addr == 0 || port == 0) {
+ if (net == NULL || addr == 0 || nic_queue < -1) {
assert(!EINVAL);
return kr_error(EINVAL);
}
- if (endpoint_get(net, addr, port, flags)) {
- return kr_error(EADDRINUSE); /* Already listening */
+
+ if (flags.xdp && nic_queue < 0) {
+ nic_queue = nic_queue_auto();
+ if (nic_queue < 0) {
+ return kr_error(EINVAL);
+ }
}
- /* Parse address. */
+ // Try parsing the address.
const struct sockaddr *sa = kr_straddr_socket(addr, port, NULL);
- if (!sa) {
+ if (!sa && !flags.xdp) { // unusable address spec
return kr_error(EINVAL);
}
- struct endpoint ep = {
- .flags = flags,
- .fd = -1,
- .port = port,
- .family = sa->sa_family,
- };
+ char ifname_buf[64] UNUSED;
+ if (sa && flags.xdp) { // auto-detection: address -> interface
+ #if ENABLE_XDP
+ int ret = knot_eth_name_from_addr((const struct sockaddr_storage *)sa,
+ ifname_buf, sizeof(ifname_buf));
+ // even on success we don't want to pass `sa` on
+ free_const(sa);
+ sa = NULL;
+ if (ret) {
+ return kr_error(ret);
+ }
+ addr = ifname_buf;
+ #else
+ return kr_error(ESOCKTNOSUPPORT);
+ #endif
+ }
+ // XDP: if addr failed to parse as address, we assume it's an interface name.
+
+ if (endpoint_get(net, addr, port, flags)) {
+ return kr_error(EADDRINUSE); // Already listening
+ }
+
+ struct endpoint ep = { 0 };
+ ep.flags = flags;
+ ep.fd = -1;
+ ep.port = port;
+ ep.family = flags.xdp ? AF_XDP : sa->sa_family;
+ ep.nic_queue = nic_queue;
+
int ret = create_endpoint(net, addr, &ep, sa);
+
+ // Error reporting: more precision.
+ if (ret == KNOT_EINVAL && !sa && flags.xdp && ENABLE_XDP) {
+ if (!if_nametoindex(addr) && errno == ENODEV) {
+ ret = kr_error(ENODEV);
+ }
+ }
+
free_const(sa);
return ret;
}
#include <uv.h>
#include <stdbool.h>
+#include <sys/socket.h>
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
struct engine;
+struct session;
/** Ways to listen on a socket (which may exist already). */
typedef struct {
int sock_type; /**< SOCK_DGRAM or SOCK_STREAM */
bool tls; /**< only used together with .kind == NULL and SOCK_STREAM */
bool http; /**< DoH2, implies .tls (in current implementation) */
- const char *kind; /**< tag for other types: "control" or module-handled kinds */
+ bool xdp; /**< XDP is special (not a normal socket, in particular); opening such an endpoint requires SOCK_DGRAM and !tls */
bool freebind; /**< used for binding to non-local address */
+ const char *kind; /**< tag for other types: "control" or module-handled kinds */
} endpoint_flags_t;
static inline bool endpoint_flags_eq(endpoint_flags_t f1, endpoint_flags_t f2)
* ATM AF_UNIX is only supported with flags.kind != NULL
*/
struct endpoint {
- uv_handle_t *handle; /**< uv_udp_t or uv_tcp_t; NULL in case flags.kind != NULL */
+ /** uv_{udp,tcp,poll}_t (poll for XDP);
+ * NULL in case of endpoints that are to be handled by modules. */
+ uv_handle_t *handle;
int fd; /**< POSIX file-descriptor; always used. */
- int family; /**< AF_INET or AF_INET6 or AF_UNIX */
+ int family; /**< AF_INET or AF_INET6 or AF_UNIX or AF_XDP */
uint16_t port; /**< TCP/UDP port. Meaningless with AF_UNIX. */
int16_t nic_queue; /**< -1 or queue number of the interface for AF_XDP use. */
bool engaged; /**< to some module or internally */
* nothing is done and kr_error(EADDRINUSE) is returned.
* \note there's no short-hand to listen both on UDP and TCP.
* \note ownership of flags.* is taken on success. TODO: non-success?
+ * \param nic_queue == -1 for auto-selection or non-XDP.
+ * \note In XDP mode, addr may also be an interface name, so kr_error(ENODEV)
+ * is returned if some nonsense is passed.
*/
int network_listen(struct network *net, const char *addr, uint16_t port,
- endpoint_flags_t flags);
+ int16_t nic_queue, endpoint_flags_t flags);
/** Start listening on an open file-descriptor.
* \note flags.sock_type isn't meaningful here.