From: Štěpán Balážik Date: Fri, 20 Mar 2020 18:43:11 +0000 (+0100) Subject: selection: server selection rewrite X-Git-Tag: v5.3.0~30^2~6 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4565cc59668053cf72585d3b57bf405c3ee2da99;p=thirdparty%2Fknot-resolver.git selection: server selection rewrite Design discussion: #447 Code discussion: !1030 --- diff --git a/bench/bench_lru.c b/bench/bench_lru.c index a885c2915..9fbf0d6fc 100644 --- a/bench/bench_lru.c +++ b/bench/bench_lru.c @@ -11,7 +11,7 @@ #include "contrib/ucw/lib.h" #include "daemon/engine.h" -#include "lib/nsrep.h" +#include "lib/selection.h" typedef kr_nsrep_lru_t lru_bench_t; diff --git a/daemon/bindings/cache.c b/daemon/bindings/cache.c index 7b08374e3..4a8afd931 100644 --- a/daemon/bindings/cache.c +++ b/daemon/bindings/cache.c @@ -268,8 +268,6 @@ static int cache_clear_everything(lua_State *L) /* Clear reputation tables */ struct kr_context *ctx = &the_worker->engine->resolver; - lru_reset(ctx->cache_rtt); - lru_reset(ctx->cache_rep); lru_reset(ctx->cache_cookie); lua_pushboolean(L, true); return 1; diff --git a/daemon/engine.c b/daemon/engine.c index 59a6cf189..d0d6d87ce 100644 --- a/daemon/engine.c +++ b/daemon/engine.c @@ -22,8 +22,7 @@ #include "kresconfig.h" #include "daemon/engine.h" #include "daemon/ffimodule.h" -#include "daemon/worker.h" -#include "lib/nsrep.h" +#include "lib/selection.h" #include "lib/cache/api.h" #include "lib/defines.h" #include "lib/cache/cdb_lmdb.h" @@ -397,9 +396,6 @@ static int init_resolver(struct engine *engine) engine->resolver.tls_padding = -1; /* Empty init; filled via ./lua/postconfig.lua */ kr_zonecut_init(&engine->resolver.root_hints, (const uint8_t *)"", engine->pool); - /* Open NS rtt + reputation cache */ - lru_create(&engine->resolver.cache_rtt, LRU_RTT_SIZE, NULL, NULL); - lru_create(&engine->resolver.cache_rep, LRU_REP_SIZE, NULL, NULL); lru_create(&engine->resolver.cache_cookie, LRU_COOKIES_SIZE, NULL, NULL); /* Load basic modules */ 
@@ -578,8 +574,6 @@ void engine_deinit(struct engine *engine) kr_cache_close(&engine->resolver.cache); /* The LRUs are currently malloc-ated and need to be freed. */ - lru_free(engine->resolver.cache_rtt); - lru_free(engine->resolver.cache_rep); lru_free(engine->resolver.cache_cookie); network_deinit(&engine->net); diff --git a/daemon/lua/kres-gen.lua b/daemon/lua/kres-gen.lua index b5182a295..36f21a2e1 100644 --- a/daemon/lua/kres-gen.lua +++ b/daemon/lua/kres-gen.lua @@ -7,6 +7,13 @@ typedef struct knot_dump_style knot_dump_style_t; extern const knot_dump_style_t KNOT_DUMP_STYLE_DEFAULT; struct kr_cdb_api {}; struct lru {}; +typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t; +typedef struct { + uint16_t pos; + uint16_t flags; + uint16_t compress_ptr[16]; +} knot_rrinfo_t; +typedef unsigned char knot_dname_t; typedef struct knot_mm { void *ctx, *alloc, *free; @@ -17,13 +24,7 @@ typedef void (*map_free_f)(void *baton, void *ptr); typedef void (*trace_log_f) (const struct kr_request *, const char *); typedef void (*trace_callback_f)(struct kr_request *); typedef uint8_t * (*alloc_wire_f)(struct kr_request *req, uint16_t *maxlen); -typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t; -typedef struct { - uint16_t pos; - uint16_t flags; - uint16_t compress_ptr[16]; -} knot_rrinfo_t; -typedef unsigned char knot_dname_t; +typedef bool (*addr_info_f)(struct sockaddr*); typedef struct { knot_dname_t *_owner; uint32_t _ttl; @@ -136,6 +137,11 @@ typedef struct { size_t len; size_t cap; } ranked_rr_array_t; +typedef struct { + union inaddr *at; + size_t len; + size_t cap; +} inaddr_array_t; struct kr_zonecut { knot_dname_t *name; knot_rrset_t *key; @@ -177,7 +183,7 @@ struct kr_request { } qsource; struct { unsigned int rtt; - const struct sockaddr *addr; + const struct kr_transport *transport; } upstream; struct kr_qflags options; int state; @@ -193,6 +199,12 @@ struct kr_request { int vars_ref; knot_mm_t pool; unsigned int 
uid; + struct { + addr_info_f is_tls_capable; + addr_info_f is_tcp_connected; + addr_info_f is_tcp_waiting; + inaddr_array_t forwarding_targets; + } selection_context; unsigned int count_no_nsaddr; unsigned int count_fail_row; alloc_wire_f alloc_wire_cb; @@ -262,19 +274,19 @@ struct kr_module { void *lib; void *data; }; +struct kr_server_selection { + _Bool initialized; + void (*choose_transport)(struct kr_query *, struct kr_transport **); + void (*update_rtt)(struct kr_query *, const struct kr_transport *, unsigned int); + void (*error)(struct kr_query *, const struct kr_transport *, enum kr_selection_error); + struct local_state *local_state; +}; kr_layer_t kr_layer_t_static; typedef int32_t (*kr_stale_cb)(int32_t ttl, const knot_dname_t *owner, uint16_t type, const struct kr_query *qry); void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner, uint16_t type, uint16_t rclass, uint32_t ttl); -struct kr_nsrep { - unsigned int score; - unsigned int reputation; - const knot_dname_t *name; - struct kr_context *ctx; - /* beware: hidden stub, to avoid hardcoding sockaddr lengths */ -}; struct kr_query { struct kr_query *parent; knot_dname_t *sname; @@ -295,7 +307,7 @@ struct kr_query { struct kr_query *cname_parent; struct kr_request *request; kr_stale_cb stale_cb; - struct kr_nsrep ns; + struct kr_server_selection server_selection; }; struct kr_context { struct kr_qflags options; @@ -305,8 +317,13 @@ struct kr_context { map_t negative_anchors; struct kr_zonecut root_hints; struct kr_cache cache; + unsigned int cache_rtt_tout_retry_interval; char _stub[]; }; +struct kr_transport { + knot_dname_t *ns_name; + /* beware: hidden stub, to avoid hardcoding sockaddr lengths */ +}; const char *knot_strerror(int); knot_dname_t *knot_dname_copy(const knot_dname_t *, knot_mm_t *); knot_dname_t *knot_dname_from_str(uint8_t *, const char *, size_t); @@ -336,7 +353,7 @@ struct kr_query *kr_rplan_push(struct kr_rplan *, struct kr_query *, const knot_ int kr_rplan_pop(struct 
kr_rplan *, struct kr_query *); struct kr_query *kr_rplan_resolved(struct kr_rplan *); struct kr_query *kr_rplan_last(struct kr_rplan *); -int kr_nsrep_set(struct kr_query *, size_t, const struct sockaddr *); +int kr_forward_add_target(struct kr_request *, const struct sockaddr *); void kr_log_req(const struct kr_request * const, uint32_t, const unsigned int, const char *, const char *, ...); void kr_log_q(const struct kr_query * const, const char *, const char *, ...); int kr_make_query(struct kr_query *, knot_pkt_t *); diff --git a/daemon/lua/kres-gen.sh b/daemon/lua/kres-gen.sh index ad58d238d..4b25f714d 100755 --- a/daemon/lua/kres-gen.sh +++ b/daemon/lua/kres-gen.sh @@ -60,6 +60,14 @@ struct kr_cdb_api {}; struct lru {}; " +${CDEFS} ${LIBKRES} types <<-EOF + knot_section_t + knot_rrinfo_t + knot_dname_t + #knot_rdata_t + #knot_rdataset_t +EOF + # The generator doesn't work well with typedefs of functions. printf " typedef struct knot_mm { @@ -71,16 +79,9 @@ typedef void (*map_free_f)(void *baton, void *ptr); typedef void (*trace_log_f) (const struct kr_request *, const char *); typedef void (*trace_callback_f)(struct kr_request *); typedef uint8_t * (*alloc_wire_f)(struct kr_request *req, uint16_t *maxlen); +typedef bool (*addr_info_f)(struct sockaddr*); " -${CDEFS} ${LIBKRES} types <<-EOF - knot_section_t - knot_rrinfo_t - knot_dname_t - #knot_rdata_t - #knot_rdataset_t -EOF - genResType() { echo "$1" | ${CDEFS} ${LIBKRES} types } @@ -108,6 +109,7 @@ ${CDEFS} ${LIBKRES} types <<-EOF struct kr_qflags ranked_rr_array_entry_t ranked_rr_array_t + inaddr_array_t struct kr_zonecut kr_qarray_t struct kr_rplan @@ -124,6 +126,7 @@ ${CDEFS} ${LIBKRES} types <<-EOF # lib/module.h struct kr_prop struct kr_module + struct kr_server_selection EOF # a static variable; the line might not be simple to generate @@ -139,14 +142,15 @@ void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner, ## Some definitions would need too many deps, so shorten them. 
-genResType "struct kr_nsrep" | sed '/union/,$ d' -printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n" - genResType "struct kr_query" -genResType "struct kr_context" | sed '/kr_nsrep_rtt_lru_t/,$ d' +genResType "struct kr_context" | sed '/module_array_t/,$ d' printf "\tchar _stub[];\n};\n" + +echo "struct kr_transport" | ${CDEFS} ${KRESD} types | sed '/union /,$ d' +printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n" + ## libknot API ${CDEFS} libknot functions <<-EOF # Utils @@ -188,8 +192,8 @@ ${CDEFS} ${LIBKRES} functions <<-EOF kr_rplan_pop kr_rplan_resolved kr_rplan_last -# Nameservers - kr_nsrep_set +# Forwarding + kr_forward_add_target # Utils kr_log_req kr_log_q @@ -277,6 +281,7 @@ printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n" echo "struct qr_task" | ${CDEFS} ${KRESD} types | sed '/pktbuf/,$ d' printf "\t/* beware: hidden stub, to avoid qr_tasklist_t */\n};\n" + ${CDEFS} ${KRESD} functions <<-EOF worker_resolve_exec worker_resolve_mk_pkt diff --git a/daemon/worker.c b/daemon/worker.c index 6028c0e0d..8c7684dff 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -83,15 +83,15 @@ struct qr_task qr_tasklist_t waiting; struct session *pending[MAX_PENDING]; uint16_t pending_count; - uint16_t addrlist_count; - uint16_t addrlist_turn; uint16_t timeouts; uint16_t iter_count; - struct sockaddr *addrlist; uint32_t refs; bool finished : 1; bool leading : 1; uint64_t creation_time; + uint64_t send_time; + uint64_t recv_time; + struct kr_transport *transport; }; @@ -120,15 +120,15 @@ static int qr_task_send(struct qr_task *task, struct session *session, const struct sockaddr *addr, knot_pkt_t *pkt); static int qr_task_finalize(struct qr_task *task, int state); static void qr_task_complete(struct qr_task *task); -static struct session* worker_find_tcp_connected(struct worker_ctx *worker, +struct session* worker_find_tcp_connected(struct worker_ctx *worker, const struct sockaddr 
*addr); static int worker_add_tcp_waiting(struct worker_ctx *worker, const struct sockaddr *addr, struct session *session); -static struct session* worker_find_tcp_waiting(struct worker_ctx *worker, +struct session* worker_find_tcp_waiting(struct worker_ctx *worker, const struct sockaddr *addr); static void on_tcp_connect_timeout(uv_timer_t *timer); -static void on_retransmit(uv_timer_t *req); +static void on_udp_timeout(uv_timer_t *timer); static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt); @@ -310,6 +310,19 @@ static void free_wire(const struct request_ctx *ctx) kr_log_verbose("[xdp] freed unsent buffer, ret = %d\n", ret); } #endif +/* Helper functions for transport selection */ +static inline bool is_tls_capable(struct sockaddr *address) { + tls_client_param_t *tls_entry = tls_client_param_get(the_worker->engine->net.tls_client_params, address); + return tls_entry; +} + +static inline bool is_tcp_connected(struct sockaddr *address) { + return worker_find_tcp_connected(the_worker, address); +} + +static inline bool is_tcp_waiting(struct sockaddr *address) { + return worker_find_tcp_waiting(the_worker, address); +} /** Create and initialize a request_ctx (on a fresh mempool). 
* @@ -383,6 +396,12 @@ static struct request_ctx *request_create(struct worker_ctx *worker, req->qsource.dst_addr = &ctx->source.dst_addr.ip; } + req->selection_context.is_tls_capable = is_tls_capable; + req->selection_context.is_tcp_connected = is_tcp_connected; + req->selection_context.is_tcp_waiting = is_tcp_waiting; + array_init(req->selection_context.forwarding_targets); + array_reserve_mm(req->selection_context.forwarding_targets, 1, kr_memreserve, &req->pool); + worker->stats.rconcurrent += 1; return ctx; @@ -559,7 +578,6 @@ static void qr_task_complete(struct qr_task *task) /* This is called when we send subrequest / answer */ int qr_task_on_send(struct qr_task *task, const uv_handle_t *handle, int status) { - if (task->finished) { assert(task->leading == false); qr_task_complete(task); @@ -572,26 +590,17 @@ int qr_task_on_send(struct qr_task *task, const uv_handle_t *handle, int status) assert(s); if (handle->type == UV_UDP && session_flags(s)->outgoing) { - /* Start the timeout timer for UDP here, since this is the closest - * to the wire we can get. */ - struct kr_request *req = &task->ctx->req; - /* Check current query NSLIST */ - struct kr_query *qry = array_tail(req->rplan.pending); + // This should ensure that we are only dealing with our question to upstream + assert(!knot_wire_get_qr(task->pktbuf->wire)); + // start the timer + struct kr_query *qry = array_tail(task->ctx->req.rplan.pending); assert(qry != NULL); - /* Retransmit at default interval, or more frequently if the mean - * RTT of the server is better. If the server is glued, use default rate. 
*/ - size_t timeout = qry->ns.score; - if (timeout > KR_NS_GLUED) { - /* We don't have information about variance in RTT, expect +10ms */ - timeout = MIN(qry->ns.score + 10, KR_CONN_RETRY); - } else { - timeout = KR_CONN_RETRY; - } - int ret = session_timer_start(s, on_retransmit, timeout, 0); + size_t timeout = task->transport->timeout; + int ret = session_timer_start(s, on_udp_timeout, timeout, 0); /* Start next step with timeout, fatal if can't start a timer. */ if (ret != 0) { - subreq_finalize(task, &qry->ns.addr->ip, task->pktbuf); + subreq_finalize(task, &task->transport->address.ip, task->pktbuf); qr_task_finalize(task, KR_STATE_FAIL); } } @@ -681,6 +690,9 @@ static int qr_task_send(struct qr_task *task, struct session *session, qr_task_ref(task); struct worker_ctx *worker = ctx->worker; + /* Note time for upstream RTT */ + task->send_time = kr_now(); + task->recv_time = 0; // task structure is being reused so we have to zero this out here /* Send using given protocol */ assert(!session_flags(session)->closing); if (session_flags(session)->has_http) { @@ -793,11 +805,9 @@ static int session_tls_hs_cb(struct session *session, int status) if (status) { struct qr_task *task = session_waitinglist_get(session); if (task) { - struct kr_qflags *options = &task->ctx->req.options; - unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD; - kr_nsrep_update_rtt(NULL, peer, score, - the_worker->engine->resolver.cache_rtt, - KR_NS_UPDATE_NORESET); + // TLS handshake failed, report it to server selection + struct kr_query *qry = array_tail(task->ctx->req.rplan.pending); + qry->server_selection.error(qry, task->transport, KR_SELECTION_TLS_HANDSHAKE_FAILED); } #ifndef NDEBUG else { @@ -973,13 +983,10 @@ static void on_connect(uv_connect_t *req, int status) struct qr_task *task = session_waitinglist_get(session); if (task && status != UV_ETIMEDOUT) { /* Penalize upstream. 
- * In case of UV_ETIMEDOUT upstream has been - * already penalized in on_tcp_connect_timeout() */ - struct kr_qflags *options = &task->ctx->req.options; - unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD; - kr_nsrep_update_rtt(NULL, peer, score, - worker->engine->resolver.cache_rtt, - KR_NS_UPDATE_NORESET); + * In case of UV_ETIMEDOUT upstream has been + * already penalized in on_tcp_connect_timeout() */ + struct kr_query *qry = array_tail(task->ctx->req.rplan.pending); + qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED); } assert(session_tasklist_is_empty(session)); session_waitinglist_retry(session, false); @@ -1061,10 +1068,7 @@ static void on_tcp_connect_timeout(uv_timer_t *timer) peer_str ? peer_str : ""); } - unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD; - kr_nsrep_update_rtt(NULL, peer, score, - worker->engine->resolver.cache_rtt, - KR_NS_UPDATE_NORESET); + qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_TIMEOUT); worker->stats.timeout += session_waitinglist_get_len(session); session_waitinglist_retry(session, true); @@ -1089,34 +1093,28 @@ static void on_udp_timeout(uv_timer_t *timer) uv_timer_stop(timer); - /* Penalize all tried nameservers with a timeout. */ struct qr_task *task = session_tasklist_get_first(session); struct worker_ctx *worker = task->ctx->worker; + if (task->leading && task->pending_count > 0) { struct kr_query *qry = array_tail(task->ctx->req.rplan.pending); - struct sockaddr_in6 *addrlist = (struct sockaddr_in6 *)task->addrlist; - for (uint16_t i = 0; i < MIN(task->pending_count, task->addrlist_count); ++i) { - struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]); - WITH_VERBOSE(qry) { - char *addr_str = kr_straddr(choice); - VERBOSE_MSG(qry, "=> server: '%s' flagged as 'bad'\n", addr_str ? addr_str : ""); - } - unsigned score = qry->flags.FORWARD || qry->flags.STUB ? 
KR_NS_FWD_DEAD : KR_NS_DEAD; - kr_nsrep_update_rtt(&qry->ns, choice, score, - worker->engine->resolver.cache_rtt, - KR_NS_UPDATE_NORESET); - } + qry->server_selection.error(qry, task->transport, KR_SELECTION_QUERY_TIMEOUT); } + task->timeouts += 1; worker->stats.timeout += 1; qr_task_step(task, NULL, NULL); } -static uv_handle_t *retransmit(struct qr_task *task) +static uv_handle_t *transmit(struct qr_task *task) { uv_handle_t *ret = NULL; - if (task && task->addrlist && task->addrlist_count > 0) { - struct sockaddr_in6 *choice = &((struct sockaddr_in6 *)task->addrlist)[task->addrlist_turn]; + + if (task) { + struct kr_transport* transport = task->transport; + + struct sockaddr_in6 *choice = (struct sockaddr_in6 *)&transport->address; + if (!choice) { return ret; } @@ -1125,7 +1123,7 @@ static uv_handle_t *retransmit(struct qr_task *task) } /* Checkout answer before sending it */ struct request_ctx *ctx = task->ctx; - if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)choice, SOCK_DGRAM, task->pktbuf) != 0) { + if (kr_resolve_checkout(&ctx->req, NULL, transport, task->pktbuf) != 0) { return ret; } ret = ioreq_spawn(ctx->worker, SOCK_DGRAM, choice->sin6_family, false, false); @@ -1144,31 +1142,12 @@ static uv_handle_t *retransmit(struct qr_task *task) } else { task->pending[task->pending_count] = session; task->pending_count += 1; - task->addrlist_turn = (task->addrlist_turn + 1) % - task->addrlist_count; /* Round robin */ session_start_read(session); /* Start reading answer */ } } return ret; } -static void on_retransmit(uv_timer_t *req) -{ - struct session *session = req->data; - assert(session_tasklist_get_len(session) == 1); - - uv_timer_stop(req); - struct qr_task *task = session_tasklist_get_first(session); - if (retransmit(task) == NULL) { - /* Not possible to spawn request, start timeout timer with remaining deadline. */ - struct kr_qflags *options = &task->ctx->req.options; - uint64_t timeout = options->FORWARD || options->STUB ? 
KR_NS_FWD_TIMEOUT / 2 : - KR_CONN_RTT_MAX - task->pending_count * KR_CONN_RETRY; - uv_timer_start(req, on_udp_timeout, timeout, 0); - } else { - uv_timer_start(req, on_retransmit, KR_CONN_RETRY, 0); - } -} static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt) { @@ -1196,6 +1175,12 @@ static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_ struct kr_query *qry = array_tail(follower->ctx->req.rplan.pending); qry->id = leader_qry->id; qry->secret = leader_qry->secret; + + // Note that this transport may not be present in `leader_qry`'s server selection + follower->transport = task->transport; + if(follower->transport) { + follower->transport->deduplicated = true; + } leader_qry->secret = 0; /* Next will be already decoded */ } qr_task_step(follower, packet_source, pkt); @@ -1369,7 +1354,7 @@ static int udp_task_step(struct qr_task *task, return kr_ok(); /* Will be notified when outgoing query finishes. */ } /* Start transmitting */ - uv_handle_t *handle = retransmit(task); + uv_handle_t *handle = transmit(task); if (handle == NULL) { subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); @@ -1517,15 +1502,7 @@ static int tcp_task_make_connection(struct qr_task *task, const struct sockaddr worker_del_tcp_waiting(worker, addr); free(conn); session_close(session); - unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD; - kr_nsrep_update_rtt(NULL, peer, score, - worker->engine->resolver.cache_rtt, - KR_NS_UPDATE_NORESET); - WITH_VERBOSE (qry) { - const char *peer_str = kr_straddr(peer); - kr_log_verbose( "[wrkr]=> connect to '%s' failed (%s), flagged as 'bad'\n", - peer_str ? 
peer_str : "", uv_strerror(ret)); - } + qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED); return kr_error(EAGAIN); } @@ -1549,7 +1526,7 @@ static int tcp_task_step(struct qr_task *task, assert(task->pending_count == 0); /* target */ - const struct sockaddr *addr = task->addrlist; + const struct sockaddr *addr = &task->transport->address.ip; if (addr->sa_family == AF_UNSPEC) { /* Target isn't defined. Finalize task with SERVFAIL. * Although task->pending_count is zero, there are can be followers, @@ -1559,8 +1536,7 @@ static int tcp_task_step(struct qr_task *task, } /* Checkout task before connecting */ struct request_ctx *ctx = task->ctx; - if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)addr, - SOCK_STREAM, task->pktbuf) != 0) { + if (kr_resolve_checkout(&ctx->req, NULL, task->transport, task->pktbuf) != 0) { subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } @@ -1609,10 +1585,6 @@ static int qr_task_step(struct qr_task *task, assert(ctx); struct kr_request *req = &ctx->req; struct worker_ctx *worker = ctx->worker; - int sock_type = -1; - task->addrlist = NULL; - task->addrlist_count = 0; - task->addrlist_turn = 0; if (worker->too_many_open) { /* */ @@ -1623,22 +1595,29 @@ static int qr_task_step(struct qr_task *task, } else { if (packet && kr_rplan_empty(rplan)) { /* new query; TODO - make this detection more obvious */ - kr_resolve_consume(req, packet_source, packet); + kr_resolve_consume(req, &task->transport, packet); } return qr_task_finalize(task, KR_STATE_FAIL); } } - int state = kr_resolve_consume(req, packet_source, packet); + // Report network RTT back to server selection + if (task->send_time && task->recv_time) { + struct kr_query *qry = array_tail(req->rplan.pending); + qry->server_selection.update_rtt(qry, task->transport, task->recv_time - task->send_time); + } + + int state = kr_resolve_consume(req, &task->transport, packet); + + task->transport = NULL; while 
(state == KR_STATE_PRODUCE) { - state = kr_resolve_produce(req, &task->addrlist, - &sock_type, task->pktbuf); + state = kr_resolve_produce(req, &task->transport, task->pktbuf); if (unlikely(++task->iter_count > KR_ITER_LIMIT || task->timeouts >= KR_TIMEOUT_LIMIT)) { #ifndef NOVERBOSELOG struct kr_rplan *rplan = &req->rplan; - struct kr_query *last = kr_rplan_last(rplan); + struct kr_query *last = kr_rplan_last(rplan); if (task->iter_count > KR_ITER_LIMIT) { VERBOSE_MSG(last, "canceling query due to exceeded iteration count limit of %d\n", KR_ITER_LIMIT); } @@ -1654,47 +1633,22 @@ static int qr_task_step(struct qr_task *task, /* We're done, no more iterations needed */ if (state & (KR_STATE_DONE|KR_STATE_FAIL)) { return qr_task_finalize(task, state); - } else if (!task->addrlist || sock_type < 0) { + } else if (!task->transport || !task->transport->protocol) { return qr_task_step(task, NULL, NULL); } - /* Count available address choices */ - struct sockaddr_in6 *choice = (struct sockaddr_in6 *)task->addrlist; - for (size_t i = 0; i < KR_NSREP_MAXADDR && choice->sin6_family != AF_UNSPEC; ++i) { - task->addrlist_count += 1; - choice += 1; - } - - /* Upgrade to TLS if the upstream address is configured as DoT capable. */ - if (task->addrlist_count > 0 && kr_inaddr_port(task->addrlist) == KR_DNS_PORT) { - /* TODO if there are multiple addresses (task->addrlist_count > 1) - * check all of them. */ - struct network *net = &worker->engine->net; - /* task->addrlist has to contain TLS port before tls_client_param_get() call */ - kr_inaddr_set_port(task->addrlist, KR_DNS_TLS_PORT); - tls_client_param_t *tls_entry = - tls_client_param_get(net->tls_client_params, task->addrlist); - if (tls_entry) { - packet_source = NULL; - sock_type = SOCK_STREAM; - /* TODO in this case in tcp_task_make_connection() will be performed - * redundant map_get() call. */ - } else { - /* The function is fairly cheap, so we just change there and back. 
*/ - kr_inaddr_set_port(task->addrlist, KR_DNS_PORT); - } - } - - int ret = 0; - if (sock_type == SOCK_DGRAM) { - /* Start fast retransmit with UDP. */ - ret = udp_task_step(task, packet_source, packet); - } else { - /* TCP. Connect to upstream or send the query if connection already exists. */ - assert (sock_type == SOCK_STREAM); - ret = tcp_task_step(task, packet_source, packet); + switch (task->transport->protocol) + { + case KR_TRANSPORT_UDP: + return udp_task_step(task, packet_source, packet); + break; + case KR_TRANSPORT_TCP: // fall through + case KR_TRANSPORT_TLS: + return tcp_task_step(task, packet_source, packet); + default: + assert(0); + break; } - return ret; } static int parse_packet(knot_pkt_t *query) @@ -1791,12 +1745,15 @@ int worker_submit(struct session *session, } assert(!session_flags(session)->closing); addr = peer; + /* Note recieve time for RTT calculation */ + task->recv_time = kr_now(); } assert(uv_is_closing(session_get_handle(session)) == false); /* Packet was successfully parsed. * Task was created (found). 
*/ session_touch(session); + /* Consume input and produce next message */ return qr_task_step(task, addr, pkt); } @@ -1851,7 +1808,7 @@ int worker_del_tcp_connected(struct worker_ctx *worker, return map_del_tcp_session(&worker->tcp_connected, addr); } -static struct session* worker_find_tcp_connected(struct worker_ctx *worker, +struct session* worker_find_tcp_connected(struct worker_ctx *worker, const struct sockaddr* addr) { return map_find_tcp_session(&worker->tcp_connected, addr); @@ -1877,7 +1834,7 @@ int worker_del_tcp_waiting(struct worker_ctx *worker, return map_del_tcp_session(&worker->tcp_waiting, addr); } -static struct session* worker_find_tcp_waiting(struct worker_ctx *worker, +struct session* worker_find_tcp_waiting(struct worker_ctx *worker, const struct sockaddr* addr) { return map_find_tcp_session(&worker->tcp_waiting, addr); @@ -1951,12 +1908,9 @@ int worker_end_tcp(struct session *session) return kr_ok(); } -knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass, +knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass, const struct kr_qflags *options) { - uint8_t qname[KNOT_DNAME_MAXLEN]; - if (!knot_dname_from_str(qname, qname_str, sizeof(qname))) - return NULL; knot_pkt_t *pkt = knot_pkt_new(NULL, KNOT_EDNS_MAX_UDP_PAYLOAD, NULL); if (!pkt) return NULL; @@ -1991,6 +1945,15 @@ knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16 return pkt; } +knot_pkt_t *worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass, + const struct kr_qflags *options) +{ + uint8_t qname[KNOT_DNAME_MAXLEN]; + if (!knot_dname_from_str(qname, qname_str, sizeof(qname))) + return NULL; + return worker_resolve_mk_pkt_dname(qname, qtype, qclass, options); +} + struct qr_task *worker_resolve_start(knot_pkt_t *query, struct kr_qflags options) { struct worker_ctx *worker = the_worker; diff --git a/daemon/worker.h b/daemon/worker.h index 0e3e27580..5f7be5b43 
100644 --- a/daemon/worker.h +++ b/daemon/worker.h @@ -47,6 +47,9 @@ int worker_submit(struct session *session, */ int worker_end_tcp(struct session *session); +KR_EXPORT knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass, + const struct kr_qflags *options); + /** * Create a packet suitable for worker_resolve_start(). All in malloc() memory. */ @@ -96,6 +99,10 @@ int worker_del_tcp_connected(struct worker_ctx *worker, const struct sockaddr *addr); int worker_del_tcp_waiting(struct worker_ctx *worker, const struct sockaddr* addr); +struct session* worker_find_tcp_waiting(struct worker_ctx *worker, + const struct sockaddr* addr); +struct session* worker_find_tcp_connected(struct worker_ctx *worker, + const struct sockaddr* addr); knot_pkt_t *worker_task_get_pktbuf(const struct qr_task *task); struct request_ctx *worker_task_get_request(struct qr_task *task); @@ -136,7 +143,7 @@ struct worker_stats { /** @cond internal */ /** Number of request within timeout window. */ -#define MAX_PENDING KR_NSREP_MAXADDR +#define MAX_PENDING 4 /** Maximum response time from TCP upstream, milliseconds */ #define MAX_TCP_INACTIVITY (KR_RESOLVE_TIME_LIMIT + KR_CONN_RTT_MAX) diff --git a/daemon/zimport.c b/daemon/zimport.c index 71a3b1865..d302120e3 100644 --- a/daemon/zimport.c +++ b/daemon/zimport.c @@ -33,6 +33,7 @@ */ #include /* PRIu64 */ +#include #include #include #include diff --git a/doc/lib.rst b/doc/lib.rst index 101767fe5..f69f4efed 100644 --- a/doc/lib.rst +++ b/doc/lib.rst @@ -38,7 +38,7 @@ Cache Nameservers ----------- -.. doxygenfile:: nsrep.h +.. doxygenfile:: selection.h :project: libkres .. 
doxygenfile:: zonecut.h :project: libkres diff --git a/lib/defines.h b/lib/defines.h index 76a93cb18..dc8c37882 100644 --- a/lib/defines.h +++ b/lib/defines.h @@ -50,7 +50,7 @@ static inline int KR_COLD kr_error(int x) { #define KR_ITER_LIMIT 100 /* Built-in iterator limit */ #define KR_RESOLVE_TIME_LIMIT 10000 /* Upper limit for resolution time of single query, ms */ #define KR_CNAME_CHAIN_LIMIT 13 /* Built-in maximum CNAME chain length */ -#define KR_TIMEOUT_LIMIT 4 /* Maximum number of retries after timeout. */ +#define KR_TIMEOUT_LIMIT 10 /* Maximum number of retries after timeout. */ #define KR_QUERY_NSRETRY_LIMIT 4 /* Maximum number of retries per query. */ #define KR_COUNT_NO_NSADDR_LIMIT 5 #define KR_CONSUME_FAIL_ROW_LIMIT 3 /* Maximum number of KR_STATE_FAIL in a row. */ diff --git a/lib/layer/iterate.c b/lib/layer/iterate.c index e689c4321..791a6849c 100644 --- a/lib/layer/iterate.c +++ b/lib/layer/iterate.c @@ -29,7 +29,7 @@ #include "lib/resolve.h" #include "lib/rplan.h" #include "lib/defines.h" -#include "lib/nsrep.h" +#include "lib/selection.h" #include "lib/module.h" #include "lib/dnssec/ta.h" @@ -213,10 +213,12 @@ static void fetch_glue(knot_pkt_t *pkt, const knot_dname_t *ns, bool in_bailiwic if ((rr->type == KNOT_RRTYPE_A) && (req->ctx->options.NO_IPV4)) { + QVERBOSE_MSG(qry, "<= skipping IPv4 glue due to network settings\n"); continue; } if ((rr->type == KNOT_RRTYPE_AAAA) && (req->ctx->options.NO_IPV6)) { + QVERBOSE_MSG(qry, "<= skipping IPv6 glue due to network settings\n"); continue; } (void) update_nsaddr(rr, req->current_query, glue_cnt); @@ -258,6 +260,7 @@ static int update_cut(knot_pkt_t *pkt, const knot_rrset_t *rr, && knot_dname_in_bailiwick(qry->sname, rr->owner) >= 0; if (!ok) { VERBOSE_MSG("<= authority: ns outside bailiwick\n"); + qry->server_selection.error(qry, req->upstream.transport, KR_SELECTION_LAME_DELEGATION); #ifdef STRICT_MODE return KR_STATE_FAIL; #else @@ -632,10 +635,11 @@ static int process_referral_answer(knot_pkt_t 
*pkt, struct kr_request *req) { const knot_dname_t *cname = NULL; int state = unroll_cname(pkt, req, true, &cname); + struct kr_query *query = req->current_query; if (state != kr_ok()) { + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME); return KR_STATE_FAIL; } - struct kr_query *query = req->current_query; if (!(query->flags.CACHED)) { /* If not cached (i.e. got from upstream) * make sure that this is not an authoritative answer @@ -721,6 +725,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req) if (!is_authoritative(pkt, query)) { if (!(query->flags.FORWARD) && pkt_class & (PKT_NXDOMAIN|PKT_NODATA)) { + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_LAME_DELEGATION); VERBOSE_MSG("<= lame response: non-auth sent negative response\n"); return KR_STATE_FAIL; } @@ -730,6 +735,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req) /* Process answer type */ int state = unroll_cname(pkt, req, false, &cname); if (state != kr_ok()) { + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME); return state; } /* Make sure that this is an authoritative answer (even with AA=0) for other layers */ @@ -760,6 +766,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req) q->stype == query->stype && knot_dname_is_equal(q->sname, cname)) { VERBOSE_MSG("<= cname chain loop\n"); + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME); return KR_STATE_FAIL; } } @@ -777,12 +784,6 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req) if (query->flags.FORWARD) { next->forward_flags.CNAME = true; - if (query->parent == NULL) { - state = kr_nsrep_copy_set(&next->ns, &query->ns); - if (state != kr_ok()) { - return KR_STATE_FAIL; - } - } } next->cname_parent = query; /* Want DNSSEC if and only if it's posible to secure @@ -998,10 +999,8 @@ static int resolve_badmsg(knot_pkt_t *pkt, struct kr_request 
*req, struct kr_que /* Work around broken auths/load balancers */ if (query->flags.SAFEMODE) { return resolve_error(pkt, req); - } else if (query->flags.NO_MINIMIZE) { - query->flags.SAFEMODE = true; - return KR_STATE_DONE; } else { + query->flags.SAFEMODE = true; query->flags.NO_MINIMIZE = true; return KR_STATE_DONE; } @@ -1044,13 +1043,16 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt) return resolve_badmsg(pkt, req, query); } else #endif + /* LATER: Query minimization, 0x20 randomization, EDNS… should really be + * set and managed by selection.c and SAFEMODE should be split and + * removed altogether because it's doing many things at once. */ if (pkt->parsed <= KNOT_WIRE_HEADER_SIZE) { VERBOSE_MSG("<= malformed response (parsed %d)\n", (int)pkt->parsed); return resolve_badmsg(pkt, req, query); } else if (!is_paired_to_query(pkt, query)) { WITH_VERBOSE(query) { const char *ns_str = - req->upstream.addr ? kr_straddr(req->upstream.addr) : "(internal)"; + req->upstream.transport ? kr_straddr(&req->upstream.transport->address.ip) : "(internal)"; VERBOSE_MSG("<= ignoring mismatching response from %s\n", ns_str ? ns_str : "(kr_straddr failed)"); } @@ -1062,11 +1064,12 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt) VERBOSE_MSG("<= truncated response, failover to TCP\n"); if (query) { /* Fail if already on TCP. 
*/ - if (query->flags.TCP) { + if (req->upstream.transport->protocol != KR_TRANSPORT_UDP) { VERBOSE_MSG("<= TC=1 with TCP, bailing out\n"); + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED); return resolve_error(pkt, req); } - query->flags.TCP = true; + query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED); } return KR_STATE_CONSUME; } @@ -1079,6 +1082,10 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt) const knot_lookup_t *rcode = knot_lookup_by_id(knot_rcode_names, knot_wire_get_rcode(pkt->wire)); #endif + // We can't return directly from the switch because we have to give feedback to server selection first + int ret = 0; + int selection_error = -1; + /* Check response code. */ switch(knot_wire_get_rcode(pkt->wire)) { case KNOT_RCODE_NOERROR: @@ -1090,19 +1097,48 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt) knot_wire_set_rcode(req->answer->wire, KNOT_RCODE_YXDOMAIN); break; case KNOT_RCODE_REFUSED: + if (query->flags.STUB) { + /* just pass answer through if in stub mode */ + break; + } + selection_error = KR_SELECTION_REFUSED; + VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??"); + ret = resolve_badmsg(pkt, req, query); + break; case KNOT_RCODE_SERVFAIL: if (query->flags.STUB) { /* just pass answer through if in stub mode */ break; } - /* fall through */ + selection_error = KR_SELECTION_SERVFAIL; + VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??"); + ret = resolve_badmsg(pkt, req, query); + break; case KNOT_RCODE_FORMERR: + selection_error = KR_SELECTION_FORMERROR; + VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??"); + ret = resolve_badmsg(pkt, req, query); + break; case KNOT_RCODE_NOTIMPL: + selection_error = KR_SELECTION_NOTIMPL; VERBOSE_MSG("<= rcode: %s\n", rcode ? 
rcode->name : "??"); - return resolve_badmsg(pkt, req, query); + ret = resolve_badmsg(pkt, req, query); + break; default: + selection_error = KR_SELECTION_OTHER_RCODE; VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??"); - return resolve_error(pkt, req); + ret = resolve_error(pkt, req); + break; + } + + if (query->server_selection.initialized) { + if (selection_error != -1) { + query->server_selection.error(query, req->upstream.transport, selection_error); + } + } + + if (ret) { + return ret; } int state; @@ -1145,7 +1181,7 @@ rrarray_finalize: (void)0; ranked_rr_array_t *selected[] = kr_request_selected(req); for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) { - int ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool); + ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool); if (unlikely(ret)) { return KR_STATE_FAIL; } diff --git a/lib/layer/validate.c b/lib/layer/validate.c index cbbc0738a..aa8aa8be4 100644 --- a/lib/layer/validate.c +++ b/lib/layer/validate.c @@ -23,6 +23,7 @@ #include "lib/utils.h" #include "lib/defines.h" #include "lib/module.h" +#include "lib/selection.h" #define VERBOSE_MSG(qry, ...) QRVERBOSE(qry, "vldr", __VA_ARGS__) @@ -349,7 +350,7 @@ static knot_rrset_t *update_ds(struct kr_zonecut *cut, const knot_pktsection_t * return NULL; } } - return new_ds; + return new_ds; } static void mark_insecure_parents(const struct kr_query *qry) @@ -1190,11 +1191,22 @@ static int hide_bogus(kr_layer_t *ctx) { return ctx->state; } +static int validate_wrapper(kr_layer_t *ctx, knot_pkt_t *pkt) { + // Wrapper for now. + int ret = validate(ctx, pkt); + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + if (ret & KR_STATE_FAIL && qry->flags.DNSSEC_BOGUS) + qry->server_selection.error(qry, req->upstream.transport, KR_SELECTION_DNSSEC_ERROR); + return ret; +} + + /** Module implementation. 
*/ int validate_init(struct kr_module *self) { static const kr_layer_api_t layer = { - .consume = &validate, + .consume = &validate_wrapper, .answer_finalize = &hide_bogus, }; self->layer = &layer; diff --git a/lib/meson.build b/lib/meson.build index 6d6ec9ce6..d24d997c7 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -24,9 +24,11 @@ libkres_src = files([ 'layer/iterate.c', 'layer/validate.c', 'module.c', - 'nsrep.c', 'resolve.c', 'rplan.c', + 'selection.c', + 'selection_forward.c', + 'selection_iter.c', 'utils.c', 'zonecut.c', ]) @@ -52,9 +54,11 @@ libkres_headers = files([ 'layer.h', 'layer/iterate.h', 'module.h', - 'nsrep.h', 'resolve.h', 'rplan.h', + 'selection.h', + 'selection_forward.h', + 'selection_iter.h', 'utils.h', 'zonecut.h', ]) diff --git a/lib/nsrep.c b/lib/nsrep.c deleted file mode 100644 index c49f406c0..000000000 --- a/lib/nsrep.c +++ /dev/null @@ -1,570 +0,0 @@ -/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. - * SPDX-License-Identifier: GPL-3.0-or-later - */ - -#include -#include -#include -#include - -#include - -#include "lib/nsrep.h" -#include "lib/rplan.h" -#include "lib/resolve.h" -#include "lib/defines.h" -#include "lib/generic/pack.h" -#include "contrib/ucw/lib.h" - -/** Some built-in unfairness ... */ -#ifndef FAVOUR_IPV6 -#define FAVOUR_IPV6 20 /* 20ms bonus for v6 */ -#endif - -/** @internal Macro to set address structure. */ -#define ADDR_SET(sa, family, addr, len, port) do {\ - memcpy(&sa ## _addr, (addr), (len)); \ - sa ## _family = (family); \ - sa ## _port = htons(port); \ -} while (0) - -/** Update nameserver representation with current name/address pair. */ -static void update_nsrep(struct kr_nsrep *ns, size_t pos, uint8_t *addr, size_t addr_len, int port) -{ - if (addr == NULL) { - ns->addr[pos].ip.sa_family = AF_UNSPEC; - return; - } - - /* Rotate previous addresses to the right. 
*/ - memmove(ns->addr + pos + 1, ns->addr + pos, (KR_NSREP_MAXADDR - pos - 1) * sizeof(ns->addr[0])); - - switch(addr_len) { - case sizeof(struct in_addr): - ADDR_SET(ns->addr[pos].ip4.sin, AF_INET, addr, addr_len, port); break; - case sizeof(struct in6_addr): - ADDR_SET(ns->addr[pos].ip6.sin6, AF_INET6, addr, addr_len, port); break; - default: assert(0); break; - } -} - -static void update_nsrep_set(struct kr_nsrep *ns, const knot_dname_t *name, uint8_t *addr[], unsigned score) -{ - /* NSLIST is not empty, empty NS cannot be a leader. */ - if (!addr[0] && ns->addr[0].ip.sa_family != AF_UNSPEC) { - return; - } - /* Set new NS leader */ - ns->name = name; - ns->score = score; - for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) { - if (addr[i]) { - void *addr_val = pack_obj_val(addr[i]); - size_t len = pack_obj_len(addr[i]); - update_nsrep(ns, i, addr_val, len, KR_DNS_PORT); - } else { - break; - } - } -} - -#undef ADDR_SET - -/** - * \param addr_set pack with one IP address per element */ -static unsigned eval_addr_set(const pack_t *addr_set, struct kr_context *ctx, - struct kr_qflags opts, unsigned score, uint8_t *addr[]) -{ - kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt; - kr_nsrep_rtt_lru_entry_t *rtt_cache_entry_ptr[KR_NSREP_MAXADDR] = { NULL, }; - assert (KR_NSREP_MAXADDR >= 2); - unsigned rtt_cache_entry_score[KR_NSREP_MAXADDR] = { score, KR_NS_MAX_SCORE + 1, }; - uint64_t now = kr_now(); - - /* Name server is better candidate if it has address record. */ - for (uint8_t *it = pack_head(*addr_set); it != pack_tail(*addr_set); - it = pack_obj_next(it)) { - void *val = pack_obj_val(it); - size_t len = pack_obj_len(it); - unsigned favour = 0; - bool is_valid = false; - /* Check if the address isn't disabled. 
*/ - if (len == sizeof(struct in6_addr)) { - is_valid = !(opts.NO_IPV6); - favour = FAVOUR_IPV6; - } else if (len == sizeof(struct in_addr)) { - is_valid = !(opts.NO_IPV4); - } else { - assert(!EINVAL); - is_valid = false; - } - - if (!is_valid) { - continue; - } - - /* Get score for the current address. */ - kr_nsrep_rtt_lru_entry_t *cached = rtt_cache ? - lru_get_try(rtt_cache, val, len) : - NULL; - unsigned cur_addr_score = KR_NS_GLUED; - if (cached) { - cur_addr_score = cached->score; - if (cached->score >= KR_NS_TIMEOUT) { - /* If NS once was marked as "timeouted", - * it won't participate in NS elections - * at least ctx->cache_rtt_tout_retry_interval milliseconds. */ - uint64_t elapsed = now - cached->tout_timestamp; - elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed; - if (elapsed > ctx->cache_rtt_tout_retry_interval) { - /* Select this NS for probing in this particular query, - * but don't change the cached score. - * For other queries this NS will remain "timeouted". */ - cur_addr_score = KR_NS_LONG - 1; - } - } - } - - /* We can't always use favour. If these conditions held: - * - * rtt_cache_entry_score[i] < KR_NS_TIMEOUT - * rtt_cache_entry_score[i] + favour > KR_NS_TIMEOUT - * cur_addr_score < rtt_cache_entry_score[i] + favour - * - * we would prefer "certainly dead" cur_addr_score - * instead of "almost dead but alive" rtt_cache_entry_score[i] - */ - const unsigned cur_favour = cur_addr_score < KR_NS_TIMEOUT ? 
favour : 0; - for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) { - if (cur_addr_score >= rtt_cache_entry_score[i] + cur_favour) - continue; - - /* Shake down previous contenders */ - for (size_t j = KR_NSREP_MAXADDR - 1; j > i; --j) { - addr[j] = addr[j - 1]; - rtt_cache_entry_ptr[j] = rtt_cache_entry_ptr[j - 1]; - rtt_cache_entry_score[j] = rtt_cache_entry_score[j - 1]; - } - addr[i] = it; - rtt_cache_entry_score[i] = cur_addr_score; - rtt_cache_entry_ptr[i] = cached; - break; - } - } - - /* At this point, rtt_cache_entry_ptr contains up to KR_NSREP_MAXADDR - * pointers to the rtt cache entries with the best scores for the given addr_set. - * Check if there are timeouted NS. */ - - for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) { - if (rtt_cache_entry_ptr[i] == NULL) - continue; - if (rtt_cache_entry_ptr[i]->score < KR_NS_TIMEOUT) - continue; - - uint64_t elapsed = now - rtt_cache_entry_ptr[i]->tout_timestamp; - elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed; - if (elapsed <= ctx->cache_rtt_tout_retry_interval) - continue; - - /* rtt_cache_entry_ptr[i] points to "timeouted" rtt cache entry. - * The period of the ban on participation in elections has expired. */ - - if (VERBOSE_STATUS) { - void *val = pack_obj_val(addr[i]); - size_t len = pack_obj_len(addr[i]); - char sa_str[INET6_ADDRSTRLEN]; - int af = (len == sizeof(struct in6_addr)) ? 
AF_INET6 : AF_INET; - inet_ntop(af, val, sa_str, sizeof(sa_str)); - kr_log_verbose("[ ][nsre] probing timeouted NS: %s, score %i\n", - sa_str, rtt_cache_entry_ptr[i]->score); - } - - rtt_cache_entry_ptr[i]->tout_timestamp = now; - } - - return rtt_cache_entry_score[0]; -} - -static int eval_nsrep(const knot_dname_t *owner, const pack_t *addr_set, struct kr_query *qry) -{ - struct kr_nsrep *ns = &qry->ns; - struct kr_context *ctx = ns->ctx; - unsigned score = KR_NS_MAX_SCORE; - unsigned reputation = 0; - uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, }; - - /* Fetch NS reputation */ - if (ctx->cache_rep) { - unsigned *cached = lru_get_try(ctx->cache_rep, (const char *)owner, - knot_dname_size(owner)); - if (cached) { - reputation = *cached; - } - } - - /* Favour nameservers with unknown addresses to probe them, - * otherwise discover the current best address for the NS. */ - if (addr_set->len == 0) { - score = KR_NS_UNKNOWN; - /* If the server doesn't have IPv6, give it disadvantage. */ - if (reputation & KR_NS_NOIP6) { - score += FAVOUR_IPV6; - /* If the server is unknown but has rep record, treat it as timeouted */ - if (reputation & KR_NS_NOIP4) { - score = KR_NS_UNKNOWN; - /* Try to start with clean slate */ - if (!(qry->flags.NO_IPV6)) { - reputation &= ~KR_NS_NOIP6; - } - if (!(qry->flags.NO_IPV4)) { - reputation &= ~KR_NS_NOIP4; - } - } - } - } else { - score = eval_addr_set(addr_set, ctx, qry->flags, score, addr_choice); - } - - /* Probabilistic bee foraging strategy (naive). - * The fastest NS is preferred by workers until it is depleted (timeouts or degrades), - * at the same time long distance scouts probe other sources (low probability). - * Servers on TIMEOUT will not have probed at all. - * Servers with score above KR_NS_LONG will have periodically removed from - * reputation cache, so that kresd can reprobe them. 
*/ - if (score >= KR_NS_TIMEOUT) { - return kr_ok(); - } else if (score <= ns->score && - (score < KR_NS_LONG || qry->flags.NO_THROTTLE)) { - update_nsrep_set(ns, owner, addr_choice, score); - ns->reputation = reputation; - } else if (kr_rand_coin(1, 10) && - !kr_rand_coin(score, KR_NS_MAX_SCORE)) { - /* With 10% chance probe server with a probability - * given by its RTT / MAX_RTT. */ - update_nsrep_set(ns, owner, addr_choice, score); - ns->reputation = reputation; - return 1; /* Stop evaluation */ - } else if (ns->score > KR_NS_MAX_SCORE) { - /* Check if any server was already selected. - * If no, pick current server and continue evaluation. */ - update_nsrep_set(ns, owner, addr_choice, score); - ns->reputation = reputation; - } - - return kr_ok(); -} - -int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock) -{ - if (!qry) { - return kr_error(EINVAL); - } - if (index >= KR_NSREP_MAXADDR) { - return kr_error(ENOSPC); - } - - if (!sock) { - qry->ns.name = (const uint8_t *)""; - qry->ns.addr[index].ip.sa_family = AF_UNSPEC; - return kr_ok(); - } - - switch (sock->sa_family) { - case AF_INET: - if (qry->flags.NO_IPV4) { - return kr_error(ENOENT); - } - qry->ns.addr[index].ip4 = *(const struct sockaddr_in *)sock; - break; - case AF_INET6: - if (qry->flags.NO_IPV6) { - return kr_error(ENOENT); - } - qry->ns.addr[index].ip6 = *(const struct sockaddr_in6 *)sock; - break; - default: - qry->ns.addr[index].ip.sa_family = AF_UNSPEC; - return kr_error(EINVAL); - } - - qry->ns.name = (const uint8_t *)""; - /* Reset score on first entry */ - if (index == 0) { - qry->ns.score = KR_NS_UNKNOWN; - qry->ns.reputation = 0; - } - - /* Retrieve RTT from cache */ - struct kr_context *ctx = qry->ns.ctx; - kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = ctx - ? 
lru_get_try(ctx->cache_rtt, kr_inaddr(sock), kr_family_len(sock->sa_family)) - : NULL; - if (rtt_cache_entry) { - qry->ns.score = MIN(qry->ns.score, rtt_cache_entry->score); - } - - return kr_ok(); -} - -#define ELECT_INIT(ns, ctx_) do { \ - (ns)->ctx = (ctx_); \ - (ns)->addr[0].ip.sa_family = AF_UNSPEC; \ - (ns)->reputation = 0; \ - (ns)->score = KR_NS_MAX_SCORE + 1; \ -} while (0) - -int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx) -{ - if (!qry || !ctx) { - //assert(!EINVAL); - return kr_error(EINVAL); - } - - // First we dump the nsset into a temporary array - const int nsset_len = trie_weight(qry->zone_cut.nsset); - struct { - const knot_dname_t *name; - const pack_t *addrs; - } nsset[nsset_len]; - - trie_it_t *it; - int i = 0; - for (it = trie_it_begin(qry->zone_cut.nsset); !trie_it_finished(it); - trie_it_next(it), ++i) { - /* we trust it's a correct dname */ - nsset[i].name = (const knot_dname_t *)trie_it_key(it, NULL); - nsset[i].addrs = (const pack_t *)*trie_it_val(it); - } - trie_it_free(it); - assert(i == nsset_len); - - // Now we sort it randomly, by select-sort. - for (i = 0; i < nsset_len - 1; ++i) { - // The winner for position i will be uniformly chosen from indices >= i - const int j = i + kr_rand_bytes(1) % (nsset_len - i); - // Now we swap the winner with index i - if (i == j) continue; - __typeof__((nsset[i])) tmp = nsset[i]; - nsset[i] = nsset[j]; - nsset[j] = tmp; - } - - // Finally we run the original algorithm, in this randomized order. - struct kr_nsrep *ns = &qry->ns; - ELECT_INIT(ns, ctx); - int ret = kr_ok(); - for (i = 0; i < nsset_len; ++i) { - ret = eval_nsrep(nsset[i].name, nsset[i].addrs, qry); - if (ret) break; - } - - if (qry->ns.score <= KR_NS_MAX_SCORE && qry->ns.score >= KR_NS_LONG) { - /* This is a low-reliability probe, - * go with TCP to get ICMP reachability check. 
*/ - qry->flags.TCP = true; - } - return ret; -} - -int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx) -{ - if (!qry || !ctx) { - //assert(!EINVAL); - return kr_error(EINVAL); - } - - /* Get address list for this NS */ - struct kr_nsrep *ns = &qry->ns; - ELECT_INIT(ns, ctx); - pack_t *addr_set = kr_zonecut_find(&qry->zone_cut, ns->name); - if (!addr_set) { - return kr_error(ENOENT); - } - /* Evaluate addr list */ - uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, }; - unsigned score = eval_addr_set(addr_set, ctx, qry->flags, ns->score, addr_choice); - update_nsrep_set(ns, ns->name, addr_choice, score); - return kr_ok(); -} - -#undef ELECT_INIT - -int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr, - unsigned score, kr_nsrep_rtt_lru_t *cache, int umode) -{ - if (!cache || umode > KR_NS_MAX || umode < 0) { - return kr_error(EINVAL); - } - - /* Get `addr`, and later its raw string. */ - if (addr) { - /* Caller provided specific address, OK. */ - } else if (ns != NULL) { - addr = &ns->addr[0].ip; - } else { - assert(false && "kr_nsrep_update_rtt: don't know what address to update"); - return kr_error(EINVAL); - } - const char *addr_in = kr_inaddr(addr); - size_t addr_len = kr_inaddr_len(addr); - if (!addr_in || addr_len <= 0) { - assert(false && "kr_nsrep_update_rtt: incorrect address"); - return kr_error(EINVAL); - } - - bool is_new_entry = false; - kr_nsrep_rtt_lru_entry_t *cur = lru_get_new(cache, addr_in, addr_len, - (&is_new_entry)); - if (!cur) { - return kr_ok(); - } - if (score <= KR_NS_GLUED) { - score = KR_NS_GLUED + 1; - } - /* If there's nothing to update, we reset it unless KR_NS_UPDATE_NORESET - * mode was requested. New items are zeroed by LRU automatically. */ - if (is_new_entry && umode != KR_NS_UPDATE_NORESET) { - umode = KR_NS_RESET; - } - unsigned new_score = 0; - /* Update score, by default smooth over last two measurements. 
*/ - switch (umode) { - case KR_NS_UPDATE: - case KR_NS_UPDATE_NORESET: - new_score = (cur->score + score) / 2; break; - case KR_NS_RESET: new_score = score; break; - case KR_NS_ADD: new_score = MIN(KR_NS_MAX_SCORE - 1, cur->score + score); break; - case KR_NS_MAX: new_score = MAX(cur->score, score); break; - default: return kr_error(EINVAL); - } - /* Score limits */ - if (new_score > KR_NS_MAX_SCORE) { - new_score = KR_NS_MAX_SCORE; - } - if (new_score >= KR_NS_TIMEOUT && cur->score < KR_NS_TIMEOUT) { - /* Set the timestamp only when NS became "timeouted" */ - cur->tout_timestamp = kr_now(); - } - cur->score = new_score; - return kr_ok(); -} - -int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache) -{ - if (!ns || !cache ) { - return kr_error(EINVAL); - } - - /* Store in the struct */ - ns->reputation = reputation; - /* Store reputation in the LRU cache */ - unsigned *cur = lru_get_new(cache, (const char *)ns->name, - knot_dname_size(ns->name), NULL); - if (cur) { - *cur = reputation; - } - return kr_ok(); -} - -int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src) -{ - if (!dst || !src ) { - return kr_error(EINVAL); - } - - memcpy(dst, src, sizeof(struct kr_nsrep)); - dst->name = (const uint8_t *)""; - dst->score = KR_NS_UNKNOWN; - dst->reputation = 0; - - return kr_ok(); -} - -int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx) -{ - if (!ns || !ctx) { - assert(false); - return kr_error(EINVAL); - } - - kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt; - - ns->reputation = 0; - ns->score = KR_NS_MAX_SCORE + 1; - - if (ns->addr[0].ip.sa_family == AF_UNSPEC) { - return kr_error(EINVAL); - } - - /* Compute the scores. Unfortunately there's no space for scores - * along the addresses. 
*/ - unsigned scores[KR_NSREP_MAXADDR]; - int i; - bool timeouted_address_is_already_selected = false; - for (i = 0; i < KR_NSREP_MAXADDR; ++i) { - const struct sockaddr *sa = &ns->addr[i].ip; - if (sa->sa_family == AF_UNSPEC) { - break; - } - kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = lru_get_try(rtt_cache, - kr_inaddr(sa), - kr_family_len(sa->sa_family)); - if (!rtt_cache_entry) { - scores[i] = 1; /* prefer unknown to probe RTT */ - } else if (rtt_cache_entry->score < KR_NS_FWD_TIMEOUT) { - /* some probability to bump bad ones up for re-probe */ - scores[i] = rtt_cache_entry->score; - /* The lower the rtt, the more likely it will be selected. */ - if (!kr_rand_coin(rtt_cache_entry->score, KR_NS_FWD_TIMEOUT)) { - scores[i] = 1; - } - } else { - uint64_t now = kr_now(); - uint64_t elapsed = now - rtt_cache_entry->tout_timestamp; - scores[i] = KR_NS_MAX_SCORE + 1; - elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed; - if (elapsed > ctx->cache_rtt_tout_retry_interval && - !timeouted_address_is_already_selected) { - scores[i] = 1; - rtt_cache_entry->tout_timestamp = now; - timeouted_address_is_already_selected = true; - } - } - - /* Give advantage to IPv6. */ - if (scores[i] <= KR_NS_MAX_SCORE && sa->sa_family == AF_INET) { - scores[i] += FAVOUR_IPV6; - } - - if (VERBOSE_STATUS) { - kr_log_verbose("[ ][nsre] score %d for %s;\t cached RTT: %d\n", - scores[i], kr_straddr(sa), - rtt_cache_entry ? rtt_cache_entry->score : -1); - } - } - - /* Select-sort the addresses. 
*/ - const int count = i; - for (i = 0; i < count - 1; ++i) { - /* find min from i onwards */ - int min_i = i; - for (int j = i + 1; j < count; ++j) { - if (scores[j] < scores[min_i]) { - min_i = j; - } - } - /* swap the indices */ - if (min_i != i) { - SWAP(scores[min_i], scores[i]); - SWAP(ns->addr[min_i], ns->addr[i]); - } - } - - if (count > 0) { - ns->score = scores[0]; - ns->reputation = 0; - } - - return kr_ok(); -} diff --git a/lib/nsrep.h b/lib/nsrep.h deleted file mode 100644 index 57aecc804..000000000 --- a/lib/nsrep.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. - * SPDX-License-Identifier: GPL-3.0-or-later - */ - -#pragma once - -#include -#include -#include -#include - -#include "lib/defines.h" -#include "lib/generic/lru.h" - -struct kr_query; - -/** - * NS RTT score (special values). - * @note RTT is measured in milliseconds. - */ -enum kr_ns_score { - KR_NS_MAX_SCORE = 20 * KR_CONN_RTT_MAX, /* max possible value */ - KR_NS_FWD_TIMEOUT = (95 * 10000) / 100, /* timeout for upstream recursor, - * 95 percents from max resolution time */ - KR_NS_TIMEOUT = (95 * KR_CONN_RTT_MAX) / 100, /* timeout for upstream auth */ - KR_NS_LONG = (3 * KR_NS_TIMEOUT) / 4, - KR_NS_UNKNOWN = KR_NS_TIMEOUT / 2, - KR_NS_PENALTY = 100, - KR_NS_GLUED = 10 -}; - -/** - * See kr_nsrep_update_rtt() - */ -#define KR_NS_DEAD (((KR_NS_TIMEOUT * 4) + 3) / 3) -#define KR_NS_FWD_DEAD (((KR_NS_FWD_TIMEOUT * 4) + 3) / 3) - -/** If once NS was marked as "timeouted", it won't participate in NS elections - * at least KR_NS_TIMEOUT_RETRY_INTERVAL milliseconds (now: one second). */ -#define KR_NS_TIMEOUT_RETRY_INTERVAL 1000 - -/** - * NS QoS flags. - */ -enum kr_ns_rep { - KR_NS_NOIP4 = 1 << 0, /**< NS has no IPv4 */ - KR_NS_NOIP6 = 1 << 1, /**< NS has no IPv6 */ - KR_NS_NOEDNS = 1 << 2 /**< NS has no EDNS support */ -}; - -/** - * NS RTT update modes. - * First update is always KR_NS_RESET unless - * KR_NS_UPDATE_NORESET mode had choosen. 
- */ -enum kr_ns_update_mode { - KR_NS_UPDATE = 0, /**< Update as smooth over last two measurements */ - KR_NS_UPDATE_NORESET, /**< Same as KR_NS_UPDATE, but disable fallback to - * KR_NS_RESET on newly added entries. - * Zero is used as initial value. */ - KR_NS_RESET, /**< Set to given value */ - KR_NS_ADD, /**< Increment current value */ - KR_NS_MAX /**< Set to maximum of current/proposed value. */ -}; - -struct kr_nsrep_rtt_lru_entry { - unsigned score; /* combined rtt */ - uint64_t tout_timestamp; /* The time when score became - * greater or equal then KR_NS_TIMEOUT. - * Is meaningful only when score >= KR_NS_TIMEOUT */ -}; - -typedef struct kr_nsrep_rtt_lru_entry kr_nsrep_rtt_lru_entry_t; - -/** - * NS QoS tracking. - */ -typedef lru_t(kr_nsrep_rtt_lru_entry_t) kr_nsrep_rtt_lru_t; - -/** - * NS reputation tracking. - */ -typedef lru_t(unsigned) kr_nsrep_lru_t; - -/* Maximum count of addresses probed in one go (last is left empty) */ -#define KR_NSREP_MAXADDR 4 - -/** - * Name server representation. - * Contains extra information about the name server, e.g. score - * or other metadata. - */ -struct kr_nsrep -{ - unsigned score; /**< NS score */ - unsigned reputation; /**< NS reputation */ - const knot_dname_t *name; /**< NS name */ - struct kr_context *ctx; /**< Resolution context */ - union inaddr addr[KR_NSREP_MAXADDR]; /**< NS address(es) */ -}; - -/** - * Set given NS address. (Very low-level access to the list.) - * @param qry updated query - * @param index index of the updated target - * @param sock socket address to use (sockaddr_in or sockaddr_in6 or NULL) - * @return 0 or an error code, in particular kr_error(ENOENT) for net.ipvX - */ -KR_EXPORT -int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock); - -/** - * Elect best nameserver/address pair from the nsset. 
- * @param qry updated query - * @param ctx resolution context - * @return 0 or an error code - */ -KR_EXPORT -int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx); - -/** - * Elect best nameserver/address pair from the nsset. - * @param qry updated query - * @param ctx resolution context - * @return 0 or an error code - */ -KR_EXPORT -int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx); - -/** - * Update NS address RTT information. - * - * @brief In KR_NS_UPDATE mode reputation is smoothed over last N measurements. - * - * @param ns updated NS representation - * @param addr chosen address (NULL for first) - * @param score new score (i.e. RTT), see enum kr_ns_score - * @param cache RTT LRU cache - * @param umode update mode (KR_NS_UPDATE or KR_NS_RESET or KR_NS_ADD) - * @return 0 on success, error code on failure - */ -KR_EXPORT -int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr, - unsigned score, kr_nsrep_rtt_lru_t *cache, int umode); - -/** - * Update NSSET reputation information. - * - * @param ns updated NS representation - * @param reputation combined reputation flags, see enum kr_ns_rep - * @param cache LRU cache - * @return 0 on success, error code on failure - */ -KR_EXPORT -int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache); -/** - * Copy NSSET reputation information and resets score. - * - * @param dst updated NS representation - * @param src source NS representation - * @return 0 on success, error code on failure - */ -int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src); - -/** - * Sort addresses in the query nsrep list by cached RTT. - * if RTT is greater then KR_NS_TIMEOUT, address will placed at the beginning of the - * nsrep list once in cache.ns_tout() milliseconds. Otherwise it will be sorted - * as if it has cached RTT equal to KR_NS_MAX_SCORE + 1. - * @param ns updated kr_nsrep - * @param ctx name resolution context. 
- * @return 0 or an error code - * @note ns reputation is zeroed and score is set to KR_NS_MAX_SCORE + 1. - */ -KR_EXPORT -int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx); diff --git a/lib/resolve.c b/lib/resolve.c index 79438b26e..50ea4ac49 100644 --- a/lib/resolve.c +++ b/lib/resolve.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "kresconfig.h" #include "lib/resolve.h" #include "lib/layer.h" @@ -147,7 +148,7 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret return; } assert(qname); - const int len = knot_dname_size(qname) - 2; /* Skip first, last label. */ + const int len = knot_dname_size(qname) - 2; /* Skip first, last label. First is length, last is always root */ for (int i = 0; i < len; ++i) { /* Note: this relies on the fact that correct label lengths * can't pass the isletter() test (by "luck"). */ @@ -157,23 +158,6 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret } } -/** Invalidate current NS/addr pair. */ -static int invalidate_ns(struct kr_rplan *rplan, struct kr_query *qry) -{ - if (qry->ns.addr[0].ip.sa_family != AF_UNSPEC) { - const char *addr = kr_inaddr(&qry->ns.addr[0].ip); - int addr_len = kr_inaddr_len(&qry->ns.addr[0].ip); - int ret = kr_zonecut_del(&qry->zone_cut, qry->ns.name, addr, addr_len); - /* Also remove it from the qry->ns.addr array. - * That's useful at least for STUB and FORWARD modes. */ - memmove(qry->ns.addr, qry->ns.addr + 1, - sizeof(qry->ns.addr[0]) * (KR_NSREP_MAXADDR - 1)); - return ret; - } else { - return kr_zonecut_del_all(&qry->zone_cut, qry->ns.name); - } -} - /** This turns of QNAME minimisation if there is a non-terminal between current zone cut, and name target. * It save several minimization steps, as the zone cut is likely final one. 
*/ @@ -310,71 +294,6 @@ static int ns_fetch_cut(struct kr_query *qry, const knot_dname_t *requested_name return KR_STATE_PRODUCE; } -static int ns_resolve_addr(struct kr_query *qry, struct kr_request *req) -{ - struct kr_rplan *rplan = &req->rplan; - struct kr_context *ctx = req->ctx; - - - /* Start NS queries from root, to avoid certain cases - * where a NS drops out of cache and the rest is unavailable, - * this would lead to dependency loop in current zone cut. - * Prefer IPv6 and continue with IPv4 if not available. - */ - uint16_t next_type = 0; - if (!(qry->flags.AWAIT_IPV6) && - !(ctx->options.NO_IPV6)) { - next_type = KNOT_RRTYPE_AAAA; - qry->flags.AWAIT_IPV6 = true; - } else if (!(qry->flags.AWAIT_IPV4) && - !(ctx->options.NO_IPV4)) { - next_type = KNOT_RRTYPE_A; - qry->flags.AWAIT_IPV4 = true; - /* Hmm, no useable IPv6 then. */ - qry->ns.reputation |= KR_NS_NOIP6; - kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep); - } - /* Bail out if the query is already pending or dependency loop. */ - if (!next_type || kr_rplan_satisfies(qry->parent, qry->ns.name, KNOT_CLASS_IN, next_type)) { - /* Fall back to SBELT if root server query fails. */ - if (!next_type && qry->zone_cut.name[0] == '\0') { - VERBOSE_MSG(qry, "=> fallback to root hints\n"); - kr_zonecut_set_sbelt(ctx, &qry->zone_cut); - qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */ - return kr_error(EAGAIN); - } - /* No IPv4 nor IPv6, flag server as unusable. 
*/ - ++req->count_no_nsaddr; - VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out (counter: %u)\n", - req->count_no_nsaddr); - qry->ns.reputation |= KR_NS_NOIP4 | KR_NS_NOIP6; - kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep); - invalidate_ns(rplan, qry); - return kr_error(EHOSTUNREACH); - } - /* Push new query to the resolution plan */ - struct kr_query *next = - kr_rplan_push(rplan, qry, qry->ns.name, KNOT_CLASS_IN, next_type); - if (!next) { - return kr_error(ENOMEM); - } - next->flags.NONAUTH = true; - - /* At the root level with no NS addresses, add SBELT subrequest. */ - int ret = 0; - if (qry->zone_cut.name[0] == '\0') { - ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut); - if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */ - kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut); - kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */ - qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */ - } - } else { - next->flags.AWAIT_CUT = true; - } - return ret; -} - static int edns_put(knot_pkt_t *pkt, bool reclaim) { if (!pkt->opt_rr) { @@ -776,7 +695,7 @@ static int resolve_query(struct kr_request *request, const knot_pkt_t *packet) return request->state; } -knot_pkt_t * kr_request_ensure_answer(struct kr_request *request) +knot_pkt_t *kr_request_ensure_answer(struct kr_request *request) { if (request->answer) return request->answer; @@ -839,84 +758,6 @@ enomem: return request->answer = NULL; } -KR_PURE static bool kr_inaddr_equal(const struct sockaddr *a, const struct sockaddr *b) -{ - const int a_len = kr_inaddr_len(a); - const int b_len = kr_inaddr_len(b); - return a_len == b_len && memcmp(kr_inaddr(a), kr_inaddr(b), a_len) == 0; -} - -static void update_nslist_rtt(struct kr_context *ctx, struct kr_query *qry, const struct sockaddr *src) -{ - /* Do not track in safe mode. 
*/ - if (qry->flags.SAFEMODE) { - return; - } - - /* Calculate total resolution time from the time the query was generated. */ - uint64_t elapsed = kr_now() - qry->timestamp_mono; - elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed; - - /* NSs in the preference list prior to the one who responded will be penalised - * with the RETRY timer interval. This is because we know they didn't respond - * for N retries, so their RTT must be at least N * RETRY. - * The NS in the preference list that responded will have RTT relative to the - * time when the query was sent out, not when it was originated. - */ - for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) { - const struct sockaddr *addr = &qry->ns.addr[i].ip; - if (addr->sa_family == AF_UNSPEC) { - break; - } - /* If this address is the source of the answer, update its RTT */ - if (kr_inaddr_equal(src, addr)) { - kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_UPDATE); - WITH_VERBOSE(qry) { - char addr_str[INET6_ADDRSTRLEN]; - inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "<= server: '%s' rtt: %"PRIu64" ms\n", - addr_str, elapsed); - } - } else { - /* Response didn't come from this IP, but we know the RTT must be at least - * several RETRY timer tries, e.g. if we have addresses [a, b, c] and we have - * tried [a, b] when the answer from 'a' came after 350ms, then we know - * that 'b' didn't respond for at least 350 - (1 * 300) ms. We can't say that - * its RTT is 50ms, but we can say that its score shouldn't be less than 50. 
*/ - kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_MAX); - WITH_VERBOSE(qry) { - char addr_str[INET6_ADDRSTRLEN]; - inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "<= server: '%s' rtt: >= %"PRIu64" ms\n", - addr_str, elapsed); - } - } - /* Subtract query start time from elapsed time */ - if (elapsed < KR_CONN_RETRY) { - break; - } - elapsed = elapsed - KR_CONN_RETRY; - } -} - -static void update_nslist_score(struct kr_request *request, struct kr_query *qry, const struct sockaddr *src, knot_pkt_t *packet) -{ - struct kr_context *ctx = request->ctx; - /* On successful answer, update preference list RTT and penalise timer */ - if (!(request->state & KR_STATE_FAIL)) { - /* Update RTT information for preference list */ - update_nslist_rtt(ctx, qry, src); - /* Do not complete NS address resolution on soft-fail. */ - const int rcode = packet ? knot_wire_get_rcode(packet->wire) : 0; - if (rcode != KNOT_RCODE_SERVFAIL && rcode != KNOT_RCODE_REFUSED) { - qry->flags.AWAIT_IPV6 = false; - qry->flags.AWAIT_IPV4 = false; - } else { /* Penalize SERVFAILs. 
*/ - kr_nsrep_update_rtt(&qry->ns, src, KR_NS_PENALTY, ctx->cache_rtt, KR_NS_ADD); - } - } -} - static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now) { uint64_t resolving_time = now - qry->creation_time_mono; @@ -929,7 +770,7 @@ static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now) return false; } -int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet) +int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet) { struct kr_rplan *rplan = &request->rplan; @@ -946,11 +787,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k } bool tried_tcp = (qry->flags.TCP); if (!packet || packet->size == 0) { - if (tried_tcp) { - request->state = KR_STATE_FAIL; - } else { - qry->flags.TCP = true; - } + return KR_STATE_PRODUCE; } else { /* Packet cleared, derandomize QNAME. */ knot_dname_t *qname_raw = knot_pkt_qname(packet); @@ -963,25 +800,29 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k } else { /* Fill in source and latency information. */ request->upstream.rtt = kr_now() - qry->timestamp_mono; - request->upstream.addr = src; + request->upstream.transport = transport ? *transport : NULL; ITERATE_LAYERS(request, qry, consume, packet); /* Clear temporary information */ - request->upstream.addr = NULL; + request->upstream.transport = NULL; request->upstream.rtt = 0; } } - /* Track RTT for iterative answers */ - if (src && !(qry->flags.CACHED)) { - update_nslist_score(request, qry, src, packet); + if (transport && !qry->flags.CACHED) { + if (!(request->state & KR_STATE_FAIL)) { + /* Do not complete NS address resolution on soft-fail. */ + const int rcode = packet ? 
knot_wire_get_rcode(packet->wire) : 0; + if (rcode != KNOT_RCODE_SERVFAIL && rcode != KNOT_RCODE_REFUSED) { + qry->flags.AWAIT_IPV6 = false; + qry->flags.AWAIT_IPV4 = false; + } + } } - /* Resolution failed, invalidate current NS. */ + if (request->state & KR_STATE_FAIL) { - invalidate_ns(rplan, qry); qry->flags.RESOLVED = false; } - /* For multiple errors in a row; invalidate_ns() is not enough. */ if (!qry->flags.CACHED) { if (request->state & KR_STATE_FAIL) { if (++request->count_fail_row > KR_CONSUME_FAIL_ROW_LIMIT) { @@ -1016,7 +857,12 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k /* Do not finish with bogus answer. */ if (qry->flags.DNSSEC_BOGUS) { - return KR_STATE_FAIL; + if (qry->flags.FORWARD || qry->flags.STUB) { + return KR_STATE_FAIL; + } + /* Other servers might not have broken DNSSEC. */ + qry->flags.DNSSEC_BOGUS = false; + return KR_STATE_PRODUCE; } return kr_rplan_empty(&request->rplan) ? KR_STATE_DONE : KR_STATE_PRODUCE; @@ -1368,17 +1214,81 @@ static int zone_cut_check(struct kr_request *request, struct kr_query *qry, knot return trust_chain_check(request, qry); } -int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet) + +static int ns_resolve_addr(struct kr_query *qry, struct kr_request *param, struct kr_transport *transport, uint16_t next_type) +{ + struct kr_rplan *rplan = ¶m->rplan; + struct kr_context *ctx = param->ctx; + + + /* Start NS queries from root, to avoid certain cases + * where a NS drops out of cache and the rest is unavailable, + * this would lead to dependency loop in current zone cut. + */ + + /* Bail out if the query is already pending or dependency loop. */ + if (!next_type || kr_rplan_satisfies(qry->parent, transport->ns_name, KNOT_CLASS_IN, next_type)) { + /* Fall back to SBELT if root server query fails. 
*/ + if (!next_type && qry->zone_cut.name[0] == '\0') { + VERBOSE_MSG(qry, "=> fallback to root hints\n"); + kr_zonecut_set_sbelt(ctx, &qry->zone_cut); + qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */ + return kr_error(EAGAIN); + } + /* No IPv4 nor IPv6, flag server as unusable. */ + VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out\n"); + kr_zonecut_del_all(&qry->zone_cut, transport->ns_name); + return kr_error(EHOSTUNREACH); + } + /* Push new query to the resolution plan */ + struct kr_query *next = + kr_rplan_push(rplan, qry, transport->ns_name, KNOT_CLASS_IN, next_type); + if (!next) { + return kr_error(ENOMEM); + } + next->flags.NONAUTH = true; + + /* At the root level with no NS addresses, add SBELT subrequest. */ + int ret = 0; + if (qry->zone_cut.name[0] == '\0') { + ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut); + if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */ + kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut); + kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */ + qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */ + } + } else { + next->flags.AWAIT_CUT = true; + } + + if (ret == 0) { + if (next_type == KNOT_RRTYPE_AAAA) { + qry->flags.AWAIT_IPV6 = true; + } else { + qry->flags.AWAIT_IPV4 = true; + } + } + + return ret; +} + +int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet) { struct kr_rplan *rplan = &request->rplan; - unsigned ns_election_iter = 0; /* No query left for resolution */ if (kr_rplan_empty(rplan)) { return KR_STATE_FAIL; } - /* If we have deferred answers, resume them. */ + struct kr_query *qry = array_tail(rplan->pending); + + /* Initialize server selection */ + if (!qry->server_selection.initialized) { + kr_server_selection_init(qry); + } + + /* If we have deferred answers, resume them. 
*/ if (qry->deferred != NULL) { /* @todo: Refactoring validator, check trust chain before resuming. */ int state = 0; @@ -1456,70 +1366,42 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t } } -ns_election: - - if (unlikely(request->count_no_nsaddr >= KR_COUNT_NO_NSADDR_LIMIT)) { - VERBOSE_MSG(qry, "=> too many unresolvable NSs, bail out " - "(mitigation for NXNSAttack CVE-2020-12667)\n"); - return KR_STATE_FAIL; - } - /* If the query has already selected a NS and is waiting for IPv4/IPv6 record, - * elect best address only, otherwise elect a completely new NS. - */ - if(++ns_election_iter >= KR_ITER_LIMIT) { - VERBOSE_MSG(qry, "=> couldn't converge NS selection, bail out\n"); - return KR_STATE_FAIL; - } const struct kr_qflags qflg = qry->flags; const bool retry = qflg.TCP || qflg.BADCOOKIE_AGAIN; - if (qflg.AWAIT_IPV4 || qflg.AWAIT_IPV6) { - kr_nsrep_elect_addr(qry, request->ctx); - } else if (qflg.FORWARD || qflg.STUB) { - kr_nsrep_sort(&qry->ns, request->ctx); - if (qry->ns.score > KR_NS_MAX_SCORE) { - /* At the moment all NS have bad reputation. - * But there can be existing connections*/ - VERBOSE_MSG(qry, "=> no valid NS left\n"); - return KR_STATE_FAIL; - } - } else if (!qry->ns.name || !retry) { /* Keep NS when requerying/stub/badcookie. */ + if (!qflg.FORWARD && !qflg.STUB && !retry) { /* Keep NS when requerying/stub/badcookie. */ /* Root DNSKEY must be fetched from the hints to avoid chicken and egg problem. 
*/ if (qry->sname[0] == '\0' && qry->stype == KNOT_RRTYPE_DNSKEY) { kr_zonecut_set_sbelt(request->ctx, &qry->zone_cut); qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */ } - kr_nsrep_elect(qry, request->ctx); - if (qry->ns.score > KR_NS_MAX_SCORE) { - if (kr_zonecut_is_empty(&qry->zone_cut)) { - VERBOSE_MSG(qry, "=> no NS with an address\n"); - } else { - VERBOSE_MSG(qry, "=> no valid NS left\n"); - } - if (!qry->flags.NO_NS_FOUND) { - qry->flags.NO_NS_FOUND = true; - } else { - ITERATE_LAYERS(request, qry, reset); - kr_rplan_pop(rplan, qry); - } - return KR_STATE_PRODUCE; - } } - /* Resolve address records */ - if (qry->ns.addr[0].ip.sa_family == AF_UNSPEC) { - int ret = ns_resolve_addr(qry, request); - if (ret != 0) { - qry->flags.AWAIT_IPV6 = false; - qry->flags.AWAIT_IPV4 = false; - qry->flags.TCP = false; - qry->ns.name = NULL; - goto ns_election; /* Must try different NS */ + qry->server_selection.choose_transport(qry, transport); + + if (*transport == NULL) { + /* Properly signal to serve_stale module. */ + if (qry->flags.NO_NS_FOUND) { + ITERATE_LAYERS(request, qry, reset); + kr_rplan_pop(rplan, qry); + } else { + /* FIXME: This is probably quite inefficient: + * we go through the whole qr_task_step loop just because of the serve_stale + * module which might not even be loaded. */ + qry->flags.NO_NS_FOUND = true; } + return KR_STATE_PRODUCE; + } + + if ((*transport)->protocol == KR_TRANSPORT_RESOLVE_A || (*transport)->protocol == KR_TRANSPORT_RESOLVE_AAAA) { + uint16_t type = (*transport)->protocol == KR_TRANSPORT_RESOLVE_A ? KNOT_RRTYPE_A : KNOT_RRTYPE_AAAA; + ns_resolve_addr(qry, qry->request, *transport, type); ITERATE_LAYERS(request, qry, reset); return KR_STATE_PRODUCE; } + qry->flags.SAFEMODE = qry->flags.SAFEMODE || (*transport)->safe_mode; + /* Randomize query case (if not in safe mode or turned off) */ qry->secret = (qry->flags.SAFEMODE || qry->flags.NO_0X20) ? 
0 : kr_rand_bytes(sizeof(qry->secret)); @@ -1531,8 +1413,6 @@ ns_election: * kr_resolve_checkout(). */ qry->timestamp_mono = kr_now(); - *dst = &qry->ns.addr[0].ip; - *type = (qry->flags.TCP) ? SOCK_STREAM : SOCK_DGRAM; return request->state; } @@ -1569,7 +1449,7 @@ static bool outbound_request_update_cookies(struct kr_request *req, #endif /* ENABLE_COOKIES */ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src, - struct sockaddr *dst, int type, knot_pkt_t *packet) + struct kr_transport *transport, knot_pkt_t *packet) { /* @todo: Update documentation if this function becomes approved. */ @@ -1593,7 +1473,7 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src, * actual cookie. If we don't know the server address then we * also don't know the actual cookie size. */ - if (!outbound_request_update_cookies(request, src, dst)) { + if (!outbound_request_update_cookies(request, src, &transport->address.ip)) { return kr_error(EINVAL); } } @@ -1610,8 +1490,20 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src, /* Run the checkout layers and cancel on failure. * The checkout layer doesn't persist the state, so canceled subrequests * don't affect the resolution or rest of the processing. 
*/ + int type = -1; + switch(transport->protocol) { + case KR_TRANSPORT_UDP: + type = SOCK_DGRAM; + break; + case KR_TRANSPORT_TCP: + case KR_TRANSPORT_TLS: + type = SOCK_STREAM; + break; + default: + assert(0); + } int state = request->state; - ITERATE_LAYERS(request, qry, checkout, packet, dst, type); + ITERATE_LAYERS(request, qry, checkout, packet, &transport->address.ip, type); if (request->state & KR_STATE_FAIL) { request->state = state; /* Restore */ return kr_error(ECANCELED); @@ -1634,26 +1526,17 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src, WITH_VERBOSE(qry) { KR_DNAME_GET_STR(qname_str, knot_pkt_qname(packet)); + KR_DNAME_GET_STR(ns_name, transport->ns_name); KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name); KR_RRTYPE_GET_STR(type_str, knot_pkt_qtype(packet)); + const char *ns_str = kr_straddr(&transport->address.ip); - for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) { - struct sockaddr *addr = &qry->ns.addr[i].ip; - if (addr->sa_family == AF_UNSPEC) { - break; - } - if (!kr_inaddr_equal(dst, addr)) { - continue; - } - const char *ns_str = kr_straddr(addr); - VERBOSE_MSG(qry, - "=> id: '%05u' querying: '%s' score: %u zone cut: '%s' " + VERBOSE_MSG(qry, + "=> id: '%05u' querying: '%s'@'%s' zone cut: '%s' " "qname: '%s' qtype: '%s' proto: '%s'\n", - qry->id, ns_str ? ns_str : "", qry->ns.score, zonecut_str, + qry->id, ns_name, ns_str ? ns_str : "", zonecut_str, qname_str, type_str, (qry->flags.TCP) ? 
"tcp" : "udp"); - - break; - }} + } return kr_ok(); } diff --git a/lib/resolve.h b/lib/resolve.h index 1095fa9ac..e5855cc3e 100644 --- a/lib/resolve.h +++ b/lib/resolve.h @@ -13,7 +13,7 @@ #include "lib/layer.h" #include "lib/generic/map.h" #include "lib/generic/array.h" -#include "lib/nsrep.h" +#include "lib/selection.h" #include "lib/rplan.h" #include "lib/module.h" #include "lib/cache/api.h" @@ -161,9 +161,7 @@ struct kr_context map_t negative_anchors; struct kr_zonecut root_hints; struct kr_cache cache; - kr_nsrep_rtt_lru_t *cache_rtt; unsigned cache_rtt_tout_retry_interval; - kr_nsrep_lru_t *cache_rep; module_array_t *modules; /* The cookie context structure should not be held within the cookies * module because of better access. */ @@ -182,6 +180,10 @@ struct kr_request_qsource_flags { bool xdp:1; /**< true if the request is on AF_XDP; only meaningful if (dst_addr). */ }; +typedef bool (*addr_info_f)(struct sockaddr*); +typedef void (*async_resolution_f)(knot_dname_t*, enum knot_rr_type); +typedef array_t(union inaddr) inaddr_array_t; + /** * Name resolution request. * @@ -210,7 +212,7 @@ struct kr_request { } qsource; struct { unsigned rtt; /**< Current upstream RTT */ - const struct sockaddr *addr; /**< Current upstream address */ + const struct kr_transport *transport; /**< Current upstream transport */ } upstream; /**< Upstream information, valid only in consume() phase */ struct kr_qflags options; int state; @@ -235,6 +237,12 @@ struct kr_request { int vars_ref; /**< Reference to per-request variable table. LUA_NOREF if not set. */ knot_mm_t pool; unsigned int uid; /**< for logging purposes only */ + struct { + addr_info_f is_tls_capable; + addr_info_f is_tcp_connected; + addr_info_f is_tcp_waiting; + inaddr_array_t forwarding_targets; /**< When forwarding, possible targets are put here */ + } selection_context; unsigned int count_no_nsaddr; unsigned int count_fail_row; alloc_wire_f alloc_wire_cb; /**< CB to allocate answer wire (can be NULL). 
*/ @@ -281,7 +289,7 @@ knot_pkt_t * kr_request_ensure_answer(struct kr_request *request); * @return any state */ KR_EXPORT -int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet); +int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet); /** * Produce either next additional query or finish. @@ -297,7 +305,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k * @return any state */ KR_EXPORT -int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet); +int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet); /** * Finalises the outbound query packet with the knowledge of the IP addresses. @@ -313,7 +321,7 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t */ KR_EXPORT int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src, - struct sockaddr *dst, int type, knot_pkt_t *packet); + struct kr_transport *transport, knot_pkt_t *packet); /** * Finish resolution and commit results if the state is DONE. @@ -343,4 +351,3 @@ struct kr_rplan *kr_resolve_plan(struct kr_request *request); */ KR_EXPORT KR_PURE knot_mm_t *kr_resolve_pool(struct kr_request *request); - diff --git a/lib/rplan.c b/lib/rplan.c index 18dc6b827..02e32d3e4 100644 --- a/lib/rplan.c +++ b/lib/rplan.c @@ -159,22 +159,13 @@ static struct kr_query *kr_rplan_push_query(struct kr_rplan *rplan, qry->flags = rplan->request->options; qry->parent = parent; qry->request = rplan->request; - qry->ns.ctx = rplan->request->ctx; - qry->ns.addr[0].ip.sa_family = AF_UNSPEC; + gettimeofday(&qry->timestamp, NULL); qry->timestamp_mono = kr_now(); qry->creation_time_mono = parent ? parent->creation_time_mono : qry->timestamp_mono; kr_zonecut_init(&qry->zone_cut, (const uint8_t *)"", rplan->pool); qry->reorder = qry->flags.REORDER_RR ? 
kr_rand_bytes(sizeof(qry->reorder)) : 0; - /* When forwarding, keep the nameserver addresses. */ - if (parent && parent->flags.FORWARD && qry->flags.FORWARD) { - ret = kr_nsrep_copy_set(&qry->ns, &parent->ns); - if (ret) { - query_free(rplan->pool, qry); - return NULL; - } - } assert((rplan->pending.len == 0 && rplan->resolved.len == 0) == (rplan->initial == NULL)); diff --git a/lib/rplan.h b/lib/rplan.h index e69a3f86c..c3d28a263 100644 --- a/lib/rplan.h +++ b/lib/rplan.h @@ -8,9 +8,9 @@ #include #include +#include "lib/selection.h" #include "lib/cache/api.h" #include "lib/zonecut.h" -#include "lib/nsrep.h" /** Query flags */ struct kr_qflags { @@ -101,8 +101,7 @@ struct kr_query { struct kr_query *cname_parent; struct kr_request *request; /**< Parent resolution request. */ kr_stale_cb stale_cb; /**< See the type */ - /* Beware: this must remain the last, because of lua bindings. */ - struct kr_nsrep ns; + struct kr_server_selection server_selection; }; /** @cond internal Array of queries. */ diff --git a/lib/selection.c b/lib/selection.c new file mode 100644 index 000000000..b9997e92d --- /dev/null +++ b/lib/selection.c @@ -0,0 +1,596 @@ +#include + +#include "lib/selection.h" +#include "lib/selection_forward.h" +#include "lib/selection_iter.h" +#include "lib/generic/pack.h" +#include "lib/generic/trie.h" +#include "lib/rplan.h" +#include "lib/cache/api.h" +#include "lib/resolve.h" + +#include "daemon/worker.h" +#include "daemon/tls.h" + +#include "lib/utils.h" + +#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__) + +/** @internal Macro to set address structure. */ +#define ADDR_SET(sa, family, addr, len, port) do {\ + memcpy(&sa ## _addr, (addr), (len)); \ + sa ## _family = (family); \ + sa ## _port = htons(port); \ +} while (0) + +#define DEFAULT_TIMEOUT 800 +#define MAX_TIMEOUT 10000 +#define MAX_BACKOFF 5 +#define MINIMAL_TIMEOUT_ADDITION 20 + +/* After TCP_TIMEOUT_THRESHOLD timeouts one transport, we'll switch to TCP. 
*/ +#define TCP_TIMEOUT_THRESHOLD 2 +/* If the expected RTT is over TCP_RTT_THRESHOLD we switch to TCP instead. */ +#define TCP_RTT_THRESHOLD 2000 + +/* Define ε for ε-greedy algorithm (see select_transport) + * as ε=EPSILON_NOMIN/EPSILON_DENOM */ +#define EPSILON_NOMIN 1 +#define EPSILON_DENOM 20 + +/* Simple cache interface follows */ + +#define KEY_PREFIX 'S' + +void *prefix_key(const uint8_t *ip, size_t len) +{ + void *key = malloc(len + 1); + *(char *)key = KEY_PREFIX; + memcpy((uint8_t *)key + 1, ip, len); + return key; +} + +#undef PREFIX + +/* First value of timeout will be calculated as SRTT+4*variance + * by calc_timeout(), so it'll be equal to DEFAULT_TIMEOUT. */ +static const struct rtt_state default_rtt_state = { .srtt = 0, + .variance = + DEFAULT_TIMEOUT / 4, + .consecutive_timeouts = 0, + .dead_since = 0 }; + +/* Note that this opens a cache transaction, which is usually closed by calling + * `put_rtt_state`, i.e. the caller is responsible for closing it + * (e.g. calling kr_cache_commit). 
*/ +struct rtt_state get_rtt_state(const uint8_t *ip, size_t len, + struct kr_cache *cache) +{ + struct rtt_state state; + knot_db_val_t value; + knot_db_t *db = cache->db; + struct kr_cdb_stats *stats = &cache->stats; + uint8_t *prefixed_ip = prefix_key(ip, len); + + knot_db_val_t key = { .len = len + 1, .data = prefixed_ip }; + + if (cache->api->read(db, stats, &key, &value, 1)) { + state = default_rtt_state; + } else { + assert(value.len == sizeof(struct rtt_state)); + state = *(struct rtt_state *)value.data; + } + + free(prefixed_ip); + return state; +} + +int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state, + struct kr_cache *cache) +{ + knot_db_t *db = cache->db; + struct kr_cdb_stats *stats = &cache->stats; + uint8_t *prefixed_ip = prefix_key(ip, len); + + knot_db_val_t key = { .len = len + 1, .data = prefixed_ip }; + knot_db_val_t value = { .len = sizeof(struct rtt_state), + .data = &state }; + + int ret = cache->api->write(db, stats, &key, &value, 1); + cache->api->commit(db, stats); + + free(prefixed_ip); + return ret; +} + +void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst) +{ + switch (len) { + case sizeof(struct in_addr): + ADDR_SET(dst->ip4.sin, AF_INET, bytes, len, 0); + break; + case sizeof(struct in6_addr): + ADDR_SET(dst->ip6.sin6, AF_INET6, bytes, len, 0); + break; + default: + assert(0); + } +} + +uint8_t *ip_to_bytes(const union inaddr *src, size_t len) +{ + switch (len) { + case sizeof(struct in_addr): + return (uint8_t *)&src->ip4.sin_addr; + case sizeof(struct in6_addr): + return (uint8_t *)&src->ip6.sin6_addr; + default: + assert(0); + } +} + +static bool no_rtt_info(struct rtt_state s) +{ + return s.srtt == 0 && s.consecutive_timeouts == 0; +} + +static unsigned back_off_timeout(uint32_t to, int pow) +{ + if (pow > MAX_BACKOFF) { + to *= 1 << MAX_BACKOFF; + } else { + to *= (1 << pow); + } + if (to > MAX_TIMEOUT) { + to = MAX_TIMEOUT; + } + return to; +} + +/* This is verbatim (minus the default timeout 
value and minimal variance) + * RFC6298, sec. 2. */ +static unsigned calc_timeout(struct rtt_state state) +{ + int32_t timeout = + state.srtt + MAX(4 * state.variance, MINIMAL_TIMEOUT_ADDITION); + return back_off_timeout(timeout, state.consecutive_timeouts); +} + +/* This is verbatim RFC6298, sec. 2. */ +static struct rtt_state calc_rtt_state(struct rtt_state old, unsigned new_rtt) +{ + if (no_rtt_info(old)) { + return (struct rtt_state){ new_rtt, new_rtt / 2, 0 }; + } + + struct rtt_state ret; + + ret.srtt = (int32_t)(0.75 * old.srtt + 0.25 * new_rtt); + ret.variance = (int32_t)(0.875 * old.variance + + 0.125 * abs(old.srtt - (int32_t)new_rtt)); + ret.consecutive_timeouts = 0; + + return ret; +} + +/** + * @internal Invalidate addresses which should be considered dead + */ +static void invalidate_dead_upstream(struct address_state *state, + unsigned int retry_timeout) +{ + if (kr_now() - state->rtt_state.dead_since < retry_timeout) { + state->generation = -1; + } +} + +/** + * @internal Check if IP address is TLS capable. + * + * @p req has to have the selection_context properly initialized. + */ +static void check_tls_capable(struct address_state *address_state, + struct kr_request *req, struct sockaddr *address) +{ + address_state->tls_capable = + req->selection_context.is_tls_capable ? + req->selection_context.is_tls_capable(address) : + false; +} + +#if 0 +/* TODO: uncomment these once we actually use the information it collects. */ +/** + * Check if there is an existing TCP connection to this address. + * + * @p req has to have the selection_context properly initialized. + */ +void check_tcp_connections(struct address_state *address_state, struct kr_request *req, struct sockaddr *address) { + address_state->tcp_connected = req->selection_context.is_tcp_connected ? req->selection_context.is_tcp_connected(address) : false; + address_state->tcp_waiting = req->selection_context.is_tcp_waiting ? 
req->selection_context.is_tcp_waiting(address) : false; +} +#endif + +/** + * @internal Invalidate address if the respective IP version is disabled. + */ +static void check_network_settings(struct address_state *address_state, + size_t address_len, bool no_ipv4, bool no_ipv6) +{ + if (no_ipv4 && address_len == sizeof(struct in_addr)) { + address_state->generation = -1; + } + if (no_ipv6 && address_len == sizeof(struct in6_addr)) { + address_state->generation = -1; + } +} + +void update_address_state(struct address_state *state, uint8_t *address, + size_t address_len, struct kr_query *qry) +{ + union inaddr tmp_address; + bytes_to_ip(address, address_len, &tmp_address); + check_tls_capable(state, qry->request, &tmp_address.ip); + /* TODO: uncomment this once we actually use the information it collects + check_tcp_connections(address_state, qry->request, &tmp_address.ip); + */ + check_network_settings(state, address_len, qry->flags.NO_IPV4, + qry->flags.NO_IPV6); + state->rtt_state = + get_rtt_state(address, address_len, &qry->request->ctx->cache); + invalidate_dead_upstream( + state, qry->request->ctx->cache_rtt_tout_retry_interval); +#ifdef SELECTION_CHOICE_LOGGING + // This is sometimes useful for debugging, but usually too verbose + WITH_VERBOSE(qry) + { + const char *ns_str = kr_straddr(&tmp_address.ip); + VERBOSE_MSG(qry, "rtt of %s is %d, variance is %d\n", ns_str, + state->rtt_state.srtt, state->rtt_state.variance); + } +#endif +} + +static int cmp_choices(const void *a, const void *b) +{ + struct choice *a_ = (struct choice *)a; + struct choice *b_ = (struct choice *)b; + + int diff; + /* Address with no RTT information is better than address + * with some information. */ + if ((diff = no_rtt_info(b_->address_state->rtt_state) - + no_rtt_info(a_->address_state->rtt_state))) { + return diff; + } + /* Address with less errors is better. 
*/ + if ((diff = a_->address_state->error_count - + b_->address_state->error_count)) { + return diff; + } + /* Address with smaller expected timeout is better. */ + if ((diff = calc_timeout(a_->address_state->rtt_state) - + calc_timeout(b_->address_state->rtt_state))) { + return diff; + } + return 0; +} + +/* Fisher-Yates shuffle of the choices */ +static void shuffle_choices(struct choice choices[], int choices_len) +{ + struct choice tmp; + for (int i = choices_len - 1; i > 0; i--) { + int j = kr_rand_bytes(1) % (i + 1); + tmp = choices[i]; + choices[i] = choices[j]; + choices[j] = tmp; + } +} + +/* Performs the actual selection (currently variation on epsilon-greedy). */ +struct kr_transport *select_transport(struct choice choices[], int choices_len, + struct to_resolve unresolved[], + int unresolved_len, int timeouts, + struct knot_mm *mempool, bool tcp, + size_t *choice_index) +{ + if (!choices_len && !unresolved_len) { + /* There is nothing to choose from */ + return NULL; + } + + struct kr_transport *transport = + mm_alloc(mempool, sizeof(struct kr_transport)); + memset(transport, 0, sizeof(struct kr_transport)); + + int choice = 0; + if (kr_rand_coin(EPSILON_NOMIN, EPSILON_DENOM) || choices_len == 0) { + /* "EXPLORE": + * randomly choose some option + * (including resolution of some new name). */ + int index = kr_rand_bytes(1) % (choices_len + unresolved_len); + if (index < unresolved_len) { + // We will resolve a new NS name + *transport = (struct kr_transport){ + .protocol = unresolved[index].type, + .ns_name = unresolved[index].name + }; + return transport; + } else { + choice = index - unresolved_len; + } + } else { + /* "EXPLOIT": + * choose a resolved address which seems best right now. */ + shuffle_choices(choices, choices_len); + /* If there are some addresses with no rtt_info we try them + * first (see cmp_choices). So unknown servers are chosen + * *before* the best know server. 
This ensures that every option + * is tried before going back to some that was tried before. */ + qsort(choices, choices_len, sizeof(struct choice), cmp_choices); + choice = 0; + } + + struct choice *chosen = &choices[choice]; + + /* Don't try the same server again when there are other choices to be explored */ + if (chosen->address_state->error_count && unresolved_len) { + int index = kr_rand_bytes(1) % unresolved_len; + *transport = (struct kr_transport){ + .ns_name = unresolved[index].name, + .protocol = unresolved[index].type, + }; + return transport; + } + + unsigned timeout; + if (no_rtt_info(chosen->address_state->rtt_state)) { + /* Exponential back-off when retrying after timeout and choosing + * an unknown server. */ + timeout = back_off_timeout(DEFAULT_TIMEOUT, timeouts); + } else { + timeout = calc_timeout(chosen->address_state->rtt_state); + } + + enum kr_transport_protocol protocol; + if (chosen->address_state->tls_capable) { + protocol = KR_TRANSPORT_TLS; + } else if (tcp || + chosen->address_state->errors[KR_SELECTION_QUERY_TIMEOUT] >= TCP_TIMEOUT_THRESHOLD || + timeout > TCP_RTT_THRESHOLD) { + protocol = KR_TRANSPORT_TCP; + } else { + protocol = KR_TRANSPORT_UDP; + } + + *transport = (struct kr_transport){ + .ns_name = chosen->address_state->ns_name, + .protocol = protocol, + .timeout = timeout, + .safe_mode = + chosen->address_state->errors[KR_SELECTION_FORMERROR], + }; + + int port; + if (!(port = chosen->port)) { + switch (transport->protocol) { + case KR_TRANSPORT_TLS: + port = KR_DNS_TLS_PORT; + break; + case KR_TRANSPORT_UDP: + case KR_TRANSPORT_TCP: + port = KR_DNS_PORT; + break; + default: + assert(0); + break; + } + } + + switch (chosen->address_len) { + case sizeof(struct in_addr): + ADDR_SET(transport->address.ip4.sin, AF_INET, chosen->address, + chosen->address_len, port); + transport->address_len = chosen->address_len; + break; + case sizeof(struct in6_addr): + ADDR_SET(transport->address.ip6.sin6, AF_INET6, chosen->address, + 
chosen->address_len, port); + transport->address_len = chosen->address_len; + break; + default: + assert(0); + break; + } + + if (choice_index) { + *choice_index = chosen->address_state->choice_array_index; + } + + return transport; +} + +void update_rtt(struct kr_query *qry, struct address_state *addr_state, + const struct kr_transport *transport, unsigned rtt) +{ + if (!transport || !addr_state) { + /* Answers from cache have NULL transport, ignore them. */ + return; + } + + struct kr_cache *cache = &qry->request->ctx->cache; + + uint8_t *address = + ip_to_bytes(&transport->address, transport->address_len); + /* This construct is a bit racy since the global state may change + * between calls to `get_rtt_state` and `put_rtt_state` but we don't + * care that much since it is rare and we only risk slightly suboptimal + * transport choice. */ + struct rtt_state cur_rtt_state = + get_rtt_state(address, transport->address_len, cache); + struct rtt_state new_rtt_state = calc_rtt_state(cur_rtt_state, rtt); + put_rtt_state(address, transport->address_len, new_rtt_state, cache); + + WITH_VERBOSE(qry) + { + KR_DNAME_GET_STR(ns_name, transport->ns_name); + KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name); + const char *ns_str = kr_straddr(&transport->address.ip); + + VERBOSE_MSG( + qry, + "=> id: '%05u' updating: '%s'@'%s' zone cut: '%s' with rtt %u to srtt: %d and variance: %d \n", + qry->id, ns_name, ns_str ? ns_str : "", zonecut_str, + rtt, new_rtt_state.srtt, new_rtt_state.variance); + } +} + +static void cache_timeout(const struct kr_transport *transport, + struct address_state *addr_state, struct kr_cache *cache) +{ + if (transport->deduplicated) { + /* Transport was chosen by a different query, that one will + * cache the result. 
*/ + return; + } + + uint8_t *address = + ip_to_bytes(&transport->address, transport->address_len); + struct rtt_state old_state = addr_state->rtt_state; + struct rtt_state cur_state = + get_rtt_state(address, transport->address_len, cache); + + /* We could lose some update from some other process by doing this, + * but at least timeout count can't blow up. */ + if (cur_state.consecutive_timeouts == old_state.consecutive_timeouts) { + if (++cur_state.consecutive_timeouts >= + KR_NS_TIMEOUT_ROW_DEAD) { + cur_state.dead_since = kr_now(); + } + put_rtt_state(address, transport->address_len, cur_state, + cache); + } else { + /* `get_rtt_state` opens a cache transaction, we have to end it. */ + kr_cache_commit(cache); + } +} + +void error(struct kr_query *qry, struct address_state *addr_state, + const struct kr_transport *transport, + enum kr_selection_error sel_error) +{ + if (!transport || !addr_state) { + /* Answers from cache have NULL transport, ignore them. */ + return; + } + + if (sel_error >= KR_SELECTION_NUMBER_OF_ERRORS) { + assert(0); + } + + if (sel_error == KR_SELECTION_QUERY_TIMEOUT) { + qry->server_selection.local_state->timeouts++; + // Make sure the query was chosen by this query + if (!transport->deduplicated) { + cache_timeout(transport, addr_state, + &qry->request->ctx->cache); + } + } + + if (sel_error == KR_SELECTION_TRUNCATED && + transport->protocol == KR_TRANSPORT_UDP) { + /* Don't punish the server that told us to switch to TCP. */ + qry->server_selection.local_state->truncated = true; + } else { + if (sel_error == KR_SELECTION_TRUNCATED) { + /* TRUNCATED over TCP/TLS, upstream is broken. 
*/ + addr_state->unrecoverable_errors++; + } + + if (UNRECOVERABLE_ERRORS[sel_error]) { + addr_state->unrecoverable_errors++; + } + + if (sel_error == KR_SELECTION_FORMERROR && transport->safe_mode) { + addr_state->unrecoverable_errors++; + } + + addr_state->errors[sel_error]++; + addr_state->error_count++; + } + + WITH_VERBOSE(qry) + { + KR_DNAME_GET_STR(ns_name, transport->ns_name); + KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name); + const char *ns_str = kr_straddr(&transport->address.ip); + + VERBOSE_MSG( + qry, + "=> id: '%05u' noting selection error: '%s'@'%s' zone cut: '%s' error no.:%d\n", + qry->id, ns_name, ns_str ? ns_str : "", zonecut_str, + sel_error); + } +} + +void kr_server_selection_init(struct kr_query *qry) +{ + struct knot_mm *mempool = &qry->request->pool; + if (qry->flags.FORWARD || qry->flags.STUB) { + qry->server_selection = (struct kr_server_selection){ + .initialized = true, + .choose_transport = forward_choose_transport, + .update_rtt = forward_update_rtt, + .error = forward_error, + .local_state = + mm_alloc(mempool, sizeof(struct local_state)), + }; + memset(qry->server_selection.local_state, 0, + sizeof(struct local_state)); + forward_local_state_alloc( + mempool, &qry->server_selection.local_state->private, + qry->request); + } else { + qry->server_selection = (struct kr_server_selection){ + .initialized = true, + .choose_transport = iter_choose_transport, + .update_rtt = iter_update_rtt, + .error = iter_error, + .local_state = + mm_alloc(mempool, sizeof(struct local_state)), + }; + memset(qry->server_selection.local_state, 0, + sizeof(struct local_state)); + iter_local_state_alloc( + mempool, &qry->server_selection.local_state->private); + } +} + +int kr_forward_add_target(struct kr_request *req, const struct sockaddr *sock) +{ + if (!req->selection_context.forwarding_targets.at) { + return kr_error(EINVAL); + } + + union inaddr address; + + switch (sock->sa_family) { + case AF_INET: + if (req->options.NO_IPV4) + return 
kr_error(EINVAL); + address.ip4 = *(const struct sockaddr_in *)sock; + break; + case AF_INET6: + if (req->options.NO_IPV6) + return kr_error(EINVAL); + address.ip6 = *(const struct sockaddr_in6 *)sock; + break; + default: + return kr_error(EINVAL); + } + + array_push_mm(req->selection_context.forwarding_targets, address, + kr_memreserve, &req->pool); + return kr_ok(); +} diff --git a/lib/selection.h b/lib/selection.h new file mode 100644 index 000000000..f8e2730e5 --- /dev/null +++ b/lib/selection.h @@ -0,0 +1,233 @@ +/* Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#pragma once + +/** + * @file selection.h + * Provides server selection API (see `kr_server_selection`) and functions common to both implementations. + */ + +#include "lib/cache/api.h" + +/* After KR_NS_TIMEOUT_ROW_DEAD consecutive timeouts, we consider the upstream IP dead for KR_NS_TIMEOUT_RETRY_INTERVAL ms */ +#define KR_NS_TIMEOUT_ROW_DEAD 4 +#define KR_NS_TIMEOUT_RETRY_INTERVAL 1000 + +/** + * These errors are to be reported as feedback to server selection. + * See `kr_server_selection::error` for more details. + */ +enum kr_selection_error { + KR_SELECTION_OK = 0, + + // Network errors + KR_SELECTION_QUERY_TIMEOUT, + KR_SELECTION_TLS_HANDSHAKE_FAILED, + KR_SELECTION_TCP_CONNECT_FAILED, + KR_SELECTION_TCP_CONNECT_TIMEOUT, + + // RCODEs + KR_SELECTION_REFUSED, + KR_SELECTION_SERVFAIL, + KR_SELECTION_FORMERROR, + KR_SELECTION_NOTIMPL, + KR_SELECTION_OTHER_RCODE, + + // DNS errors + KR_SELECTION_TRUNCATED, + KR_SELECTION_DNSSEC_ERROR, + KR_SELECTION_LAME_DELEGATION, + /** Too long chain, or cycle. */ + KR_SELECTION_BAD_CNAME, + + /** Leave this last, as it is used as array size. */ + KR_SELECTION_NUMBER_OF_ERRORS +}; + +enum kr_transport_protocol { + /** Selected name with no IPv4 address, it has to be resolved first. */ + KR_TRANSPORT_RESOLVE_A, + /** Selected name with no IPv6 address, it has to be resolved first. 
*/ + KR_TRANSPORT_RESOLVE_AAAA, + KR_TRANSPORT_UDP, + KR_TRANSPORT_TCP, + KR_TRANSPORT_TLS, +}; + +/** + * Output of the selection algorithm. + */ +struct kr_transport { + knot_dname_t *ns_name; /**< Set to "." for forwarding targets.*/ + union inaddr address; + size_t address_len; + enum kr_transport_protocol protocol; + unsigned timeout; /**< Timeout in ms to be set for UDP transmission. */ + /** True iff transport was set in worker.c:subreq_finalize, + * that means it may be different from the originally chosen one.*/ + bool deduplicated; + bool safe_mode; /**< Turn on SAFEMODE for this transport */ +}; + +struct local_state { + int timeouts; /**< Number of timeouts that occurred while resolving this query.*/ + bool truncated; /**< Query was truncated, switch to TCP. */ + void *private; /**< Inner state of the implementation.*/ +}; + +/** + * Specifies an API for selecting transports and giving feedback on the choices. + * + * The function pointers are to be used throughout resolver when some information about + * the transport is obtained. E.g. RTT in `worker.c` or RCODE in `iterate.c`,… + */ +struct kr_server_selection { + bool initialized; + /** + * Puts a pointer to next transport of @p qry to @p transport . + * + * Allocates new kr_transport in request's mempool, chooses transport to be used for this query. + * Selection may fail, so @p transport can be set to NULL. + * + * @param transport to be filled with pointer to the chosen transport or NULL on failure + */ + void (*choose_transport)(struct kr_query *qry, + struct kr_transport **transport); + /** Report back the RTT of network operation for transport in ms. */ + void (*update_rtt)(struct kr_query *qry, + const struct kr_transport *transport, unsigned rtt); + /** Report back error encountered with the chosen transport. 
See `enum kr_selection_error` */ + void (*error)(struct kr_query *qry, + const struct kr_transport *transport, + enum kr_selection_error error); + + struct local_state *local_state; +}; + +/** + * @brief Initialize the server selection API for @p qry. + * + * The implementation is to be chosen based on qry->flags. + */ +KR_EXPORT +void kr_server_selection_init(struct kr_query *qry); + +/** + * @brief Add forwarding target to request. + * + * This is exposed to Lua in order to add forwarding targets to request. + * These are then shared by all the queries in said request. + */ +KR_EXPORT +int kr_forward_add_target(struct kr_request *req, const struct sockaddr *sock); + +/** + * To be held per IP address in the global LMDB cache + */ +struct rtt_state { + int32_t srtt; + int32_t variance; + int32_t consecutive_timeouts; + /** Timestamp of pronouncing this IP bad based on KR_NS_TIMEOUT_ROW_DEAD */ + uint64_t dead_since; +}; + +/** + * @brief To be held per IP address and locally "inside" query. + */ +struct address_state { + /** Used to distinguish old and valid records in local_state. */ + unsigned int generation; + struct rtt_state rtt_state; + knot_dname_t *ns_name; + bool tls_capable : 1; + /* TODO: uncomment these once we actually use this information in selection + bool tcp_waiting : 1; + bool tcp_connected : 1; + */ + int choice_array_index; + int error_count; + int unrecoverable_errors; + int errors[KR_SELECTION_NUMBER_OF_ERRORS]; +}; + +/** + * @brief Array of these is one of inputs for the actual selection algorithm (`select_transport`) + */ +struct choice { + uint8_t *address; + size_t address_len; + struct address_state *address_state; + /** used to overwrite the port number; + * if zero, `select_transport` determines it. */ + uint16_t port; +}; + +/** + * @brief Array of these is description of names to be resolved (i.e. 
name without some address) + */ +struct to_resolve { + knot_dname_t *name; + /** Either KR_TRANSPORT_RESOLVE_A or KR_TRANSPORT_RESOLVE_AAAA is valid here. */ + enum kr_transport_protocol type; +}; + +/** + * @brief Based on passed choices, choose the next transport. + * + * Common function to both implementations (iteration and forwarding). + * The `*_choose_transport` functions from `selection_*.h` preprocess the input for this one. + * + * @param choices Options to choose from, see struct above + * @param unresolved Array of names that can be resolved (i.e. no A/AAAA record) + * @param timeouts Number of timeouts that occurred in this query (used for exponential backoff) + * @param mempool Memory context of current request + * @param tcp Force TCP as transport protocol + * @param[out] choice_index Optionally, the index of the chosen transport in the @p choices array is stored here. + * @return Chosen transport or NULL when no choice is viable + */ +struct kr_transport *select_transport(struct choice choices[], int choices_len, + struct to_resolve unresolved[], + int unresolved_len, int timeouts, + struct knot_mm *mempool, bool tcp, + size_t *choice_index); + +/** + * Common part of RTT feedback mechanism. Notes RTT to global cache. + */ +void update_rtt(struct kr_query *qry, struct address_state *addr_state, + const struct kr_transport *transport, unsigned rtt); + +/** + * Common part of error feedback mechanism. + */ +void error(struct kr_query *qry, struct address_state *addr_state, + const struct kr_transport *transport, + enum kr_selection_error sel_error); + +/** + * Get RTT state from cache. Returns `default_rtt_state` on unknown addresses. + */ +struct rtt_state get_rtt_state(const uint8_t *ip, size_t len, + struct kr_cache *cache); + +int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state, + struct kr_cache *cache); + +/** + * @internal Helper function for conversion between different IP representations. 
+ */ +void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst); + +/** + * @internal Helper function for conversion between different IP representations. + */ +uint8_t *ip_to_bytes(const union inaddr *src, size_t len); + +/** + * @internal Fetch per-address information from various sources. + */ +void update_address_state(struct address_state *state, uint8_t *address, + size_t address_len, struct kr_query *qry); diff --git a/lib/selection_forward.c b/lib/selection_forward.c new file mode 100644 index 000000000..2f85bcd80 --- /dev/null +++ b/lib/selection_forward.c @@ -0,0 +1,129 @@ +/* Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#include "lib/selection_forward.h" +#include "lib/resolve.h" + +#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__) + +#define FORWARDING_TIMEOUT 2000 + +struct forward_local_state { + inaddr_array_t *targets; + struct address_state *addr_states; + /** Index of last choice in the targets array, used for error reporting. 
*/ + size_t last_choice_index; +}; + +void forward_local_state_alloc(struct knot_mm *mm, void **local_state, + struct kr_request *req) +{ + assert(req->selection_context.forwarding_targets.at); + *local_state = mm_alloc(mm, sizeof(struct forward_local_state)); + memset(*local_state, 0, sizeof(struct forward_local_state)); + + struct forward_local_state *forward_state = + (struct forward_local_state *)*local_state; + forward_state->targets = &req->selection_context.forwarding_targets; + + forward_state->addr_states = mm_alloc( + mm, sizeof(struct address_state) * forward_state->targets->len); + memset(forward_state->addr_states, 0, + sizeof(struct address_state) * forward_state->targets->len); +} + +void forward_choose_transport(struct kr_query *qry, + struct kr_transport **transport) +{ + struct forward_local_state *local_state = + qry->server_selection.local_state->private; + struct choice choices[local_state->targets->len]; + int valid = 0; + + for (int i = 0; i < local_state->targets->len; i++) { + union inaddr *address = &local_state->targets->at[i]; + size_t addr_len; + uint16_t port; + switch (address->ip.sa_family) { + case AF_INET: + port = ntohs(address->ip4.sin_port); + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + port = ntohs(address->ip6.sin6_port); + addr_len = sizeof(struct in6_addr); + break; + default: + assert(0); + } + + struct address_state *addr_state = &local_state->addr_states[i]; + addr_state->ns_name = (knot_dname_t *)""; + + update_address_state(addr_state, ip_to_bytes(address, addr_len), + addr_len, qry); + + if (addr_state->generation == -1) { + continue; + } + addr_state->choice_array_index = i; + + choices[valid++] = (struct choice){ + .address = ip_to_bytes(address, addr_len), + .address_len = addr_len, + .address_state = addr_state, + .port = port, + }; + } + + bool tcp = + qry->flags.TCP | qry->server_selection.local_state->truncated; + *transport = + select_transport(choices, valid, NULL, 0, + 
qry->server_selection.local_state->timeouts, + &qry->request->pool, tcp, + &local_state->last_choice_index); + if (*transport) { + /* Set static timeout for forwarding; there is no point in this + * being dynamic since the RTT of a packet to forwarding target + * says nothing about the network RTT of said target, since + * it is doing resolution upstream. */ + (*transport)->timeout = FORWARDING_TIMEOUT; + /* We need to propagate this to flags since it's used in other + * parts of the resolver (e.g. logging and stats). */ + qry->flags.TCP = tcp; + } +} + +void forward_error(struct kr_query *qry, const struct kr_transport *transport, + enum kr_selection_error sel_error) +{ + if (!qry->server_selection.initialized) { + return; + } + struct forward_local_state *local_state = + qry->server_selection.local_state->private; + struct address_state *addr_state = + &local_state->addr_states[local_state->last_choice_index]; + error(qry, addr_state, transport, sel_error); +} + +void forward_update_rtt(struct kr_query *qry, + const struct kr_transport *transport, unsigned rtt) +{ + if (!qry->server_selection.initialized) { + return; + } + + if (!transport) { + return; + } + + struct forward_local_state *local_state = + qry->server_selection.local_state->private; + struct address_state *addr_state = + &local_state->addr_states[local_state->last_choice_index]; + + update_rtt(qry, addr_state, transport, rtt); +} \ No newline at end of file diff --git a/lib/selection_forward.h b/lib/selection_forward.h new file mode 100644 index 000000000..e66274ffc --- /dev/null +++ b/lib/selection_forward.h @@ -0,0 +1,17 @@ +/* Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. 
+ * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#pragma once + +#include "lib/selection.h" +#include "lib/resolve.h" + +void forward_local_state_alloc(struct knot_mm *mm, void **local_state, + struct kr_request *req); +void forward_choose_transport(struct kr_query *qry, + struct kr_transport **transport); +void forward_error(struct kr_query *qry, const struct kr_transport *transport, + enum kr_selection_error sel_error); +void forward_update_rtt(struct kr_query *qry, + const struct kr_transport *transport, unsigned rtt); \ No newline at end of file diff --git a/lib/selection_iter.c b/lib/selection_iter.c new file mode 100644 index 000000000..24fd3ce9b --- /dev/null +++ b/lib/selection_iter.c @@ -0,0 +1,369 @@ +/* Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#include "lib/selection_iter.h" +#include "lib/selection.h" + +#include "lib/generic/trie.h" +#include "lib/generic/pack.h" +#include "lib/zonecut.h" +#include "lib/resolve.h" + +#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__) + +// To be held per query and locally +struct iter_local_state { + trie_t *names; + trie_t *addresses; + knot_dname_t *zonecut; + /** Used to distinguish old and valid records in tries. 
*/ + unsigned int generation; + enum kr_selection_error last_error; + unsigned int no_ns_addr_count; +}; + +enum record_state { RECORD_UNKNOWN, RECORD_RESOLVED, RECORD_TRIED }; + +// To be held per NS name and locally +struct iter_name_state { + unsigned int generation; + enum record_state a_state; + enum record_state aaaa_state; +}; + +void iter_local_state_alloc(struct knot_mm *mm, void **local_state) +{ + *local_state = mm_alloc(mm, sizeof(struct iter_local_state)); + memset(*local_state, 0, sizeof(struct iter_local_state)); +} + +static struct address_state *get_address_state(struct iter_local_state *local_state, + const struct kr_transport *transport) +{ + if (!transport) { + return NULL; + } + + trie_t *addresses = local_state->addresses; + uint8_t *address = + ip_to_bytes(&transport->address, transport->address_len); + + trie_val_t *address_state = trie_get_try(addresses, (char *)address, + transport->address_len); + + if (!address_state) { + if (transport->deduplicated) { + /* Transport was chosen by a different query. */ + return NULL; + } + + assert(0); + } + return (struct address_state *)*address_state; +} + +static bool zonecut_changed(knot_dname_t *new, knot_dname_t *old) +{ + return knot_dname_cmp(old, new); +} + +static void unpack_state_from_zonecut(struct iter_local_state *local_state, + struct kr_query *qry) +{ + struct kr_zonecut *zonecut = &qry->zone_cut; + struct knot_mm *mm = &qry->request->pool; + + bool zcut_changed = false; + if (local_state->names == NULL || local_state->addresses == NULL) { + /* Local state initialization. 
*/ + memset(local_state, 0, sizeof(struct iter_local_state)); + local_state->names = trie_create(mm); + local_state->addresses = trie_create(mm); + } else { + zcut_changed = zonecut_changed(zonecut->name, local_state->zonecut); + } + local_state->zonecut = zonecut->name; + local_state->generation++; + + if (zcut_changed) { + local_state->no_ns_addr_count = 0; + } + + trie_it_t *it; + unsigned int current_generation = local_state->generation; + + for (it = trie_it_begin(zonecut->nsset); !trie_it_finished(it); trie_it_next(it)) { + knot_dname_t *dname = (knot_dname_t *)trie_it_key(it, NULL); + pack_t *addresses = (pack_t *)*trie_it_val(it); + + trie_val_t *val = trie_get_ins(local_state->names, (char *)dname, + knot_dname_size(dname)); + if (!*val) { + /* We encountered this name for the first time. */ + *val = mm_alloc(mm, sizeof(struct iter_name_state)); + memset(*val, 0, sizeof(struct iter_name_state)); + } + struct iter_name_state *name_state = *(struct iter_name_state **)val; + name_state->generation = current_generation; + + if (zcut_changed) { + /* Set name as unresolved as they might have fallen out + * of cache (TTL expired). */ + name_state->a_state = RECORD_UNKNOWN; + name_state->aaaa_state = RECORD_UNKNOWN; + } + + if (addresses->len == 0) { + continue; + } + + /* We have some addresses to work with, let's iterate over them. */ + for (uint8_t *obj = pack_head(*addresses); obj != pack_tail(*addresses); + obj = pack_obj_next(obj)) { + uint8_t *address = pack_obj_val(obj); + size_t address_len = pack_obj_len(obj); + trie_val_t *tval = trie_get_ins(local_state->addresses, + (char *)address, + address_len); + if (!*tval) { + /* We have have not seen this address before. 
*/ + *tval = mm_alloc(mm, sizeof(struct address_state)); + memset(*tval, 0, sizeof(struct address_state)); + } + struct address_state *address_state = (*(struct address_state **)tval); + address_state->generation = current_generation; + address_state->ns_name = dname; + + if (address_len == sizeof(struct in_addr)) { + name_state->a_state = RECORD_RESOLVED; + } else if (address_len == sizeof(struct in6_addr)) { + name_state->aaaa_state = RECORD_RESOLVED; + } + update_address_state(address_state, address, address_len, qry); + } + } + trie_it_free(it); +} + +static int get_valid_addresses(struct iter_local_state *local_state, + struct choice choices[]) +{ + unsigned count = 0; + trie_it_t *it; + for (it = trie_it_begin(local_state->addresses); !trie_it_finished(it); + trie_it_next(it)) { + size_t address_len; + uint8_t *address = (uint8_t *)trie_it_key(it, &address_len); + struct address_state *address_state = + (struct address_state *)*trie_it_val(it); + if (address_state->generation == local_state->generation && + !address_state->unrecoverable_errors) { + choices[count] = (struct choice){ + .address = address, + .address_len = address_len, + .address_state = address_state, + }; + count++; + } + } + trie_it_free(it); + return count; +} + +static int get_resolvable_names(struct iter_local_state *local_state, + struct to_resolve resolvable[], struct kr_query *qry) +{ + /* Further resolution is not possible until we get `. DNSKEY` record; + * we have to choose one of the known addresses here. 
*/ + if (qry->sname[0] == '\0' && qry->stype == KNOT_RRTYPE_DNSKEY) { + return 0; + } + + unsigned count = 0; + trie_it_t *it; + for (it = trie_it_begin(local_state->names); !trie_it_finished(it); + trie_it_next(it)) { + struct iter_name_state *name_state = + *(struct iter_name_state **)trie_it_val(it); + if (name_state->generation == local_state->generation) { + knot_dname_t *name = (knot_dname_t *)trie_it_key(it, NULL); + if (qry->stype == KNOT_RRTYPE_DNSKEY && + knot_dname_in_bailiwick(name, qry->sname) > 0) { + /* Resolving `domain. DNSKEY` can't trigger the + * resolution of `sub.domain. A/AAAA` since it + * will cause a cycle. */ + continue; + } + + /* FIXME: kr_rplan_satisfies(qry,…) should have been here, but this leads to failures on + * iter_ns_badip.rpl, this is because the test requires the resolver to switch to parent + * side after a record in cache expires. Only way to do this in the current zonecut setup is + * to requery the same query twice in the row. So we have to allow that and only check the + * rplan from parent upwards. 
+ */ + bool a_in_rplan = kr_rplan_satisfies(qry->parent, name, + KNOT_CLASS_IN, + KNOT_RRTYPE_A); + bool aaaa_in_rplan = + kr_rplan_satisfies(qry->parent, name, + KNOT_CLASS_IN, + KNOT_RRTYPE_AAAA); + + if (name_state->a_state == RECORD_UNKNOWN && + !qry->flags.NO_IPV4 && !a_in_rplan) { + resolvable[count++] = (struct to_resolve){ + name, KR_TRANSPORT_RESOLVE_A + }; + } + + if (name_state->aaaa_state == RECORD_UNKNOWN && + !qry->flags.NO_IPV6 && !aaaa_in_rplan) { + resolvable[count++] = (struct to_resolve){ + name, KR_TRANSPORT_RESOLVE_AAAA + }; + } + } + } + trie_it_free(it); + return count; +} + +static void update_name_state(knot_dname_t *name, enum kr_transport_protocol type, + trie_t *names) +{ + size_t name_len = knot_dname_size(name); + trie_val_t *val = trie_get_try(names, (char *)name, name_len); + + if (!val) { + return; + } + + struct iter_name_state *name_state = (struct iter_name_state *)*val; + switch (type) { + case KR_TRANSPORT_RESOLVE_A: + name_state->a_state = RECORD_TRIED; + break; + case KR_TRANSPORT_RESOLVE_AAAA: + name_state->aaaa_state = RECORD_TRIED; + break; + default: + assert(0); + } +} + +void iter_choose_transport(struct kr_query *qry, + struct kr_transport **transport) +{ + struct knot_mm *mempool = &qry->request->pool; + struct iter_local_state *local_state = + (struct iter_local_state *) + qry->server_selection.local_state->private; + + unpack_state_from_zonecut(local_state, qry); + + struct choice choices[trie_weight(local_state->addresses)]; + /* We may try to resolve A and AAAA record for each name, so therefore + * 2*trie_weight(…) is here. 
*/ + struct to_resolve resolvable[2 * trie_weight(local_state->names)]; + + // Filter valid addresses and names from the tries + int choices_len = get_valid_addresses(local_state, choices); + int resolvable_len = get_resolvable_names(local_state, resolvable, qry); + + if (choices_len || resolvable_len) { + bool tcp = qry->flags.TCP | + qry->server_selection.local_state->truncated; + *transport = select_transport( + choices, choices_len, resolvable, resolvable_len, + qry->server_selection.local_state->timeouts, mempool, + tcp, NULL); + if (*transport) { + switch ((*transport)->protocol) { + case KR_TRANSPORT_RESOLVE_A: + case KR_TRANSPORT_RESOLVE_AAAA: + /* Note that we tried resolving this name to not try it again. */ + update_name_state((*transport)->ns_name, + (*transport)->protocol, + local_state->names); + break; + case KR_TRANSPORT_TLS: + case KR_TRANSPORT_TCP: + /* We need to propagate this to flags since it's used in + * other parts of the resolver. */ + qry->flags.TCP = true; + break; + default: + break; + } + } + } else { + *transport = NULL; + /* Last selected server had broken DNSSEC and now we have no more + * servers to ask. We signal this to the rest of resolver by + * setting DNSSEC_BOGUS flag. */ + if (local_state->last_error == KR_SELECTION_DNSSEC_ERROR) { + qry->flags.DNSSEC_BOGUS = true; + } + } + + bool nxnsattack_mitigation = false; + enum kr_transport_protocol proto = + *transport ? 
(*transport)->protocol : -1; + if (proto == KR_TRANSPORT_RESOLVE_A || proto == KR_TRANSPORT_RESOLVE_AAAA) { + if (++local_state->no_ns_addr_count > KR_COUNT_NO_NSADDR_LIMIT) { + *transport = NULL; + nxnsattack_mitigation = true; + } + } + + WITH_VERBOSE(qry) + { + KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name); + if (*transport) { + KR_DNAME_GET_STR(ns_name, (*transport)->ns_name); + const char *ns_str = kr_straddr(&(*transport)->address.ip); + const char *ip_version; + switch (proto) + { + case KR_TRANSPORT_RESOLVE_A: + case KR_TRANSPORT_RESOLVE_AAAA: + ip_version = (proto == KR_TRANSPORT_RESOLVE_A) ? "A" : "AAAA"; + VERBOSE_MSG(qry, "=> id: '%05u' choosing to resolve %s: '%s' zone cut: '%s'\n", + qry->id, ip_version, ns_name, zonecut_str); + break; + default: + VERBOSE_MSG(qry, "=> id: '%05u' choosing: '%s'@'%s' with timeout %u ms zone cut: '%s'%s\n", + qry->id, ns_name, ns_str ? ns_str : "", (*transport)->timeout, zonecut_str, + (*transport)->safe_mode ? " SAFEMODE" : ""); + break; + } + } else { + VERBOSE_MSG(qry, "=> id: '%05u' no suitable transport, zone cut: '%s'%s\n", + qry->id, zonecut_str, nxnsattack_mitigation ? 
" (stopped due to mitigation for NXNSAttack CVE-2020-12667)" : ""); + } + } +} + +void iter_error(struct kr_query *qry, const struct kr_transport *transport, + enum kr_selection_error sel_error) +{ + if (!qry->server_selection.initialized) { + return; + } + struct iter_local_state *local_state = qry->server_selection.local_state->private; + struct address_state *addr_state = get_address_state(local_state, transport); + local_state->last_error = sel_error; + error(qry, addr_state, transport, sel_error); +} + +void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport, + unsigned rtt) +{ + if (!qry->server_selection.initialized) { + return; + } + struct iter_local_state *local_state = qry->server_selection.local_state->private; + struct address_state *addr_state = get_address_state(local_state, transport); + update_rtt(qry, addr_state, transport, rtt); +} diff --git a/lib/selection_iter.h b/lib/selection_iter.h new file mode 100644 index 000000000..f1c798bfc --- /dev/null +++ b/lib/selection_iter.h @@ -0,0 +1,34 @@ +/* Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#pragma once + +#include "lib/selection.h" + +/** + * If one of the errors set to true is encountered, there is no point in asking this server again. 
+ */ +static const bool UNRECOVERABLE_ERRORS[] = { + [KR_SELECTION_QUERY_TIMEOUT] = false, + [KR_SELECTION_TLS_HANDSHAKE_FAILED] = false, + [KR_SELECTION_TCP_CONNECT_FAILED] = false, + [KR_SELECTION_TCP_CONNECT_TIMEOUT] = false, + [KR_SELECTION_REFUSED] = true, + [KR_SELECTION_SERVFAIL] = true, + [KR_SELECTION_FORMERROR] = false, + [KR_SELECTION_NOTIMPL] = true, + [KR_SELECTION_OTHER_RCODE] = true, + [KR_SELECTION_TRUNCATED] = false, + [KR_SELECTION_DNSSEC_ERROR] = true, + [KR_SELECTION_LAME_DELEGATION] = true, + [KR_SELECTION_BAD_CNAME] = true, +}; + +void iter_local_state_alloc(struct knot_mm *mm, void **local_state); +void iter_choose_transport(struct kr_query *qry, + struct kr_transport **transport); +void iter_error(struct kr_query *qry, const struct kr_transport *transport, + enum kr_selection_error sel_error); +void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport, + unsigned rtt); \ No newline at end of file diff --git a/lib/utils.c b/lib/utils.c index ea10f2925..f2027a90d 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -11,7 +11,7 @@ #include "lib/defines.h" #include "lib/generic/array.h" #include "lib/module.h" -#include "lib/nsrep.h" +#include "lib/selection.h" #include "lib/resolve.h" #include diff --git a/lib/utils.h b/lib/utils.h index 022ae301e..b4f92fa63 100644 --- a/lib/utils.h +++ b/lib/utils.h @@ -118,6 +118,7 @@ static inline void free_const(const void *what) free((void *)what); } +// Use this for allocations with mm. static inline void *mm_alloc(knot_mm_t *mm, size_t size) { if (mm) return mm->alloc(mm->ctx, size); @@ -137,6 +138,7 @@ KR_EXPORT void *mm_realloc(knot_mm_t *mm, void *what, size_t size, size_t prev_size); /** Trivial malloc() wrapper. */ +// Use mm_alloc for allocations into mempool void *mm_malloc(void *ctx, size_t n); /** posix_memalign() wrapper. 
*/ void *mm_malloc_aligned(void *ctx, size_t n); diff --git a/lib/zonecut.c b/lib/zonecut.c index 5839b9743..7227bf9c0 100644 --- a/lib/zonecut.c +++ b/lib/zonecut.c @@ -337,17 +337,8 @@ static addrset_info_t fetch_addr(pack_t *addrs, const knot_dname_t *ns, uint16_t (int)rd->len, (int)rrtype); continue; } - /* Check RTT cache - whether the IP is usable or not. */ - kr_nsrep_rtt_lru_entry_t *rtt_e = ctx->cache_rtt - ? lru_get_try(ctx->cache_rtt, (const char *)rd->data, rd->len) - : NULL; - const bool unusable = rtt_e && rtt_e->score >= KR_NS_TIMEOUT - && qry->creation_time_mono - < rtt_e->tout_timestamp + ctx->cache_rtt_tout_retry_interval; - if (!unusable) { - result = AI_OK; - ++usable_cnt; - } + result = AI_OK; + ++usable_cnt; ret = pack_obj_push(addrs, rd->data, rd->len); assert(!ret); /* didn't fit because of incorrectly reserved memory */ @@ -413,16 +404,10 @@ static int fetch_ns(struct kr_context *ctx, struct kr_zonecut *cut, pack_init(**pack); addrset_info_t infos[2]; + /* Fetch NS reputation and decide whether to prefetch A/AAAA records. */ - unsigned *cached = lru_get_try(ctx->cache_rep, - (const char *)ns_name, ns_size); - unsigned reputation = (cached) ? *cached : 0; - infos[0] = (reputation & KR_NS_NOIP4) || qry->flags.NO_IPV4 - ? AI_REPUT - : fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry); - infos[1] = (reputation & KR_NS_NOIP6) || qry->flags.NO_IPV6 - ? 
AI_REPUT - : fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry); + infos[0] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry); + infos[1] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry); #if 0 /* rather unlikely to be useful unless changing some zcut code */ WITH_VERBOSE(qry) { diff --git a/modules/bogus_log/test.integr/kresd_config.j2 b/modules/bogus_log/test.integr/kresd_config.j2 index 3bf1203ea..4054fd22e 100644 --- a/modules/bogus_log/test.integr/kresd_config.j2 +++ b/modules/bogus_log/test.integr/kresd_config.j2 @@ -11,7 +11,7 @@ function check_stats(got) local expected = { [1] = { ['type'] = 'DNSKEY', - ['count'] = 2, + ['count'] = 8, -- This is a trade-off to not hardfailing on DNSSEC errors ['name'] = '.', } } diff --git a/modules/policy/README.rst b/modules/policy/README.rst index 85c524ad3..d209f846c 100644 --- a/modules/policy/README.rst +++ b/modules/policy/README.rst @@ -332,6 +332,11 @@ Actions :func:`policy.FORWARD`, :func:`policy.TLS_FORWARD` and :func:`policy.STU policy.STUB('192.0.2.1@5353'), {todname('1.168.192.in-addr.arpa')})) +.. note:: Forwarding targets must support + `EDNS `_ and + `0x20 randomization `_. + + .. 
_tls-forwarding: Forwarding over TLS protocol (DNS-over-TLS) diff --git a/modules/policy/policy.lua b/modules/policy/policy.lua index e19d71466..b13ee798d 100644 --- a/modules/policy/policy.lua +++ b/modules/policy/policy.lua @@ -77,18 +77,13 @@ function policy.MIRROR(target) end -- Override the list of nameservers (forwarders) -local function set_nslist(qry, list) +local function set_nslist(req, list) local ns_i = 0 for _, ns in ipairs(list) do - -- kr_nsrep_set() can return kr_error(ENOENT), it's OK - if ffi.C.kr_nsrep_set(qry, ns_i, ns) == 0 then + if ffi.C.kr_forward_add_target(req, ns) == 0 then ns_i = ns_i + 1 end end - -- If less than maximum NSs, insert guard to terminate the list - if ns_i < 3 then - assert(ffi.C.kr_nsrep_set(qry, ns_i, nil) == 0); - end if ns_i == 0 then -- would use assert() but don't want to compose the message if not triggered error('no usable address in NS set (check net.ipv4 and ' @@ -102,7 +97,6 @@ function policy.STUB(target) if type(target) == 'table' then for _, v in pairs(target) do table.insert(list, addr2sock(v, 53)) - assert(#list <= 4, 'at most 4 STUB targets are supported') end else table.insert(list, addr2sock(target, 53)) @@ -112,7 +106,7 @@ function policy.STUB(target) -- Switch mode to stub resolver, do not track origin zone cut since it's not real authority NS qry.flags.STUB = true qry.flags.ALWAYS_CUT = false - set_nslist(qry, list) + set_nslist(req, list) return state end end @@ -123,7 +117,6 @@ function policy.FORWARD(target) if type(target) == 'table' then for _, v in pairs(target) do table.insert(list, addr2sock(v, 53)) - assert(#list <= 4, 'at most 4 FORWARD targets are supported') end else table.insert(list, addr2sock(target, 53)) @@ -136,7 +129,7 @@ function policy.FORWARD(target) qry.flags.ALWAYS_CUT = false qry.flags.NO_MINIMIZE = true qry.flags.AWAIT_CUT = true - set_nslist(qry, list) + set_nslist(req, list) return state end end @@ -145,8 +138,6 @@ end function policy.TLS_FORWARD(targets) if type(targets) ~= 
'table' or #targets < 1 then error('TLS_FORWARD argument must be a non-empty table') - elseif #targets > 4 then - error('TLS_FORWARD supports at most four targets (in a single call)') end local sockaddr_c_set = {} @@ -182,7 +173,7 @@ function policy.TLS_FORWARD(targets) qry.flags.AWAIT_CUT = true req.options.TCP = true qry.flags.TCP = true - set_nslist(qry, nslist) + set_nslist(req, nslist) return state end end diff --git a/modules/stats/stats.c b/modules/stats/stats.c index 132c05c49..f9c47ba82 100644 --- a/modules/stats/stats.c +++ b/modules/stats/stats.c @@ -147,7 +147,7 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt) { struct kr_request *req = ctx->req; struct kr_query *qry = req->current_query; - if (qry->flags.CACHED || !req->upstream.addr) { + if (qry->flags.CACHED || !req->upstream.transport) { return ctx->state; } @@ -158,11 +158,11 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt) /* Socket address is encoded into sockaddr_in6 struct that * unions with sockaddr_in and differ in sa_family */ struct sockaddr_in6 *e = &data->upstreams.q.at[data->upstreams.head]; - const struct sockaddr *src = req->upstream.addr; - switch (src->sa_family) { - case AF_INET: memcpy(e, src, sizeof(struct sockaddr_in)); break; - case AF_INET6: memcpy(e, src, sizeof(struct sockaddr_in6)); break; - default: return ctx->state; + const union inaddr *src = &req->upstream.transport->address; + switch (src->ip.sa_family) { + case AF_INET: memcpy(e, &src->ip4, sizeof(src->ip4)); break; + case AF_INET6: memcpy(e, &src->ip6, sizeof(src->ip6)); break; + default: return ctx->state; } /* Replace port number with the RTT information (cap is UINT16_MAX milliseconds) */ e->sin6_rtt = req->upstream.rtt; diff --git a/tests/config/test_utils.lua b/tests/config/test_utils.lua index 93a937708..4389293b2 100644 --- a/tests/config/test_utils.lua +++ b/tests/config/test_utils.lua @@ -111,7 +111,7 @@ function M.check_answer(desc, qname, qtype, expected_rcode, expected_rdata) end ) - 
for delay = 0.1, 4, 0.5 do -- total max 14.9s in 8 steps + for delay = 0.1, 5, 0.5 do -- total max 23.5s in 10 steps if done then return end worker.sleep(delay) end