git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
selection: server selection rewrite
author: Štěpán Balážik <stepan.balazik@nic.cz>
Fri, 20 Mar 2020 18:43:11 +0000 (19:43 +0100)
committer: Vladimír Čunát <vladimir.cunat@nic.cz>
Thu, 31 Dec 2020 14:35:58 +0000 (15:35 +0100)
Design discussion: #447
Code discussion: !1030

33 files changed:
bench/bench_lru.c
daemon/bindings/cache.c
daemon/engine.c
daemon/lua/kres-gen.lua
daemon/lua/kres-gen.sh
daemon/worker.c
daemon/worker.h
daemon/zimport.c
doc/lib.rst
lib/defines.h
lib/layer/iterate.c
lib/layer/validate.c
lib/meson.build
lib/nsrep.c [deleted file]
lib/nsrep.h [deleted file]
lib/resolve.c
lib/resolve.h
lib/rplan.c
lib/rplan.h
lib/selection.c [new file with mode: 0644]
lib/selection.h [new file with mode: 0644]
lib/selection_forward.c [new file with mode: 0644]
lib/selection_forward.h [new file with mode: 0644]
lib/selection_iter.c [new file with mode: 0644]
lib/selection_iter.h [new file with mode: 0644]
lib/utils.c
lib/utils.h
lib/zonecut.c
modules/bogus_log/test.integr/kresd_config.j2
modules/policy/README.rst
modules/policy/policy.lua
modules/stats/stats.c
tests/config/test_utils.lua

index a885c2915250ebed524577baa0e14d4143d5fb0d..9fbf0d6fcc189c1f6bbe39d4f74df517c0d5c473 100644 (file)
@@ -11,7 +11,7 @@
 
 #include "contrib/ucw/lib.h"
 #include "daemon/engine.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 
 typedef kr_nsrep_lru_t lru_bench_t;
 
index 7b08374e364822b29e6c5837bd21a3145a05b1e4..4a8afd931843f9680236f7591bdba90627099e08 100644 (file)
@@ -268,8 +268,6 @@ static int cache_clear_everything(lua_State *L)
 
        /* Clear reputation tables */
        struct kr_context *ctx = &the_worker->engine->resolver;
-       lru_reset(ctx->cache_rtt);
-       lru_reset(ctx->cache_rep);
        lru_reset(ctx->cache_cookie);
        lua_pushboolean(L, true);
        return 1;
index 59a6cf18936e6d494720cd312ed68fe696ac8167..d0d6d87cee8b53306cdfc66b33de77d83f836461 100644 (file)
@@ -22,8 +22,7 @@
 #include "kresconfig.h"
 #include "daemon/engine.h"
 #include "daemon/ffimodule.h"
-#include "daemon/worker.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/cache/api.h"
 #include "lib/defines.h"
 #include "lib/cache/cdb_lmdb.h"
@@ -397,9 +396,6 @@ static int init_resolver(struct engine *engine)
        engine->resolver.tls_padding = -1;
        /* Empty init; filled via ./lua/postconfig.lua */
        kr_zonecut_init(&engine->resolver.root_hints, (const uint8_t *)"", engine->pool);
-       /* Open NS rtt + reputation cache */
-       lru_create(&engine->resolver.cache_rtt, LRU_RTT_SIZE, NULL, NULL);
-       lru_create(&engine->resolver.cache_rep, LRU_REP_SIZE, NULL, NULL);
        lru_create(&engine->resolver.cache_cookie, LRU_COOKIES_SIZE, NULL, NULL);
 
        /* Load basic modules */
@@ -578,8 +574,6 @@ void engine_deinit(struct engine *engine)
        kr_cache_close(&engine->resolver.cache);
 
        /* The LRUs are currently malloc-ated and need to be freed. */
-       lru_free(engine->resolver.cache_rtt);
-       lru_free(engine->resolver.cache_rep);
        lru_free(engine->resolver.cache_cookie);
 
        network_deinit(&engine->net);
index b5182a295e03fe8db30ed8c8f03e543173b0ec5a..36f21a2e1cff2a350910f7b323a974582a3da28f 100644 (file)
@@ -7,6 +7,13 @@ typedef struct knot_dump_style knot_dump_style_t;
 extern const knot_dump_style_t KNOT_DUMP_STYLE_DEFAULT;
 struct kr_cdb_api {};
 struct lru {};
+typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t;
+typedef struct {
+       uint16_t pos;
+       uint16_t flags;
+       uint16_t compress_ptr[16];
+} knot_rrinfo_t;
+typedef unsigned char knot_dname_t;
 
 typedef struct knot_mm {
        void *ctx, *alloc, *free;
@@ -17,13 +24,7 @@ typedef void (*map_free_f)(void *baton, void *ptr);
 typedef void (*trace_log_f) (const struct kr_request *, const char *);
 typedef void (*trace_callback_f)(struct kr_request *);
 typedef uint8_t * (*alloc_wire_f)(struct kr_request *req, uint16_t *maxlen);
-typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t;
-typedef struct {
-       uint16_t pos;
-       uint16_t flags;
-       uint16_t compress_ptr[16];
-} knot_rrinfo_t;
-typedef unsigned char knot_dname_t;
+typedef bool (*addr_info_f)(struct sockaddr*);
 typedef struct {
        knot_dname_t *_owner;
        uint32_t _ttl;
@@ -136,6 +137,11 @@ typedef struct {
        size_t len;
        size_t cap;
 } ranked_rr_array_t;
+typedef struct {
+       union inaddr *at;
+       size_t len;
+       size_t cap;
+} inaddr_array_t;
 struct kr_zonecut {
        knot_dname_t *name;
        knot_rrset_t *key;
@@ -177,7 +183,7 @@ struct kr_request {
        } qsource;
        struct {
                unsigned int rtt;
-               const struct sockaddr *addr;
+               const struct kr_transport *transport;
        } upstream;
        struct kr_qflags options;
        int state;
@@ -193,6 +199,12 @@ struct kr_request {
        int vars_ref;
        knot_mm_t pool;
        unsigned int uid;
+       struct {
+               addr_info_f is_tls_capable;
+               addr_info_f is_tcp_connected;
+               addr_info_f is_tcp_waiting;
+               inaddr_array_t forwarding_targets;
+       } selection_context;
        unsigned int count_no_nsaddr;
        unsigned int count_fail_row;
        alloc_wire_f alloc_wire_cb;
@@ -262,19 +274,19 @@ struct kr_module {
        void *lib;
        void *data;
 };
+struct kr_server_selection {
+       _Bool initialized;
+       void (*choose_transport)(struct kr_query *, struct kr_transport **);
+       void (*update_rtt)(struct kr_query *, const struct kr_transport *, unsigned int);
+       void (*error)(struct kr_query *, const struct kr_transport *, enum kr_selection_error);
+       struct local_state *local_state;
+};
 kr_layer_t kr_layer_t_static;
 typedef int32_t (*kr_stale_cb)(int32_t ttl, const knot_dname_t *owner, uint16_t type,
                                const struct kr_query *qry);
 
 void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner,
                        uint16_t type, uint16_t rclass, uint32_t ttl);
-struct kr_nsrep {
-       unsigned int score;
-       unsigned int reputation;
-       const knot_dname_t *name;
-       struct kr_context *ctx;
-       /* beware: hidden stub, to avoid hardcoding sockaddr lengths */
-};
 struct kr_query {
        struct kr_query *parent;
        knot_dname_t *sname;
@@ -295,7 +307,7 @@ struct kr_query {
        struct kr_query *cname_parent;
        struct kr_request *request;
        kr_stale_cb stale_cb;
-       struct kr_nsrep ns;
+       struct kr_server_selection server_selection;
 };
 struct kr_context {
        struct kr_qflags options;
@@ -305,8 +317,13 @@ struct kr_context {
        map_t negative_anchors;
        struct kr_zonecut root_hints;
        struct kr_cache cache;
+       unsigned int cache_rtt_tout_retry_interval;
        char _stub[];
 };
+struct kr_transport {
+       knot_dname_t *ns_name;
+       /* beware: hidden stub, to avoid hardcoding sockaddr lengths */
+};
 const char *knot_strerror(int);
 knot_dname_t *knot_dname_copy(const knot_dname_t *, knot_mm_t *);
 knot_dname_t *knot_dname_from_str(uint8_t *, const char *, size_t);
@@ -336,7 +353,7 @@ struct kr_query *kr_rplan_push(struct kr_rplan *, struct kr_query *, const knot_
 int kr_rplan_pop(struct kr_rplan *, struct kr_query *);
 struct kr_query *kr_rplan_resolved(struct kr_rplan *);
 struct kr_query *kr_rplan_last(struct kr_rplan *);
-int kr_nsrep_set(struct kr_query *, size_t, const struct sockaddr *);
+int kr_forward_add_target(struct kr_request *, const struct sockaddr *);
 void kr_log_req(const struct kr_request * const, uint32_t, const unsigned int, const char *, const char *, ...);
 void kr_log_q(const struct kr_query * const, const char *, const char *, ...);
 int kr_make_query(struct kr_query *, knot_pkt_t *);
index ad58d238df24185aa5d65f018ee127cd37000d3e..4b25f714d1d3604a78c90edb98efbe285e1f6961 100755 (executable)
@@ -60,6 +60,14 @@ struct kr_cdb_api {};
 struct lru {};
 "
 
+${CDEFS} ${LIBKRES} types <<-EOF
+       knot_section_t
+       knot_rrinfo_t
+       knot_dname_t
+       #knot_rdata_t
+       #knot_rdataset_t
+EOF
+
 # The generator doesn't work well with typedefs of functions.
 printf "
 typedef struct knot_mm {
@@ -71,16 +79,9 @@ typedef void (*map_free_f)(void *baton, void *ptr);
 typedef void (*trace_log_f) (const struct kr_request *, const char *);
 typedef void (*trace_callback_f)(struct kr_request *);
 typedef uint8_t * (*alloc_wire_f)(struct kr_request *req, uint16_t *maxlen);
+typedef bool (*addr_info_f)(struct sockaddr*);
 "
 
-${CDEFS} ${LIBKRES} types <<-EOF
-       knot_section_t
-       knot_rrinfo_t
-       knot_dname_t
-       #knot_rdata_t
-       #knot_rdataset_t
-EOF
-
 genResType() {
        echo "$1" | ${CDEFS} ${LIBKRES} types
 }
@@ -108,6 +109,7 @@ ${CDEFS} ${LIBKRES} types <<-EOF
        struct kr_qflags
        ranked_rr_array_entry_t
        ranked_rr_array_t
+       inaddr_array_t
        struct kr_zonecut
        kr_qarray_t
        struct kr_rplan
@@ -124,6 +126,7 @@ ${CDEFS} ${LIBKRES} types <<-EOF
        # lib/module.h
        struct kr_prop
        struct kr_module
+       struct kr_server_selection
 EOF
 
 # a static variable; the line might not be simple to generate
@@ -139,14 +142,15 @@ void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner,
 
 ## Some definitions would need too many deps, so shorten them.
 
-genResType "struct kr_nsrep" | sed '/union/,$ d'
-printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
-
 genResType "struct kr_query"
 
-genResType "struct kr_context" | sed '/kr_nsrep_rtt_lru_t/,$ d'
+genResType "struct kr_context" | sed '/module_array_t/,$ d'
 printf "\tchar _stub[];\n};\n"
 
+
+echo "struct kr_transport" | ${CDEFS} ${KRESD} types | sed '/union /,$ d'
+printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
+
 ## libknot API
 ${CDEFS} libknot functions <<-EOF
 # Utils
@@ -188,8 +192,8 @@ ${CDEFS} ${LIBKRES} functions <<-EOF
        kr_rplan_pop
        kr_rplan_resolved
        kr_rplan_last
-# Nameservers
-       kr_nsrep_set
+# Forwarding
+       kr_forward_add_target
 # Utils
        kr_log_req
        kr_log_q
@@ -277,6 +281,7 @@ printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
 echo "struct qr_task" | ${CDEFS} ${KRESD} types | sed '/pktbuf/,$ d'
 printf "\t/* beware: hidden stub, to avoid qr_tasklist_t */\n};\n"
 
+
 ${CDEFS} ${KRESD} functions <<-EOF
        worker_resolve_exec
        worker_resolve_mk_pkt
index 6028c0e0df231bfe050bf920506e972b2198c613..8c7684dff0323b538cad73bd16b8113185d81706 100644 (file)
@@ -83,15 +83,15 @@ struct qr_task
        qr_tasklist_t waiting;
        struct session *pending[MAX_PENDING];
        uint16_t pending_count;
-       uint16_t addrlist_count;
-       uint16_t addrlist_turn;
        uint16_t timeouts;
        uint16_t iter_count;
-       struct sockaddr *addrlist;
        uint32_t refs;
        bool finished : 1;
        bool leading  : 1;
        uint64_t creation_time;
+       uint64_t send_time;
+       uint64_t recv_time;
+       struct kr_transport *transport;
 };
 
 
@@ -120,15 +120,15 @@ static int qr_task_send(struct qr_task *task, struct session *session,
                        const struct sockaddr *addr, knot_pkt_t *pkt);
 static int qr_task_finalize(struct qr_task *task, int state);
 static void qr_task_complete(struct qr_task *task);
-static struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
                                                 const struct sockaddr *addr);
 static int worker_add_tcp_waiting(struct worker_ctx *worker,
                                  const struct sockaddr *addr,
                                  struct session *session);
-static struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
                                               const struct sockaddr *addr);
 static void on_tcp_connect_timeout(uv_timer_t *timer);
-static void on_retransmit(uv_timer_t *req);
+static void on_udp_timeout(uv_timer_t *timer);
 static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt);
 
 
@@ -310,6 +310,19 @@ static void free_wire(const struct request_ctx *ctx)
        kr_log_verbose("[xdp] freed unsent buffer, ret = %d\n", ret);
 }
 #endif
+/* Helper functions for transport selection */
+static inline bool is_tls_capable(struct sockaddr *address) {
+       tls_client_param_t *tls_entry = tls_client_param_get(the_worker->engine->net.tls_client_params, address);
+       return tls_entry;
+}
+
+static inline bool is_tcp_connected(struct sockaddr *address) {
+       return worker_find_tcp_connected(the_worker, address);
+}
+
+static inline bool is_tcp_waiting(struct sockaddr *address) {
+       return worker_find_tcp_waiting(the_worker, address);
+}
 
 /** Create and initialize a request_ctx (on a fresh mempool).
  *
@@ -383,6 +396,12 @@ static struct request_ctx *request_create(struct worker_ctx *worker,
                req->qsource.dst_addr = &ctx->source.dst_addr.ip;
        }
 
+       req->selection_context.is_tls_capable = is_tls_capable;
+       req->selection_context.is_tcp_connected = is_tcp_connected;
+       req->selection_context.is_tcp_waiting = is_tcp_waiting;
+       array_init(req->selection_context.forwarding_targets);
+       array_reserve_mm(req->selection_context.forwarding_targets, 1, kr_memreserve, &req->pool);
+
        worker->stats.rconcurrent += 1;
 
        return ctx;
@@ -559,7 +578,6 @@ static void qr_task_complete(struct qr_task *task)
 /* This is called when we send subrequest / answer */
 int qr_task_on_send(struct qr_task *task, const uv_handle_t *handle, int status)
 {
-
        if (task->finished) {
                assert(task->leading == false);
                qr_task_complete(task);
@@ -572,26 +590,17 @@ int qr_task_on_send(struct qr_task *task, const uv_handle_t *handle, int status)
        assert(s);
 
        if (handle->type == UV_UDP && session_flags(s)->outgoing) {
-               /* Start the timeout timer for UDP here, since this is the closest
-                * to the wire we can get. */
-               struct kr_request *req = &task->ctx->req;
-               /* Check current query NSLIST */
-               struct kr_query *qry = array_tail(req->rplan.pending);
+               // This should ensure that we are only dealing with our question to upstream
+               assert(!knot_wire_get_qr(task->pktbuf->wire));
+               // start the timer
+               struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
                assert(qry != NULL);
-               /* Retransmit at default interval, or more frequently if the mean
-                * RTT of the server is better. If the server is glued, use default rate. */
-               size_t timeout = qry->ns.score;
-               if (timeout > KR_NS_GLUED) {
-                       /* We don't have information about variance in RTT, expect +10ms */
-                       timeout = MIN(qry->ns.score + 10, KR_CONN_RETRY);
-               } else {
-                       timeout = KR_CONN_RETRY;
-               }
 
-               int ret = session_timer_start(s, on_retransmit, timeout, 0);
+               size_t timeout = task->transport->timeout;
+               int ret = session_timer_start(s, on_udp_timeout, timeout, 0);
                /* Start next step with timeout, fatal if can't start a timer. */
                if (ret != 0) {
-                       subreq_finalize(task, &qry->ns.addr->ip, task->pktbuf);
+                       subreq_finalize(task, &task->transport->address.ip, task->pktbuf);
                        qr_task_finalize(task, KR_STATE_FAIL);
                }
        }
@@ -681,6 +690,9 @@ static int qr_task_send(struct qr_task *task, struct session *session,
        qr_task_ref(task);
 
        struct worker_ctx *worker = ctx->worker;
+       /* Note time for upstream RTT */
+       task->send_time = kr_now();
+       task->recv_time = 0; // task structure is being reused so we have to zero this out here
        /* Send using given protocol */
        assert(!session_flags(session)->closing);
        if (session_flags(session)->has_http) {
@@ -793,11 +805,9 @@ static int session_tls_hs_cb(struct session *session, int status)
        if (status) {
                struct qr_task *task = session_waitinglist_get(session);
                if (task) {
-                       struct kr_qflags *options = &task->ctx->req.options;
-                       unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(NULL, peer, score,
-                                           the_worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
+                       // TLS handshake failed, report it to server selection
+                       struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
+                       qry->server_selection.error(qry, task->transport, KR_SELECTION_TLS_HANDSHAKE_FAILED);
                }
 #ifndef NDEBUG
                else {
@@ -973,13 +983,10 @@ static void on_connect(uv_connect_t *req, int status)
                struct qr_task *task = session_waitinglist_get(session);
                if (task && status != UV_ETIMEDOUT) {
                        /* Penalize upstream.
-                        * In case of UV_ETIMEDOUT upstream has been
-                        * already penalized in on_tcp_connect_timeout() */
-                       struct kr_qflags *options = &task->ctx->req.options;
-                       unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(NULL, peer, score,
-                                           worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
+                       * In case of UV_ETIMEDOUT upstream has been
+                       * already penalized in on_tcp_connect_timeout() */
+                       struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
+                       qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED);
                }
                assert(session_tasklist_is_empty(session));
                session_waitinglist_retry(session, false);
@@ -1061,10 +1068,7 @@ static void on_tcp_connect_timeout(uv_timer_t *timer)
                            peer_str ? peer_str : "");
        }
 
-       unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-       kr_nsrep_update_rtt(NULL, peer, score,
-                           worker->engine->resolver.cache_rtt,
-                           KR_NS_UPDATE_NORESET);
+       qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_TIMEOUT);
 
        worker->stats.timeout += session_waitinglist_get_len(session);
        session_waitinglist_retry(session, true);
@@ -1089,34 +1093,28 @@ static void on_udp_timeout(uv_timer_t *timer)
 
        uv_timer_stop(timer);
 
-       /* Penalize all tried nameservers with a timeout. */
        struct qr_task *task = session_tasklist_get_first(session);
        struct worker_ctx *worker = task->ctx->worker;
+
        if (task->leading && task->pending_count > 0) {
                struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
-               struct sockaddr_in6 *addrlist = (struct sockaddr_in6 *)task->addrlist;
-               for (uint16_t i = 0; i < MIN(task->pending_count, task->addrlist_count); ++i) {
-                       struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]);
-                       WITH_VERBOSE(qry) {
-                               char *addr_str = kr_straddr(choice);
-                               VERBOSE_MSG(qry, "=> server: '%s' flagged as 'bad'\n", addr_str ? addr_str : "");
-                       }
-                       unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(&qry->ns, choice, score,
-                                           worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
-               }
+               qry->server_selection.error(qry, task->transport, KR_SELECTION_QUERY_TIMEOUT);
        }
+
        task->timeouts += 1;
        worker->stats.timeout += 1;
        qr_task_step(task, NULL, NULL);
 }
 
-static uv_handle_t *retransmit(struct qr_task *task)
+static uv_handle_t *transmit(struct qr_task *task)
 {
        uv_handle_t *ret = NULL;
-       if (task && task->addrlist && task->addrlist_count > 0) {
-               struct sockaddr_in6 *choice = &((struct sockaddr_in6 *)task->addrlist)[task->addrlist_turn];
+
+       if (task) {
+               struct kr_transport* transport = task->transport;
+
+               struct sockaddr_in6 *choice = (struct sockaddr_in6 *)&transport->address;
+
                if (!choice) {
                        return ret;
                }
@@ -1125,7 +1123,7 @@ static uv_handle_t *retransmit(struct qr_task *task)
                }
                /* Checkout answer before sending it */
                struct request_ctx *ctx = task->ctx;
-               if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)choice, SOCK_DGRAM, task->pktbuf) != 0) {
+               if (kr_resolve_checkout(&ctx->req, NULL, transport, task->pktbuf) != 0) {
                        return ret;
                }
                ret = ioreq_spawn(ctx->worker, SOCK_DGRAM, choice->sin6_family, false, false);
@@ -1144,31 +1142,12 @@ static uv_handle_t *retransmit(struct qr_task *task)
                } else {
                        task->pending[task->pending_count] = session;
                        task->pending_count += 1;
-                       task->addrlist_turn = (task->addrlist_turn + 1) %
-                                              task->addrlist_count; /* Round robin */
                        session_start_read(session); /* Start reading answer */
                }
        }
        return ret;
 }
 
-static void on_retransmit(uv_timer_t *req)
-{
-       struct session *session = req->data;
-       assert(session_tasklist_get_len(session) == 1);
-
-       uv_timer_stop(req);
-       struct qr_task *task = session_tasklist_get_first(session);
-       if (retransmit(task) == NULL) {
-               /* Not possible to spawn request, start timeout timer with remaining deadline. */
-               struct kr_qflags *options = &task->ctx->req.options;
-               uint64_t timeout = options->FORWARD || options->STUB ? KR_NS_FWD_TIMEOUT / 2 :
-                                  KR_CONN_RTT_MAX - task->pending_count * KR_CONN_RETRY;
-               uv_timer_start(req, on_udp_timeout, timeout, 0);
-       } else {
-               uv_timer_start(req, on_retransmit, KR_CONN_RETRY, 0);
-       }
-}
 
 static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt)
 {
@@ -1196,6 +1175,12 @@ static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_
                        struct kr_query *qry = array_tail(follower->ctx->req.rplan.pending);
                        qry->id = leader_qry->id;
                        qry->secret = leader_qry->secret;
+
+                       // Note that this transport may not be present in `leader_qry`'s server selection
+                       follower->transport = task->transport;
+                       if(follower->transport) {
+                               follower->transport->deduplicated = true;
+                       }
                        leader_qry->secret = 0; /* Next will be already decoded */
                }
                qr_task_step(follower, packet_source, pkt);
@@ -1369,7 +1354,7 @@ static int udp_task_step(struct qr_task *task,
                return kr_ok(); /* Will be notified when outgoing query finishes. */
        }
        /* Start transmitting */
-       uv_handle_t *handle = retransmit(task);
+       uv_handle_t *handle = transmit(task);
        if (handle == NULL) {
                subreq_finalize(task, packet_source, packet);
                return qr_task_finalize(task, KR_STATE_FAIL);
@@ -1517,15 +1502,7 @@ static int tcp_task_make_connection(struct qr_task *task, const struct sockaddr
                worker_del_tcp_waiting(worker, addr);
                free(conn);
                session_close(session);
-               unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-               kr_nsrep_update_rtt(NULL, peer, score,
-                                   worker->engine->resolver.cache_rtt,
-                                   KR_NS_UPDATE_NORESET);
-               WITH_VERBOSE (qry) {
-                       const char *peer_str = kr_straddr(peer);
-                       kr_log_verbose( "[wrkr]=> connect to '%s' failed (%s), flagged as 'bad'\n",
-                                       peer_str ? peer_str : "", uv_strerror(ret));
-               }
+               qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED);
                return kr_error(EAGAIN);
        }
 
@@ -1549,7 +1526,7 @@ static int tcp_task_step(struct qr_task *task,
        assert(task->pending_count == 0);
 
        /* target */
-       const struct sockaddr *addr = task->addrlist;
+       const struct sockaddr *addr = &task->transport->address.ip;
        if (addr->sa_family == AF_UNSPEC) {
                /* Target isn't defined. Finalize task with SERVFAIL.
                 * Although task->pending_count is zero, there are can be followers,
@@ -1559,8 +1536,7 @@ static int tcp_task_step(struct qr_task *task,
        }
        /* Checkout task before connecting */
        struct request_ctx *ctx = task->ctx;
-       if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)addr,
-                               SOCK_STREAM, task->pktbuf) != 0) {
+       if (kr_resolve_checkout(&ctx->req, NULL, task->transport, task->pktbuf) != 0) {
                subreq_finalize(task, packet_source, packet);
                return qr_task_finalize(task, KR_STATE_FAIL);
        }
@@ -1609,10 +1585,6 @@ static int qr_task_step(struct qr_task *task,
        assert(ctx);
        struct kr_request *req = &ctx->req;
        struct worker_ctx *worker = ctx->worker;
-       int sock_type = -1;
-       task->addrlist = NULL;
-       task->addrlist_count = 0;
-       task->addrlist_turn = 0;
 
        if (worker->too_many_open) {
                /* */
@@ -1623,22 +1595,29 @@ static int qr_task_step(struct qr_task *task,
                } else {
                        if (packet && kr_rplan_empty(rplan)) {
                                /* new query; TODO - make this detection more obvious */
-                               kr_resolve_consume(req, packet_source, packet);
+                               kr_resolve_consume(req, &task->transport, packet);
                        }
                        return qr_task_finalize(task, KR_STATE_FAIL);
                }
        }
 
-       int state = kr_resolve_consume(req, packet_source, packet);
+       // Report network RTT back to server selection
+       if (task->send_time && task->recv_time) {
+               struct kr_query *qry = array_tail(req->rplan.pending);
+               qry->server_selection.update_rtt(qry, task->transport, task->recv_time - task->send_time);
+       }
+
+       int state = kr_resolve_consume(req, &task->transport, packet);
+
+       task->transport = NULL;
        while (state == KR_STATE_PRODUCE) {
-               state = kr_resolve_produce(req, &task->addrlist,
-                                          &sock_type, task->pktbuf);
+               state = kr_resolve_produce(req, &task->transport, task->pktbuf);
                if (unlikely(++task->iter_count > KR_ITER_LIMIT ||
                             task->timeouts >= KR_TIMEOUT_LIMIT)) {
 
                        #ifndef NOVERBOSELOG
                        struct kr_rplan *rplan = &req->rplan;
-                       struct kr_query *last  = kr_rplan_last(rplan);
+                       struct kr_query *last = kr_rplan_last(rplan);
                        if (task->iter_count > KR_ITER_LIMIT) {
                                VERBOSE_MSG(last, "canceling query due to exceeded iteration count limit of %d\n", KR_ITER_LIMIT);
                        }
@@ -1654,47 +1633,22 @@ static int qr_task_step(struct qr_task *task,
        /* We're done, no more iterations needed */
        if (state & (KR_STATE_DONE|KR_STATE_FAIL)) {
                return qr_task_finalize(task, state);
-       } else if (!task->addrlist || sock_type < 0) {
+       } else if (!task->transport || !task->transport->protocol) {
                return qr_task_step(task, NULL, NULL);
        }
 
-       /* Count available address choices */
-       struct sockaddr_in6 *choice = (struct sockaddr_in6 *)task->addrlist;
-       for (size_t i = 0; i < KR_NSREP_MAXADDR && choice->sin6_family != AF_UNSPEC; ++i) {
-               task->addrlist_count += 1;
-               choice += 1;
-       }
-
-       /* Upgrade to TLS if the upstream address is configured as DoT capable. */
-       if (task->addrlist_count > 0 && kr_inaddr_port(task->addrlist) == KR_DNS_PORT) {
-               /* TODO if there are multiple addresses (task->addrlist_count > 1)
-                * check all of them. */
-               struct network *net = &worker->engine->net;
-               /* task->addrlist has to contain TLS port before tls_client_param_get() call */
-               kr_inaddr_set_port(task->addrlist, KR_DNS_TLS_PORT);
-               tls_client_param_t *tls_entry =
-                       tls_client_param_get(net->tls_client_params, task->addrlist);
-               if (tls_entry) {
-                       packet_source = NULL;
-                       sock_type = SOCK_STREAM;
-                       /* TODO in this case in tcp_task_make_connection() will be performed
-                        * redundant map_get() call. */
-               } else {
-                       /* The function is fairly cheap, so we just change there and back. */
-                       kr_inaddr_set_port(task->addrlist, KR_DNS_PORT);
-               }
-       }
-
-       int ret = 0;
-       if (sock_type == SOCK_DGRAM) {
-               /* Start fast retransmit with UDP. */
-               ret = udp_task_step(task, packet_source, packet);
-       } else {
-               /* TCP. Connect to upstream or send the query if connection already exists. */
-               assert (sock_type == SOCK_STREAM);
-               ret = tcp_task_step(task, packet_source, packet);
+       switch (task->transport->protocol)
+       {
+       case KR_TRANSPORT_UDP:
+               return udp_task_step(task, packet_source, packet);
+               break;
+       case KR_TRANSPORT_TCP: // fall through
+       case KR_TRANSPORT_TLS:
+               return tcp_task_step(task, packet_source, packet);
+       default:
+               assert(0);
+               break;
        }
-       return ret;
 }
 
 static int parse_packet(knot_pkt_t *query)
@@ -1791,12 +1745,15 @@ int worker_submit(struct session *session,
                }
                assert(!session_flags(session)->closing);
                addr = peer;
+               /* Note receive time for RTT calculation */
+               task->recv_time = kr_now();
        }
        assert(uv_is_closing(session_get_handle(session)) == false);
 
        /* Packet was successfully parsed.
         * Task was created (found). */
        session_touch(session);
+
        /* Consume input and produce next message */
        return qr_task_step(task, addr, pkt);
 }
@@ -1851,7 +1808,7 @@ int worker_del_tcp_connected(struct worker_ctx *worker,
        return map_del_tcp_session(&worker->tcp_connected, addr);
 }
 
-static struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
                                                 const struct sockaddr* addr)
 {
        return map_find_tcp_session(&worker->tcp_connected, addr);
@@ -1877,7 +1834,7 @@ int worker_del_tcp_waiting(struct worker_ctx *worker,
        return map_del_tcp_session(&worker->tcp_waiting, addr);
 }
 
-static struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
                                               const struct sockaddr* addr)
 {
        return map_find_tcp_session(&worker->tcp_waiting, addr);
@@ -1951,12 +1908,9 @@ int worker_end_tcp(struct session *session)
        return kr_ok();
 }
 
-knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass,
+knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass,
                                   const struct kr_qflags *options)
 {
-       uint8_t qname[KNOT_DNAME_MAXLEN];
-       if (!knot_dname_from_str(qname, qname_str, sizeof(qname)))
-               return NULL;
        knot_pkt_t *pkt = knot_pkt_new(NULL, KNOT_EDNS_MAX_UDP_PAYLOAD, NULL);
        if (!pkt)
                return NULL;
@@ -1991,6 +1945,15 @@ knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16
        return pkt;
 }
 
+knot_pkt_t *worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass,
+                                  const struct kr_qflags *options)
+{ /* Text-name front end: converts qname_str and delegates to worker_resolve_mk_pkt_dname(). */
+       uint8_t qname[KNOT_DNAME_MAXLEN]; /* stack buffer that receives the converted name */
+       if (!knot_dname_from_str(qname, qname_str, sizeof(qname)))
+               return NULL; /* knot_dname_from_str() reported failure for qname_str */
+       return worker_resolve_mk_pkt_dname(qname, qtype, qclass, options); /* result (possibly NULL) is passed through */
+}
+
 struct qr_task *worker_resolve_start(knot_pkt_t *query, struct kr_qflags options)
 {
        struct worker_ctx *worker = the_worker;
index 0e3e275801c2e5a320c7f8565dd8d0eace25731f..5f7be5b43ba301342bc5bfae2ff72d06a961be85 100644 (file)
@@ -47,6 +47,9 @@ int worker_submit(struct session *session,
  */
 int worker_end_tcp(struct session *session);
 
+KR_EXPORT knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass,
+                                  const struct kr_qflags *options);
+
 /**
  * Create a packet suitable for worker_resolve_start().  All in malloc() memory.
  */
@@ -96,6 +99,10 @@ int worker_del_tcp_connected(struct worker_ctx *worker,
                             const struct sockaddr *addr);
 int worker_del_tcp_waiting(struct worker_ctx *worker,
                           const struct sockaddr* addr);
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+                                              const struct sockaddr* addr);
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+                                              const struct sockaddr* addr);
 knot_pkt_t *worker_task_get_pktbuf(const struct qr_task *task);
 
 struct request_ctx *worker_task_get_request(struct qr_task *task);
@@ -136,7 +143,7 @@ struct worker_stats {
 /** @cond internal */
 
 /** Number of request within timeout window. */
-#define MAX_PENDING KR_NSREP_MAXADDR
+#define MAX_PENDING 4
 
 /** Maximum response time from TCP upstream, milliseconds */
 #define MAX_TCP_INACTIVITY (KR_RESOLVE_TIME_LIMIT + KR_CONN_RTT_MAX)
index 71a3b18657afb2dbca20537f9eb082ec55a73f7b..d302120e31ffd1d2a2c60c45a858cb0bc4b449b9 100644 (file)
@@ -33,6 +33,7 @@
  */
 
 #include <inttypes.h> /* PRIu64 */
+#include <limits.h>
 #include <stdlib.h>
 #include <uv.h>
 #include <ucw/mempool.h>
index 101767fe5add31818b3b8da8fcd41923b0611c99..f69f4efed34bd3136f79af7b17e1692681e19888 100644 (file)
@@ -38,7 +38,7 @@ Cache
 Nameservers
 -----------
 
-.. doxygenfile:: nsrep.h
+.. doxygenfile:: selection.h
    :project: libkres
 .. doxygenfile:: zonecut.h
    :project: libkres
index 76a93cb1831d730a71c3dab55abc8ded3823f452..dc8c3788248886455ba331f192c4aaab32dfb1e4 100644 (file)
@@ -50,7 +50,7 @@ static inline int KR_COLD kr_error(int x) {
 #define KR_ITER_LIMIT 100    /* Built-in iterator limit */
 #define KR_RESOLVE_TIME_LIMIT 10000 /* Upper limit for resolution time of single query, ms */
 #define KR_CNAME_CHAIN_LIMIT 13 /* Built-in maximum CNAME chain length */
-#define KR_TIMEOUT_LIMIT 4   /* Maximum number of retries after timeout. */
+#define KR_TIMEOUT_LIMIT 10   /* Maximum number of retries after timeout. */
 #define KR_QUERY_NSRETRY_LIMIT 4 /* Maximum number of retries per query. */
 #define KR_COUNT_NO_NSADDR_LIMIT 5
 #define KR_CONSUME_FAIL_ROW_LIMIT 3 /* Maximum number of KR_STATE_FAIL in a row. */
index e689c432196a4ddde133c96e18979d96a88b3d42..791a6849ca03483ab534aad4a2c95faf667c8bbb 100644 (file)
@@ -29,7 +29,7 @@
 #include "lib/resolve.h"
 #include "lib/rplan.h"
 #include "lib/defines.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/module.h"
 #include "lib/dnssec/ta.h"
 
@@ -213,10 +213,12 @@ static void fetch_glue(knot_pkt_t *pkt, const knot_dname_t *ns, bool in_bailiwic
 
                        if ((rr->type == KNOT_RRTYPE_A) &&
                            (req->ctx->options.NO_IPV4)) {
+                               QVERBOSE_MSG(qry, "<= skipping IPv4 glue due to network settings\n");
                                continue;
                        }
                        if ((rr->type == KNOT_RRTYPE_AAAA) &&
                            (req->ctx->options.NO_IPV6)) {
+                               QVERBOSE_MSG(qry, "<= skipping IPv6 glue due to network settings\n");
                                continue;
                        }
                        (void) update_nsaddr(rr, req->current_query, glue_cnt);
@@ -258,6 +260,7 @@ static int update_cut(knot_pkt_t *pkt, const knot_rrset_t *rr,
                     && knot_dname_in_bailiwick(qry->sname, rr->owner)  >= 0;
        if (!ok) {
                VERBOSE_MSG("<= authority: ns outside bailiwick\n");
+               qry->server_selection.error(qry, req->upstream.transport, KR_SELECTION_LAME_DELEGATION);
 #ifdef STRICT_MODE
                return KR_STATE_FAIL;
 #else
@@ -632,10 +635,11 @@ static int process_referral_answer(knot_pkt_t *pkt, struct kr_request *req)
 {
        const knot_dname_t *cname = NULL;
        int state = unroll_cname(pkt, req, true, &cname);
+       struct kr_query *query = req->current_query;
        if (state != kr_ok()) {
+               query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME);
                return KR_STATE_FAIL;
        }
-       struct kr_query *query = req->current_query;
        if (!(query->flags.CACHED)) {
                /* If not cached (i.e. got from upstream)
                 * make sure that this is not an authoritative answer
@@ -721,6 +725,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
        if (!is_authoritative(pkt, query)) {
                if (!(query->flags.FORWARD) &&
                    pkt_class & (PKT_NXDOMAIN|PKT_NODATA)) {
+                       query->server_selection.error(query, req->upstream.transport, KR_SELECTION_LAME_DELEGATION);
                        VERBOSE_MSG("<= lame response: non-auth sent negative response\n");
                        return KR_STATE_FAIL;
                }
@@ -730,6 +735,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
        /* Process answer type */
        int state = unroll_cname(pkt, req, false, &cname);
        if (state != kr_ok()) {
+               query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME);
                return state;
        }
        /* Make sure that this is an authoritative answer (even with AA=0) for other layers */
@@ -760,6 +766,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
                            q->stype == query->stype   &&
                            knot_dname_is_equal(q->sname, cname)) {
                                VERBOSE_MSG("<= cname chain loop\n");
+                               query->server_selection.error(query, req->upstream.transport, KR_SELECTION_BAD_CNAME);
                                return KR_STATE_FAIL;
                        }
                }
@@ -777,12 +784,6 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
 
                if (query->flags.FORWARD) {
                        next->forward_flags.CNAME = true;
-                       if (query->parent == NULL) {
-                               state = kr_nsrep_copy_set(&next->ns, &query->ns);
-                               if (state != kr_ok()) {
-                                       return KR_STATE_FAIL;
-                               }
-                       }
                }
                next->cname_parent = query;
                /* Want DNSSEC if and only if it's posible to secure
@@ -998,10 +999,8 @@ static int resolve_badmsg(knot_pkt_t *pkt, struct kr_request *req, struct kr_que
        /* Work around broken auths/load balancers */
        if (query->flags.SAFEMODE) {
                return resolve_error(pkt, req);
-       } else if (query->flags.NO_MINIMIZE) {
-               query->flags.SAFEMODE = true;
-               return KR_STATE_DONE;
        } else {
+               query->flags.SAFEMODE = true;
                query->flags.NO_MINIMIZE = true;
                return KR_STATE_DONE;
        }
@@ -1044,13 +1043,16 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
                return resolve_badmsg(pkt, req, query);
        } else
 #endif
+       /* LATER: Query minimization, 0x20 randomization, EDNS… should really be
+        * set and managed by selection.c and SAFEMODE should be split and
+        * removed altogether because it's doing many things at once. */
        if (pkt->parsed <= KNOT_WIRE_HEADER_SIZE) {
                VERBOSE_MSG("<= malformed response (parsed %d)\n", (int)pkt->parsed);
                return resolve_badmsg(pkt, req, query);
        } else if (!is_paired_to_query(pkt, query)) {
                WITH_VERBOSE(query) {
                        const char *ns_str =
-                               req->upstream.addr ? kr_straddr(req->upstream.addr) : "(internal)";
+                               req->upstream.transport ? kr_straddr(&req->upstream.transport->address.ip) : "(internal)";
                        VERBOSE_MSG("<= ignoring mismatching response from %s\n",
                                        ns_str ? ns_str : "(kr_straddr failed)");
                }
@@ -1062,11 +1064,12 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
                VERBOSE_MSG("<= truncated response, failover to TCP\n");
                if (query) {
                        /* Fail if already on TCP. */
-                       if (query->flags.TCP) {
+                       if (req->upstream.transport->protocol != KR_TRANSPORT_UDP) {
                                VERBOSE_MSG("<= TC=1 with TCP, bailing out\n");
+                               query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED);
                                return resolve_error(pkt, req);
                        }
-                       query->flags.TCP = true;
+                       query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED);
                }
                return KR_STATE_CONSUME;
        }
@@ -1079,6 +1082,10 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
        const knot_lookup_t *rcode = knot_lookup_by_id(knot_rcode_names, knot_wire_get_rcode(pkt->wire));
 #endif
 
+       // We can't return directly from the switch because we have to give feedback to server selection first
+       int ret = 0;
+       int selection_error = -1;
+
        /* Check response code. */
        switch(knot_wire_get_rcode(pkt->wire)) {
        case KNOT_RCODE_NOERROR:
@@ -1090,19 +1097,48 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
                knot_wire_set_rcode(req->answer->wire, KNOT_RCODE_YXDOMAIN);
                break;
        case KNOT_RCODE_REFUSED:
+               if (query->flags.STUB) {
+                        /* just pass answer through if in stub mode */
+                       break;
+               }
+               selection_error = KR_SELECTION_REFUSED;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_SERVFAIL:
                if (query->flags.STUB) {
                         /* just pass answer through if in stub mode */
                        break;
                }
-               /* fall through */
+               selection_error = KR_SELECTION_SERVFAIL;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_FORMERR:
+               selection_error = KR_SELECTION_FORMERROR;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_NOTIMPL:
+               selection_error = KR_SELECTION_NOTIMPL;
                VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
-               return resolve_badmsg(pkt, req, query);
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        default:
+               selection_error = KR_SELECTION_OTHER_RCODE;
                VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
-               return resolve_error(pkt, req);
+               ret = resolve_error(pkt, req);
+               break;
+       }
+
+       if (query->server_selection.initialized) {
+               if (selection_error != -1) {
+                       query->server_selection.error(query, req->upstream.transport, selection_error);
+               }
+       }
+
+       if (ret) {
+               return ret;
        }
 
        int state;
@@ -1145,7 +1181,7 @@ rrarray_finalize:
        (void)0;
        ranked_rr_array_t *selected[] = kr_request_selected(req);
        for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
-               int ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool);
+               ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool);
                if (unlikely(ret)) {
                        return KR_STATE_FAIL;
                }
index cbbc0738a8aaa2e2d8940632492740e4cd916c0e..aa8aa8be40ff01cef805155b6ac69d821628b8ec 100644 (file)
@@ -23,6 +23,7 @@
 #include "lib/utils.h"
 #include "lib/defines.h"
 #include "lib/module.h"
+#include "lib/selection.h"
 
 #define VERBOSE_MSG(qry, ...) QRVERBOSE(qry, "vldr", __VA_ARGS__)
 
@@ -349,7 +350,7 @@ static knot_rrset_t *update_ds(struct kr_zonecut *cut, const knot_pktsection_t *
                        return NULL;
                }
        }
-       return new_ds;  
+       return new_ds;
 }
 
 static void mark_insecure_parents(const struct kr_query *qry)
@@ -1190,11 +1191,22 @@ static int hide_bogus(kr_layer_t *ctx) {
        return ctx->state;
 }
 
+static int validate_wrapper(kr_layer_t *ctx, knot_pkt_t *pkt) {
+       // Wrapper for now: runs validate() and reports a failure on a query flagged DNSSEC_BOGUS to server selection.
+       int ret = validate(ctx, pkt);
+       struct kr_request *req = ctx->req;
+       struct kr_query *qry = req->current_query;
+       if (ret & KR_STATE_FAIL && qry->flags.DNSSEC_BOGUS) /* parsed as (ret & KR_STATE_FAIL) && DNSSEC_BOGUS */
+               qry->server_selection.error(qry, req->upstream.transport, KR_SELECTION_DNSSEC_ERROR);
+       return ret; /* the state returned by validate() is passed through unchanged */
+}
+
+
 /** Module implementation. */
 int validate_init(struct kr_module *self)
 {
        static const kr_layer_api_t layer = {
-               .consume = &validate,
+               .consume = &validate_wrapper,
                .answer_finalize = &hide_bogus,
        };
        self->layer = &layer;
index 6d6ec9ce6a7e2d3634f58fc57ce64a94bec8c235..d24d997c79fc90c3d7959138433c1d03408cba16 100644 (file)
@@ -24,9 +24,11 @@ libkres_src = files([
   'layer/iterate.c',
   'layer/validate.c',
   'module.c',
-  'nsrep.c',
   'resolve.c',
   'rplan.c',
+  'selection.c',
+  'selection_forward.c',
+  'selection_iter.c',
   'utils.c',
   'zonecut.c',
 ])
@@ -52,9 +54,11 @@ libkres_headers = files([
   'layer.h',
   'layer/iterate.h',
   'module.h',
-  'nsrep.h',
   'resolve.h',
   'rplan.h',
+  'selection.h',
+  'selection_forward.h',
+  'selection_iter.h',
   'utils.h',
   'zonecut.h',
 ])
diff --git a/lib/nsrep.c b/lib/nsrep.c
deleted file mode 100644 (file)
index c49f406..0000000
+++ /dev/null
@@ -1,570 +0,0 @@
-/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
- *  SPDX-License-Identifier: GPL-3.0-or-later
- */
-
-#include <assert.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netdb.h>
-
-#include <arpa/inet.h>
-
-#include "lib/nsrep.h"
-#include "lib/rplan.h"
-#include "lib/resolve.h"
-#include "lib/defines.h"
-#include "lib/generic/pack.h"
-#include "contrib/ucw/lib.h"
-
-/** Some built-in unfairness ... */
-#ifndef FAVOUR_IPV6
-#define FAVOUR_IPV6 20 /* 20ms bonus for v6 */
-#endif
-
-/** @internal Macro to set address structure. */
-#define ADDR_SET(sa, family, addr, len, port) do {\
-       memcpy(&sa ## _addr, (addr), (len)); \
-       sa ## _family = (family); \
-       sa ## _port = htons(port); \
-} while (0)
-
-/** Update nameserver representation with current name/address pair. */
-static void update_nsrep(struct kr_nsrep *ns, size_t pos, uint8_t *addr, size_t addr_len, int port)
-{
-       if (addr == NULL) {
-               ns->addr[pos].ip.sa_family = AF_UNSPEC;
-               return;
-       }
-
-       /* Rotate previous addresses to the right. */
-       memmove(ns->addr + pos + 1, ns->addr + pos, (KR_NSREP_MAXADDR - pos - 1) * sizeof(ns->addr[0]));
-
-       switch(addr_len) {
-       case sizeof(struct in_addr):
-               ADDR_SET(ns->addr[pos].ip4.sin, AF_INET, addr, addr_len, port); break;
-       case sizeof(struct in6_addr):
-               ADDR_SET(ns->addr[pos].ip6.sin6, AF_INET6, addr, addr_len, port); break;
-       default: assert(0); break;
-       }
-}
-
-static void update_nsrep_set(struct kr_nsrep *ns, const knot_dname_t *name, uint8_t *addr[], unsigned score)
-{
-       /* NSLIST is not empty, empty NS cannot be a leader. */
-       if (!addr[0] && ns->addr[0].ip.sa_family != AF_UNSPEC) {
-               return;
-       }
-       /* Set new NS leader */
-       ns->name = name;
-       ns->score = score;
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               if (addr[i]) {
-                       void *addr_val = pack_obj_val(addr[i]);
-                       size_t len = pack_obj_len(addr[i]);
-                       update_nsrep(ns, i, addr_val, len, KR_DNS_PORT);
-               } else {
-                       break;
-               }
-       }
-}
-
-#undef ADDR_SET
-
-/**
- * \param addr_set pack with one IP address per element */
-static unsigned eval_addr_set(const pack_t *addr_set, struct kr_context *ctx,
-                             struct kr_qflags opts, unsigned score, uint8_t *addr[])
-{
-       kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
-       kr_nsrep_rtt_lru_entry_t *rtt_cache_entry_ptr[KR_NSREP_MAXADDR] = { NULL, };
-       assert (KR_NSREP_MAXADDR >= 2);
-       unsigned rtt_cache_entry_score[KR_NSREP_MAXADDR] = { score, KR_NS_MAX_SCORE + 1, };
-       uint64_t now = kr_now();
-
-       /* Name server is better candidate if it has address record. */
-       for (uint8_t *it = pack_head(*addr_set); it != pack_tail(*addr_set);
-                                               it = pack_obj_next(it)) {
-               void *val = pack_obj_val(it);
-               size_t len = pack_obj_len(it);
-               unsigned favour = 0;
-               bool is_valid = false;
-               /* Check if the address isn't disabled. */
-               if (len == sizeof(struct in6_addr)) {
-                       is_valid = !(opts.NO_IPV6);
-                       favour = FAVOUR_IPV6;
-               } else if (len == sizeof(struct in_addr)) {
-                       is_valid = !(opts.NO_IPV4);
-               } else {
-                       assert(!EINVAL);
-                       is_valid = false;
-               }
-
-               if (!is_valid) {
-                       continue;
-               }
-
-               /* Get score for the current address. */
-               kr_nsrep_rtt_lru_entry_t *cached = rtt_cache ?
-                                                  lru_get_try(rtt_cache, val, len) :
-                                                  NULL;
-               unsigned cur_addr_score = KR_NS_GLUED;
-               if (cached) {
-                       cur_addr_score = cached->score;
-                       if (cached->score >= KR_NS_TIMEOUT) {
-                               /* If NS once was marked as "timeouted",
-                                * it won't participate in NS elections
-                                * at least ctx->cache_rtt_tout_retry_interval milliseconds. */
-                               uint64_t elapsed = now - cached->tout_timestamp;
-                               elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-                               if (elapsed > ctx->cache_rtt_tout_retry_interval) {
-                                       /* Select this NS for probing in this particular query,
-                                        * but don't change the cached score.
-                                        * For other queries this NS will remain "timeouted". */
-                                       cur_addr_score = KR_NS_LONG - 1;
-                               }
-                       }
-               }
-
-               /* We can't always use favour.  If these conditions held:
-                *
-                * rtt_cache_entry_score[i] < KR_NS_TIMEOUT
-                * rtt_cache_entry_score[i] + favour > KR_NS_TIMEOUT
-                * cur_addr_score < rtt_cache_entry_score[i] + favour
-                *
-                * we would prefer "certainly dead" cur_addr_score
-                * instead of "almost dead but alive" rtt_cache_entry_score[i]
-                */
-               const unsigned cur_favour = cur_addr_score < KR_NS_TIMEOUT ? favour : 0;
-               for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-                       if (cur_addr_score >= rtt_cache_entry_score[i] + cur_favour)
-                               continue;
-
-                       /* Shake down previous contenders */
-                       for (size_t j = KR_NSREP_MAXADDR - 1; j > i; --j) {
-                               addr[j] = addr[j - 1];
-                               rtt_cache_entry_ptr[j] = rtt_cache_entry_ptr[j - 1];
-                               rtt_cache_entry_score[j] = rtt_cache_entry_score[j - 1];
-                       }
-                       addr[i] = it;
-                       rtt_cache_entry_score[i] = cur_addr_score;
-                       rtt_cache_entry_ptr[i] = cached;
-                       break;
-               }
-       }
-
-       /* At this point, rtt_cache_entry_ptr contains up to KR_NSREP_MAXADDR
-        * pointers to the rtt cache entries with the best scores for the given addr_set.
-        * Check if there are timeouted NS. */
-
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               if (rtt_cache_entry_ptr[i] == NULL)
-                       continue;
-               if (rtt_cache_entry_ptr[i]->score < KR_NS_TIMEOUT)
-                       continue;
-
-               uint64_t elapsed = now - rtt_cache_entry_ptr[i]->tout_timestamp;
-               elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-               if (elapsed <= ctx->cache_rtt_tout_retry_interval)
-                       continue;
-
-               /* rtt_cache_entry_ptr[i] points to "timeouted" rtt cache entry.
-                * The period of the ban on participation in elections has expired. */
-
-               if (VERBOSE_STATUS) {
-                       void *val = pack_obj_val(addr[i]);
-                       size_t len = pack_obj_len(addr[i]);
-                       char sa_str[INET6_ADDRSTRLEN];
-                       int af = (len == sizeof(struct in6_addr)) ? AF_INET6 : AF_INET;
-                       inet_ntop(af, val, sa_str, sizeof(sa_str));
-                       kr_log_verbose("[     ][nsre] probing timeouted NS: %s, score %i\n",
-                                      sa_str, rtt_cache_entry_ptr[i]->score);
-               }
-
-               rtt_cache_entry_ptr[i]->tout_timestamp = now;
-       }
-
-       return rtt_cache_entry_score[0];
-}
-
-static int eval_nsrep(const knot_dname_t *owner, const pack_t *addr_set, struct kr_query *qry)
-{
-       struct kr_nsrep *ns = &qry->ns;
-       struct kr_context *ctx = ns->ctx;
-       unsigned score = KR_NS_MAX_SCORE;
-       unsigned reputation = 0;
-       uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
-
-       /* Fetch NS reputation */
-       if (ctx->cache_rep) {
-               unsigned *cached = lru_get_try(ctx->cache_rep, (const char *)owner,
-                                              knot_dname_size(owner));
-               if (cached) {
-                       reputation = *cached;
-               }
-       }
-
-       /* Favour nameservers with unknown addresses to probe them,
-        * otherwise discover the current best address for the NS. */
-       if (addr_set->len == 0) {
-               score = KR_NS_UNKNOWN;
-               /* If the server doesn't have IPv6, give it disadvantage. */
-               if (reputation & KR_NS_NOIP6) {
-                       score += FAVOUR_IPV6;
-                       /* If the server is unknown but has rep record, treat it as timeouted */
-                       if (reputation & KR_NS_NOIP4) {
-                               score = KR_NS_UNKNOWN;
-                               /* Try to start with clean slate */
-                               if (!(qry->flags.NO_IPV6)) {
-                                       reputation &= ~KR_NS_NOIP6;
-                               }
-                               if (!(qry->flags.NO_IPV4)) {
-                                       reputation &= ~KR_NS_NOIP4;
-                               }
-                       }
-               }
-       } else {
-               score = eval_addr_set(addr_set, ctx, qry->flags, score, addr_choice);
-       }
-
-       /* Probabilistic bee foraging strategy (naive).
-        * The fastest NS is preferred by workers until it is depleted (timeouts or degrades),
-        * at the same time long distance scouts probe other sources (low probability).
-        * Servers on TIMEOUT will not have probed at all.
-        * Servers with score above KR_NS_LONG will have periodically removed from
-        * reputation cache, so that kresd can reprobe them. */
-       if (score >= KR_NS_TIMEOUT) {
-               return kr_ok();
-       } else if (score <= ns->score &&
-          (score < KR_NS_LONG  || qry->flags.NO_THROTTLE)) {
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-       } else if (kr_rand_coin(1, 10) &&
-                  !kr_rand_coin(score, KR_NS_MAX_SCORE)) {
-               /* With 10% chance probe server with a probability
-                * given by its RTT / MAX_RTT. */
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-               return 1; /* Stop evaluation */
-       } else if (ns->score > KR_NS_MAX_SCORE) {
-               /* Check if any server was already selected.
-                * If no, pick current server and continue evaluation. */
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-       }
-
-       return kr_ok();
-}
-
-int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock)
-{
-       if (!qry) {
-               return kr_error(EINVAL);
-       }
-       if (index >= KR_NSREP_MAXADDR) {
-               return kr_error(ENOSPC);
-       }
-
-       if (!sock) {
-               qry->ns.name = (const uint8_t *)"";
-               qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
-               return kr_ok();
-       }
-
-       switch (sock->sa_family) {
-       case AF_INET:
-               if (qry->flags.NO_IPV4) {
-                       return kr_error(ENOENT);
-               }
-               qry->ns.addr[index].ip4 = *(const struct sockaddr_in *)sock;
-               break;
-       case AF_INET6:
-               if (qry->flags.NO_IPV6) {
-                       return kr_error(ENOENT);
-               }
-               qry->ns.addr[index].ip6 = *(const struct sockaddr_in6 *)sock;
-               break;
-       default:
-               qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
-               return kr_error(EINVAL);
-       }
-
-       qry->ns.name = (const uint8_t *)"";
-       /* Reset score on first entry */
-       if (index == 0) {
-               qry->ns.score = KR_NS_UNKNOWN;
-               qry->ns.reputation = 0;
-       }
-
-       /* Retrieve RTT from cache */
-       struct kr_context *ctx = qry->ns.ctx;
-       kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = ctx
-               ? lru_get_try(ctx->cache_rtt, kr_inaddr(sock), kr_family_len(sock->sa_family))
-               : NULL;
-       if (rtt_cache_entry) {
-               qry->ns.score = MIN(qry->ns.score, rtt_cache_entry->score);
-       }
-
-       return kr_ok();
-}
-
-#define ELECT_INIT(ns, ctx_) do { \
-       (ns)->ctx = (ctx_); \
-       (ns)->addr[0].ip.sa_family = AF_UNSPEC; \
-       (ns)->reputation = 0; \
-       (ns)->score = KR_NS_MAX_SCORE + 1; \
-} while (0)
-
-int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx)
-{
-       if (!qry || !ctx) {
-               //assert(!EINVAL);
-               return kr_error(EINVAL);
-       }
-
-       // First we dump the nsset into a temporary array
-       const int nsset_len = trie_weight(qry->zone_cut.nsset);
-       struct {
-               const knot_dname_t *name;
-               const pack_t *addrs;
-       } nsset[nsset_len];
-
-       trie_it_t *it;
-       int i = 0;
-       for (it = trie_it_begin(qry->zone_cut.nsset); !trie_it_finished(it);
-                                                       trie_it_next(it), ++i) {
-               /* we trust it's a correct dname */
-               nsset[i].name = (const knot_dname_t *)trie_it_key(it, NULL);
-               nsset[i].addrs = (const pack_t *)*trie_it_val(it);
-       }
-       trie_it_free(it);
-       assert(i == nsset_len);
-
-       // Now we sort it randomly, by select-sort.
-       for (i = 0; i < nsset_len - 1; ++i) {
-               // The winner for position i will be uniformly chosen from indices >= i
-               const int j = i + kr_rand_bytes(1) % (nsset_len - i);
-               // Now we swap the winner with index i
-               if (i == j) continue;
-               __typeof__((nsset[i])) tmp = nsset[i];
-               nsset[i] = nsset[j];
-               nsset[j] = tmp;
-       }
-
-       // Finally we run the original algorithm, in this randomized order.
-       struct kr_nsrep *ns = &qry->ns;
-       ELECT_INIT(ns, ctx);
-       int ret = kr_ok();
-       for (i = 0; i < nsset_len; ++i) {
-               ret = eval_nsrep(nsset[i].name, nsset[i].addrs, qry);
-               if (ret) break;
-       }
-
-       if (qry->ns.score <= KR_NS_MAX_SCORE && qry->ns.score >= KR_NS_LONG) {
-               /* This is a low-reliability probe,
-                * go with TCP to get ICMP reachability check. */
-               qry->flags.TCP = true;
-       }
-       return ret;
-}
-
-int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx)
-{
-       if (!qry || !ctx) {
-               //assert(!EINVAL);
-               return kr_error(EINVAL);
-       }
-
-       /* Get address list for this NS */
-       struct kr_nsrep *ns = &qry->ns;
-       ELECT_INIT(ns, ctx);
-       pack_t *addr_set = kr_zonecut_find(&qry->zone_cut, ns->name);
-       if (!addr_set) {
-               return kr_error(ENOENT);
-       }
-       /* Evaluate addr list */
-       uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
-       unsigned score = eval_addr_set(addr_set, ctx, qry->flags, ns->score, addr_choice);
-       update_nsrep_set(ns, ns->name, addr_choice, score);
-       return kr_ok();
-}
-
-#undef ELECT_INIT
-
-int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
-                       unsigned score, kr_nsrep_rtt_lru_t *cache, int umode)
-{
-       if (!cache || umode > KR_NS_MAX || umode < 0) {
-               return kr_error(EINVAL);
-       }
-
-       /* Get `addr`, and later its raw string. */
-       if (addr) {
-               /* Caller provided specific address, OK. */
-       } else if (ns != NULL) {
-               addr = &ns->addr[0].ip;
-       } else {
-               assert(false && "kr_nsrep_update_rtt: don't know what address to update");
-               return kr_error(EINVAL);
-       }
-       const char *addr_in = kr_inaddr(addr);
-       size_t addr_len = kr_inaddr_len(addr);
-       if (!addr_in || addr_len <= 0) {
-               assert(false && "kr_nsrep_update_rtt: incorrect address");
-               return kr_error(EINVAL);
-       }
-
-       bool is_new_entry = false;
-       kr_nsrep_rtt_lru_entry_t  *cur = lru_get_new(cache, addr_in, addr_len,
-                                                    (&is_new_entry));
-       if (!cur) {
-               return kr_ok();
-       }
-       if (score <= KR_NS_GLUED) {
-               score = KR_NS_GLUED + 1;
-       }
-       /* If there's nothing to update, we reset it unless KR_NS_UPDATE_NORESET
-        * mode was requested.  New items are zeroed by LRU automatically. */
-       if (is_new_entry && umode != KR_NS_UPDATE_NORESET) {
-               umode = KR_NS_RESET;
-       }
-       unsigned new_score = 0;
-       /* Update score, by default smooth over last two measurements. */
-       switch (umode) {
-       case KR_NS_UPDATE:
-       case KR_NS_UPDATE_NORESET:
-               new_score = (cur->score + score) / 2; break;
-       case KR_NS_RESET:  new_score = score; break;
-       case KR_NS_ADD:    new_score = MIN(KR_NS_MAX_SCORE - 1, cur->score + score); break;
-       case KR_NS_MAX:    new_score = MAX(cur->score, score); break;
-       default:           return kr_error(EINVAL);
-       }
-       /* Score limits */
-       if (new_score > KR_NS_MAX_SCORE) {
-               new_score = KR_NS_MAX_SCORE;
-       }
-       if (new_score >= KR_NS_TIMEOUT && cur->score < KR_NS_TIMEOUT) {
-               /* Set the timestamp only when NS became "timeouted" */
-               cur->tout_timestamp = kr_now();
-       }
-       cur->score = new_score;
-       return kr_ok();
-}
-
-int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache)
-{
-       if (!ns || !cache ) {
-               return kr_error(EINVAL);
-       }
-
-       /* Store in the struct */
-       ns->reputation = reputation;
-       /* Store reputation in the LRU cache */
-       unsigned *cur = lru_get_new(cache, (const char *)ns->name,
-                                   knot_dname_size(ns->name), NULL);
-       if (cur) {
-               *cur = reputation;
-       }
-       return kr_ok();
-}
-
-int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src)
-{
-       if (!dst || !src ) {
-               return kr_error(EINVAL);
-       }
-
-       memcpy(dst, src, sizeof(struct kr_nsrep));
-       dst->name = (const uint8_t *)"";
-       dst->score = KR_NS_UNKNOWN;
-       dst->reputation = 0;
-
-       return kr_ok();
-}
-
-int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx)
-{
-       if (!ns || !ctx) {
-               assert(false);
-               return kr_error(EINVAL);
-       }
-
-       kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
-
-       ns->reputation = 0;
-       ns->score = KR_NS_MAX_SCORE + 1;
-
-       if (ns->addr[0].ip.sa_family == AF_UNSPEC) {
-               return kr_error(EINVAL);
-       }
-
-       /* Compute the scores.  Unfortunately there's no space for scores
-        * along the addresses. */
-       unsigned scores[KR_NSREP_MAXADDR];
-       int i;
-       bool timeouted_address_is_already_selected = false;
-       for (i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               const struct sockaddr *sa = &ns->addr[i].ip;
-               if (sa->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = lru_get_try(rtt_cache,
-                                                                       kr_inaddr(sa),
-                                                                       kr_family_len(sa->sa_family));
-               if (!rtt_cache_entry) {
-                       scores[i] = 1; /* prefer unknown to probe RTT */
-               } else if (rtt_cache_entry->score < KR_NS_FWD_TIMEOUT) {
-                       /* some probability to bump bad ones up for re-probe */
-                       scores[i] = rtt_cache_entry->score;
-                       /* The lower the rtt, the more likely it will be selected. */
-                       if (!kr_rand_coin(rtt_cache_entry->score, KR_NS_FWD_TIMEOUT)) {
-                               scores[i] = 1;
-                       }
-               } else {
-                       uint64_t now = kr_now();
-                       uint64_t elapsed = now - rtt_cache_entry->tout_timestamp;
-                       scores[i] = KR_NS_MAX_SCORE + 1;
-                       elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-                       if (elapsed > ctx->cache_rtt_tout_retry_interval &&
-                           !timeouted_address_is_already_selected) {
-                               scores[i] = 1;
-                               rtt_cache_entry->tout_timestamp = now;
-                               timeouted_address_is_already_selected = true;
-                       }
-               }
-
-               /* Give advantage to IPv6. */
-               if (scores[i] <= KR_NS_MAX_SCORE && sa->sa_family == AF_INET) {
-                       scores[i] += FAVOUR_IPV6;
-               }
-
-               if (VERBOSE_STATUS) {
-                       kr_log_verbose("[     ][nsre] score %d for %s;\t cached RTT: %d\n",
-                                       scores[i], kr_straddr(sa),
-                                       rtt_cache_entry ? rtt_cache_entry->score : -1);
-               }
-       }
-
-       /* Select-sort the addresses. */
-       const int count = i;
-       for (i = 0; i < count - 1; ++i) {
-               /* find min from i onwards */
-               int min_i = i;
-               for (int j = i + 1; j < count; ++j) {
-                       if (scores[j] < scores[min_i]) {
-                               min_i = j;
-                       }
-               }
-               /* swap the indices */
-               if (min_i != i) {
-                       SWAP(scores[min_i], scores[i]);
-                       SWAP(ns->addr[min_i], ns->addr[i]);
-               }
-       }
-
-       if (count > 0) {
-               ns->score = scores[0];
-               ns->reputation = 0;
-       }
-
-       return kr_ok();
-}
diff --git a/lib/nsrep.h b/lib/nsrep.h
deleted file mode 100644 (file)
index 57aecc8..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
- *  SPDX-License-Identifier: GPL-3.0-or-later
- */
-
-#pragma once
-
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <libknot/dname.h>
-#include <limits.h>
-
-#include "lib/defines.h"
-#include "lib/generic/lru.h"
-
-struct kr_query;
-
-/**
-  * NS RTT score (special values).
-  * @note RTT is measured in milliseconds.
-  */
-enum kr_ns_score {
-       KR_NS_MAX_SCORE     = 20 * KR_CONN_RTT_MAX, /* max possible value */
-       KR_NS_FWD_TIMEOUT   = (95 * 10000) / 100, /* timeout for upstream recursor,
-                                                  * 95 percents from max resolution time */
-       KR_NS_TIMEOUT       = (95 * KR_CONN_RTT_MAX) / 100, /* timeout for upstream auth */
-       KR_NS_LONG          = (3 * KR_NS_TIMEOUT) / 4,
-       KR_NS_UNKNOWN       = KR_NS_TIMEOUT / 2,
-       KR_NS_PENALTY       = 100,
-       KR_NS_GLUED         = 10
-};
-
-/**
- *  See kr_nsrep_update_rtt()
- */
-#define KR_NS_DEAD (((KR_NS_TIMEOUT * 4) + 3) / 3)
-#define KR_NS_FWD_DEAD (((KR_NS_FWD_TIMEOUT * 4) + 3) / 3)
-
-/** If once NS was marked as "timeouted", it won't participate in NS elections
- * at least KR_NS_TIMEOUT_RETRY_INTERVAL milliseconds (now: one second). */
-#define KR_NS_TIMEOUT_RETRY_INTERVAL 1000
-
-/**
- * NS QoS flags.
- */
-enum kr_ns_rep {
-       KR_NS_NOIP4  = 1 << 0, /**< NS has no IPv4 */
-       KR_NS_NOIP6  = 1 << 1, /**< NS has no IPv6 */
-       KR_NS_NOEDNS = 1 << 2  /**< NS has no EDNS support */
-};
-
-/**
- * NS RTT update modes.
- * First update is always KR_NS_RESET unless
- * KR_NS_UPDATE_NORESET mode had choosen.
- */
-enum kr_ns_update_mode {
-       KR_NS_UPDATE = 0,     /**< Update as smooth over last two measurements */
-       KR_NS_UPDATE_NORESET, /**< Same as KR_NS_UPDATE, but disable fallback to
-                              *   KR_NS_RESET on newly added entries.
-                              *   Zero is used as initial value. */
-       KR_NS_RESET,          /**< Set to given value */
-       KR_NS_ADD,            /**< Increment current value */
-       KR_NS_MAX             /**< Set to maximum of current/proposed value. */
-};
-
-struct kr_nsrep_rtt_lru_entry {
-       unsigned score;           /* combined rtt */
-       uint64_t tout_timestamp;  /* The time when score became
-                                  * greater or equal then KR_NS_TIMEOUT.
-                                  * Is meaningful only when score >= KR_NS_TIMEOUT */
-};
-
-typedef struct kr_nsrep_rtt_lru_entry kr_nsrep_rtt_lru_entry_t;
-
-/**
- * NS QoS tracking.
- */
-typedef lru_t(kr_nsrep_rtt_lru_entry_t) kr_nsrep_rtt_lru_t;
-
-/**
- * NS reputation tracking.
- */
-typedef lru_t(unsigned) kr_nsrep_lru_t;
-
-/* Maximum count of addresses probed in one go (last is left empty) */
-#define KR_NSREP_MAXADDR 4
-
-/**
- * Name server representation.
- * Contains extra information about the name server, e.g. score
- * or other metadata.
- */
-struct kr_nsrep
-{
-       unsigned score;                  /**< NS score */
-       unsigned reputation;             /**< NS reputation */
-       const knot_dname_t *name;        /**< NS name */
-       struct kr_context *ctx;          /**< Resolution context */
-       union inaddr addr[KR_NSREP_MAXADDR];        /**< NS address(es) */
-};
-
-/**
- * Set given NS address.  (Very low-level access to the list.)
- * @param  qry      updated query
- * @param  index    index of the updated target
- * @param  sock     socket address to use (sockaddr_in or sockaddr_in6 or NULL)
- * @return          0 or an error code, in particular kr_error(ENOENT) for net.ipvX
- */
-KR_EXPORT
-int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock);
-
-/**
- * Elect best nameserver/address pair from the nsset.
- * @param  qry          updated query
- * @param  ctx          resolution context
- * @return              0 or an error code
- */
-KR_EXPORT
-int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx);
-
-/**
- * Elect best nameserver/address pair from the nsset.
- * @param  qry          updated query
- * @param  ctx          resolution context
- * @return              0 or an error code
- */
-KR_EXPORT
-int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx);
-
-/**
- * Update NS address RTT information.
- *
- * @brief In KR_NS_UPDATE mode reputation is smoothed over last N measurements.
- * 
- * @param  ns           updated NS representation
- * @param  addr         chosen address (NULL for first)
- * @param  score        new score (i.e. RTT), see enum kr_ns_score
- * @param  cache        RTT LRU cache
- * @param  umode        update mode (KR_NS_UPDATE or KR_NS_RESET or KR_NS_ADD)
- * @return              0 on success, error code on failure
- */
-KR_EXPORT
-int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
-                       unsigned score, kr_nsrep_rtt_lru_t *cache, int umode);
-
-/**
- * Update NSSET reputation information.
- * 
- * @param  ns           updated NS representation
- * @param  reputation   combined reputation flags, see enum kr_ns_rep
- * @param  cache        LRU cache
- * @return              0 on success, error code on failure
- */
-KR_EXPORT
-int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache);
-/**
- * Copy NSSET reputation information and resets score.
- *
- * @param  dst          updated NS representation
- * @param  src          source NS representation
- * @return              0 on success, error code on failure
- */
-int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src);
-
-/**
- * Sort addresses in the query nsrep list by cached RTT.
- * if RTT is greater then KR_NS_TIMEOUT, address will placed at the beginning of the
- * nsrep list once in cache.ns_tout() milliseconds. Otherwise it will be sorted
- * as if it has cached RTT equal to KR_NS_MAX_SCORE + 1.
- * @param  ns           updated kr_nsrep
- * @param  ctx          name resolution context.
- * @return              0 or an error code
- * @note   ns reputation is zeroed and score is set to KR_NS_MAX_SCORE + 1.
- */
-KR_EXPORT
-int kr_nsrep_sort(struct kr_nsrep *ns,  struct kr_context *ctx);
index 79438b26e38a6749d03b90e723ffa57f7c483888..50ea4ac4960e928f32c709349cd3b152f27fa5f2 100644 (file)
@@ -11,6 +11,7 @@
 #include <libknot/rrtype/rdname.h>
 #include <libknot/descriptor.h>
 #include <ucw/mempool.h>
+#include <sys/socket.h>
 #include "kresconfig.h"
 #include "lib/resolve.h"
 #include "lib/layer.h"
@@ -147,7 +148,7 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret
                return;
        }
        assert(qname);
-       const int len = knot_dname_size(qname) - 2; /* Skip first, last label. */
+       const int len = knot_dname_size(qname) - 2; /* Skip the first and last byte: the first is a label length, the last is always the root label. */
        for (int i = 0; i < len; ++i) {
                /* Note: this relies on the fact that correct label lengths
                 * can't pass the isletter() test (by "luck"). */
@@ -157,23 +158,6 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret
        }
 }
 
-/** Invalidate current NS/addr pair. */
-static int invalidate_ns(struct kr_rplan *rplan, struct kr_query *qry)
-{
-       if (qry->ns.addr[0].ip.sa_family != AF_UNSPEC) {
-               const char *addr = kr_inaddr(&qry->ns.addr[0].ip);
-               int addr_len = kr_inaddr_len(&qry->ns.addr[0].ip);
-               int ret = kr_zonecut_del(&qry->zone_cut, qry->ns.name, addr, addr_len);
-               /* Also remove it from the qry->ns.addr array.
-                * That's useful at least for STUB and FORWARD modes. */
-               memmove(qry->ns.addr, qry->ns.addr + 1,
-                       sizeof(qry->ns.addr[0]) * (KR_NSREP_MAXADDR - 1));
-               return ret;
-       } else {
-               return kr_zonecut_del_all(&qry->zone_cut, qry->ns.name);
-       }
-}
-
 /** This turns of QNAME minimisation if there is a non-terminal between current zone cut, and name target.
  *  It save several minimization steps, as the zone cut is likely final one.
  */
@@ -310,71 +294,6 @@ static int ns_fetch_cut(struct kr_query *qry, const knot_dname_t *requested_name
        return KR_STATE_PRODUCE;
 }
 
-static int ns_resolve_addr(struct kr_query *qry, struct kr_request *req)
-{
-       struct kr_rplan *rplan = &req->rplan;
-       struct kr_context *ctx = req->ctx;
-
-
-       /* Start NS queries from root, to avoid certain cases
-        * where a NS drops out of cache and the rest is unavailable,
-        * this would lead to dependency loop in current zone cut.
-        * Prefer IPv6 and continue with IPv4 if not available.
-        */
-       uint16_t next_type = 0;
-       if (!(qry->flags.AWAIT_IPV6) &&
-           !(ctx->options.NO_IPV6)) {
-               next_type = KNOT_RRTYPE_AAAA;
-               qry->flags.AWAIT_IPV6 = true;
-       } else if (!(qry->flags.AWAIT_IPV4) &&
-                  !(ctx->options.NO_IPV4)) {
-               next_type = KNOT_RRTYPE_A;
-               qry->flags.AWAIT_IPV4 = true;
-               /* Hmm, no useable IPv6 then. */
-               qry->ns.reputation |= KR_NS_NOIP6;
-               kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep);
-       }
-       /* Bail out if the query is already pending or dependency loop. */
-       if (!next_type || kr_rplan_satisfies(qry->parent, qry->ns.name, KNOT_CLASS_IN, next_type)) {
-               /* Fall back to SBELT if root server query fails. */
-               if (!next_type && qry->zone_cut.name[0] == '\0') {
-                       VERBOSE_MSG(qry, "=> fallback to root hints\n");
-                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut);
-                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
-                       return kr_error(EAGAIN);
-               }
-               /* No IPv4 nor IPv6, flag server as unusable. */
-               ++req->count_no_nsaddr;
-               VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out (counter: %u)\n",
-                               req->count_no_nsaddr);
-               qry->ns.reputation |= KR_NS_NOIP4 | KR_NS_NOIP6;
-               kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep);
-               invalidate_ns(rplan, qry);
-               return kr_error(EHOSTUNREACH);
-       }
-       /* Push new query to the resolution plan */
-       struct kr_query *next =
-               kr_rplan_push(rplan, qry, qry->ns.name, KNOT_CLASS_IN, next_type);
-       if (!next) {
-               return kr_error(ENOMEM);
-       }
-       next->flags.NONAUTH = true;
-
-       /* At the root level with no NS addresses, add SBELT subrequest. */
-       int ret = 0;
-       if (qry->zone_cut.name[0] == '\0') {
-               ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut);
-               if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */
-                       kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut);
-                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */
-                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
-               }
-       } else {
-               next->flags.AWAIT_CUT = true;
-       }
-       return ret;
-}
-
 static int edns_put(knot_pkt_t *pkt, bool reclaim)
 {
        if (!pkt->opt_rr) {
@@ -776,7 +695,7 @@ static int resolve_query(struct kr_request *request, const knot_pkt_t *packet)
        return request->state;
 }
 
-knot_pkt_t * kr_request_ensure_answer(struct kr_request *request)
+knot_pkt_t *kr_request_ensure_answer(struct kr_request *request)
 {
        if (request->answer)
                return request->answer;
@@ -839,84 +758,6 @@ enomem:
        return request->answer = NULL;
 }
 
-KR_PURE static bool kr_inaddr_equal(const struct sockaddr *a, const struct sockaddr *b)
-{
-       const int a_len = kr_inaddr_len(a);
-       const int b_len = kr_inaddr_len(b);
-       return a_len == b_len && memcmp(kr_inaddr(a), kr_inaddr(b), a_len) == 0;
-}
-
-static void update_nslist_rtt(struct kr_context *ctx, struct kr_query *qry, const struct sockaddr *src)
-{
-       /* Do not track in safe mode. */
-       if (qry->flags.SAFEMODE) {
-               return;
-       }
-
-       /* Calculate total resolution time from the time the query was generated. */
-       uint64_t elapsed = kr_now() - qry->timestamp_mono;
-       elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-
-       /* NSs in the preference list prior to the one who responded will be penalised
-        * with the RETRY timer interval. This is because we know they didn't respond
-        * for N retries, so their RTT must be at least N * RETRY.
-        * The NS in the preference list that responded will have RTT relative to the
-        * time when the query was sent out, not when it was originated.
-        */
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               const struct sockaddr *addr = &qry->ns.addr[i].ip;
-               if (addr->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               /* If this address is the source of the answer, update its RTT */
-               if (kr_inaddr_equal(src, addr)) {
-                       kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_UPDATE);
-                       WITH_VERBOSE(qry) {
-                               char addr_str[INET6_ADDRSTRLEN];
-                               inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str));
-                               VERBOSE_MSG(qry, "<= server: '%s' rtt: %"PRIu64" ms\n",
-                                               addr_str, elapsed);
-                       }
-               } else {
-                       /* Response didn't come from this IP, but we know the RTT must be at least
-                        * several RETRY timer tries, e.g. if we have addresses [a, b, c] and we have
-                        * tried [a, b] when the answer from 'a' came after 350ms, then we know
-                        * that 'b' didn't respond for at least 350 - (1 * 300) ms. We can't say that
-                        * its RTT is 50ms, but we can say that its score shouldn't be less than 50. */
-                        kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_MAX);
-                        WITH_VERBOSE(qry) {
-                               char addr_str[INET6_ADDRSTRLEN];
-                               inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str));
-                               VERBOSE_MSG(qry, "<= server: '%s' rtt: >= %"PRIu64" ms\n",
-                                               addr_str, elapsed);
-                        }
-               }
-               /* Subtract query start time from elapsed time */
-               if (elapsed < KR_CONN_RETRY) {
-                       break;
-               }
-               elapsed = elapsed - KR_CONN_RETRY;
-       }
-}
-
-static void update_nslist_score(struct kr_request *request, struct kr_query *qry, const struct sockaddr *src, knot_pkt_t *packet)
-{
-       struct kr_context *ctx = request->ctx;
-       /* On successful answer, update preference list RTT and penalise timer  */
-       if (!(request->state & KR_STATE_FAIL)) {
-               /* Update RTT information for preference list */
-               update_nslist_rtt(ctx, qry, src);
-               /* Do not complete NS address resolution on soft-fail. */
-               const int rcode = packet ? knot_wire_get_rcode(packet->wire) : 0;
-               if (rcode != KNOT_RCODE_SERVFAIL && rcode != KNOT_RCODE_REFUSED) {
-                       qry->flags.AWAIT_IPV6 = false;
-                       qry->flags.AWAIT_IPV4 = false;
-               } else { /* Penalize SERVFAILs. */
-                       kr_nsrep_update_rtt(&qry->ns, src, KR_NS_PENALTY, ctx->cache_rtt, KR_NS_ADD);
-               }
-       }
-}
-
 static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now)
 {
        uint64_t resolving_time = now - qry->creation_time_mono;
@@ -929,7 +770,7 @@ static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now)
        return false;
 }
 
-int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet)
+int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet)
 {
        struct kr_rplan *rplan = &request->rplan;
 
@@ -946,11 +787,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
        }
        bool tried_tcp = (qry->flags.TCP);
        if (!packet || packet->size == 0) {
-               if (tried_tcp) {
-                       request->state = KR_STATE_FAIL;
-               } else {
-                       qry->flags.TCP = true;
-               }
+               return KR_STATE_PRODUCE;
        } else {
                /* Packet cleared, derandomize QNAME. */
                knot_dname_t *qname_raw = knot_pkt_qname(packet);
@@ -963,25 +800,29 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
                } else {
                        /* Fill in source and latency information. */
                        request->upstream.rtt = kr_now() - qry->timestamp_mono;
-                       request->upstream.addr = src;
+                       request->upstream.transport = transport ? *transport : NULL;
                        ITERATE_LAYERS(request, qry, consume, packet);
                        /* Clear temporary information */
-                       request->upstream.addr = NULL;
+                       request->upstream.transport = NULL;
                        request->upstream.rtt = 0;
                }
        }
 
-       /* Track RTT for iterative answers */
-       if (src && !(qry->flags.CACHED)) {
-               update_nslist_score(request, qry, src, packet);
+       if (transport && !qry->flags.CACHED) {
+               if (!(request->state & KR_STATE_FAIL)) {
+                       /* Do not complete NS address resolution on soft-fail. */
+                       const int rcode = packet ? knot_wire_get_rcode(packet->wire) : 0;
+                       if (rcode != KNOT_RCODE_SERVFAIL && rcode != KNOT_RCODE_REFUSED) {
+                               qry->flags.AWAIT_IPV6 = false;
+                               qry->flags.AWAIT_IPV4 = false;
+                       }
+               }
        }
-       /* Resolution failed, invalidate current NS. */
+
        if (request->state & KR_STATE_FAIL) {
-               invalidate_ns(rplan, qry);
                qry->flags.RESOLVED = false;
        }
 
-       /* For multiple errors in a row; invalidate_ns() is not enough. */
        if (!qry->flags.CACHED) {
                if (request->state & KR_STATE_FAIL) {
                        if (++request->count_fail_row > KR_CONSUME_FAIL_ROW_LIMIT) {
@@ -1016,7 +857,12 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
 
        /* Do not finish with bogus answer. */
        if (qry->flags.DNSSEC_BOGUS)  {
-               return KR_STATE_FAIL;
+               if (qry->flags.FORWARD || qry->flags.STUB) {
+                       return KR_STATE_FAIL;
+               }
+               /* Other servers might not have broken DNSSEC. */
+               qry->flags.DNSSEC_BOGUS = false;
+               return KR_STATE_PRODUCE;
        }
 
        return kr_rplan_empty(&request->rplan) ? KR_STATE_DONE : KR_STATE_PRODUCE;
@@ -1368,17 +1214,81 @@ static int zone_cut_check(struct kr_request *request, struct kr_query *qry, knot
        return trust_chain_check(request, qry);
 }
 
-int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet)
+
+static int ns_resolve_addr(struct kr_query *qry, struct kr_request *param, struct kr_transport *transport, uint16_t next_type)
+{
+       struct kr_rplan *rplan = &param->rplan;
+       struct kr_context *ctx = param->ctx;
+
+
+       /* Start NS queries from root, to avoid certain cases
+        * where an NS drops out of cache and the rest is unavailable;
+        * that would lead to a dependency loop in the current zone cut.
+        */
+
+       /* Bail out if the query is already pending or would form a dependency loop. */
+       if (!next_type || kr_rplan_satisfies(qry->parent, transport->ns_name, KNOT_CLASS_IN, next_type)) {
+               /* Fall back to SBELT if root server query fails. */
+               if (!next_type && qry->zone_cut.name[0] == '\0') {
+                       VERBOSE_MSG(qry, "=> fallback to root hints\n");
+                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut);
+                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
+                       return kr_error(EAGAIN);
+               }
+               /* Neither an IPv4 nor an IPv6 address can be obtained; drop this NS from the zone cut. */
+               VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out\n");
+               kr_zonecut_del_all(&qry->zone_cut, transport->ns_name);
+               return kr_error(EHOSTUNREACH);
+       }
+       /* Push new query to the resolution plan */
+       struct kr_query *next =
+               kr_rplan_push(rplan, qry, transport->ns_name, KNOT_CLASS_IN, next_type);
+       if (!next) {
+               return kr_error(ENOMEM);
+       }
+       next->flags.NONAUTH = true;
+
+       /* At the root level with no NS addresses, add SBELT subrequest. */
+       int ret = 0;
+       if (qry->zone_cut.name[0] == '\0') {
+               ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut);
+               if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */
+                       kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut);
+                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */
+                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
+               }
+       } else {
+               next->flags.AWAIT_CUT = true;
+       }
+
+       if (ret == 0) {
+               if (next_type == KNOT_RRTYPE_AAAA) {
+                       qry->flags.AWAIT_IPV6 = true;
+               } else {
+                       qry->flags.AWAIT_IPV4 = true;
+               }       
+       }
+
+       return ret;
+}
+
+int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet)
 {
        struct kr_rplan *rplan = &request->rplan;
-       unsigned ns_election_iter = 0;
 
        /* No query left for resolution */
        if (kr_rplan_empty(rplan)) {
                return KR_STATE_FAIL;
        }
-       /* If we have deferred answers, resume them. */
+
        struct kr_query *qry = array_tail(rplan->pending);
+
+       /* Initialize server selection */
+       if (!qry->server_selection.initialized) {
+               kr_server_selection_init(qry);
+       }
+
+       /* If we have deferred answers, resume them. */
        if (qry->deferred != NULL) {
                /* @todo: Refactoring validator, check trust chain before resuming. */
                int state = 0;
@@ -1456,70 +1366,42 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t
                }
        }
 
-ns_election:
-
-       if (unlikely(request->count_no_nsaddr >= KR_COUNT_NO_NSADDR_LIMIT)) {
-               VERBOSE_MSG(qry, "=> too many unresolvable NSs, bail out "
-                               "(mitigation for NXNSAttack CVE-2020-12667)\n");
-               return KR_STATE_FAIL;
-       }
-       /* If the query has already selected a NS and is waiting for IPv4/IPv6 record,
-        * elect best address only, otherwise elect a completely new NS.
-        */
-       if(++ns_election_iter >= KR_ITER_LIMIT) {
-               VERBOSE_MSG(qry, "=> couldn't converge NS selection, bail out\n");
-               return KR_STATE_FAIL;
-       }
 
        const struct kr_qflags qflg = qry->flags;
        const bool retry = qflg.TCP || qflg.BADCOOKIE_AGAIN;
-       if (qflg.AWAIT_IPV4 || qflg.AWAIT_IPV6) {
-               kr_nsrep_elect_addr(qry, request->ctx);
-       } else if (qflg.FORWARD || qflg.STUB) {
-               kr_nsrep_sort(&qry->ns, request->ctx);
-               if (qry->ns.score > KR_NS_MAX_SCORE) {
-                       /* At the moment all NS have bad reputation.
-                        * But there can be existing connections*/
-                       VERBOSE_MSG(qry, "=> no valid NS left\n");
-                       return KR_STATE_FAIL;
-               }
-       } else if (!qry->ns.name || !retry) { /* Keep NS when requerying/stub/badcookie. */
+       if (!qflg.FORWARD && !qflg.STUB && !retry) { /* Keep NS when requerying/stub/badcookie. */
                /* Root DNSKEY must be fetched from the hints to avoid chicken and egg problem. */
                if (qry->sname[0] == '\0' && qry->stype == KNOT_RRTYPE_DNSKEY) {
                        kr_zonecut_set_sbelt(request->ctx, &qry->zone_cut);
                        qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
                }
-               kr_nsrep_elect(qry, request->ctx);
-               if (qry->ns.score > KR_NS_MAX_SCORE) {
-                       if (kr_zonecut_is_empty(&qry->zone_cut)) {
-                               VERBOSE_MSG(qry, "=> no NS with an address\n");
-                       } else {
-                               VERBOSE_MSG(qry, "=> no valid NS left\n");
-                       }
-                       if (!qry->flags.NO_NS_FOUND) {
-                               qry->flags.NO_NS_FOUND = true;
-                       } else {
-                               ITERATE_LAYERS(request, qry, reset);
-                               kr_rplan_pop(rplan, qry);
-                       }
-                       return KR_STATE_PRODUCE;
-               }
        }
 
-       /* Resolve address records */
-       if (qry->ns.addr[0].ip.sa_family == AF_UNSPEC) {
-               int ret = ns_resolve_addr(qry, request);
-               if (ret != 0) {
-                       qry->flags.AWAIT_IPV6 = false;
-                       qry->flags.AWAIT_IPV4 = false;
-                       qry->flags.TCP = false;
-                       qry->ns.name = NULL;
-                       goto ns_election; /* Must try different NS */
+       qry->server_selection.choose_transport(qry, transport);
+
+       if (*transport == NULL) {
+               /* Properly signal to serve_stale module. */
+               if (qry->flags.NO_NS_FOUND) {
+                       ITERATE_LAYERS(request, qry, reset);
+                       kr_rplan_pop(rplan, qry);
+               } else {
+                       /* FIXME: This is probably quite inefficient:
+                       * we go through the whole qr_task_step loop just because of the serve_stale
+                       * module which might not even be loaded. */
+                       qry->flags.NO_NS_FOUND = true;
                }
+               return KR_STATE_PRODUCE;
+       }
+
+       if ((*transport)->protocol == KR_TRANSPORT_RESOLVE_A || (*transport)->protocol == KR_TRANSPORT_RESOLVE_AAAA) {
+               uint16_t type = (*transport)->protocol == KR_TRANSPORT_RESOLVE_A ? KNOT_RRTYPE_A : KNOT_RRTYPE_AAAA;
+               ns_resolve_addr(qry, qry->request, *transport, type);
                ITERATE_LAYERS(request, qry, reset);
                return KR_STATE_PRODUCE;
        }
 
+       qry->flags.SAFEMODE = qry->flags.SAFEMODE || (*transport)->safe_mode;
+
        /* Randomize query case (if not in safe mode or turned off) */
        qry->secret = (qry->flags.SAFEMODE || qry->flags.NO_0X20)
                        ? 0 : kr_rand_bytes(sizeof(qry->secret));
@@ -1531,8 +1413,6 @@ ns_election:
         * kr_resolve_checkout().
         */
        qry->timestamp_mono = kr_now();
-       *dst = &qry->ns.addr[0].ip;
-       *type = (qry->flags.TCP) ? SOCK_STREAM : SOCK_DGRAM;
        return request->state;
 }
 
@@ -1569,7 +1449,7 @@ static bool outbound_request_update_cookies(struct kr_request *req,
 #endif /* ENABLE_COOKIES */
 
 int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
-                        struct sockaddr *dst, int type, knot_pkt_t *packet)
+                        struct kr_transport *transport, knot_pkt_t *packet)
 {
        /* @todo: Update documentation if this function becomes approved. */
 
@@ -1593,7 +1473,7 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
                 * actual cookie. If we don't know the server address then we
                 * also don't know the actual cookie size.
                 */
-               if (!outbound_request_update_cookies(request, src, dst)) {
+               if (!outbound_request_update_cookies(request, src, &transport->address.ip)) {
                        return kr_error(EINVAL);
                }
        }
@@ -1610,8 +1490,20 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
        /* Run the checkout layers and cancel on failure.
         * The checkout layer doesn't persist the state, so canceled subrequests
         * don't affect the resolution or rest of the processing. */
+       int type = -1;
+       switch(transport->protocol) {
+       case KR_TRANSPORT_UDP:
+               type = SOCK_DGRAM;
+               break;
+       case KR_TRANSPORT_TCP:
+       case KR_TRANSPORT_TLS:
+               type = SOCK_STREAM;
+               break;
+       default:
+               assert(0);
+       }
        int state = request->state;
-       ITERATE_LAYERS(request, qry, checkout, packet, dst, type);
+       ITERATE_LAYERS(request, qry, checkout, packet, &transport->address.ip, type);
        if (request->state & KR_STATE_FAIL) {
                request->state = state; /* Restore */
                return kr_error(ECANCELED);
@@ -1634,26 +1526,17 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
        WITH_VERBOSE(qry) {
 
        KR_DNAME_GET_STR(qname_str, knot_pkt_qname(packet));
+       KR_DNAME_GET_STR(ns_name, transport->ns_name);
        KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
        KR_RRTYPE_GET_STR(type_str, knot_pkt_qtype(packet));
+       const char *ns_str = kr_straddr(&transport->address.ip);
 
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               struct sockaddr *addr = &qry->ns.addr[i].ip;
-               if (addr->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               if (!kr_inaddr_equal(dst, addr)) {
-                       continue;
-               }
-               const char *ns_str = kr_straddr(addr);
-               VERBOSE_MSG(qry,
-                       "=> id: '%05u' querying: '%s' score: %u zone cut: '%s' "
+       VERBOSE_MSG(qry,
+                       "=> id: '%05u' querying: '%s'@'%s' zone cut: '%s' "
                        "qname: '%s' qtype: '%s' proto: '%s'\n",
-                       qry->id, ns_str ? ns_str : "", qry->ns.score, zonecut_str,
+                       qry->id, ns_name, ns_str ? ns_str : "", zonecut_str,
                        qname_str, type_str, (qry->flags.TCP) ? "tcp" : "udp");
-
-               break;
-       }}
+       }
 
        return kr_ok();
 }
index 1095fa9ac46dc4951823d5a5c1e8b4ee0b152db1..e5855cc3eafb23e6b64dc6771f079ad6128ec7b4 100644 (file)
@@ -13,7 +13,7 @@
 #include "lib/layer.h"
 #include "lib/generic/map.h"
 #include "lib/generic/array.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/rplan.h"
 #include "lib/module.h"
 #include "lib/cache/api.h"
@@ -161,9 +161,7 @@ struct kr_context
        map_t negative_anchors;
        struct kr_zonecut root_hints;
        struct kr_cache cache;
-       kr_nsrep_rtt_lru_t *cache_rtt;
        unsigned cache_rtt_tout_retry_interval;
-       kr_nsrep_lru_t *cache_rep;
        module_array_t *modules;
        /* The cookie context structure should not be held within the cookies
         * module because of better access. */
@@ -182,6 +180,10 @@ struct kr_request_qsource_flags {
        bool xdp:1; /**< true if the request is on AF_XDP; only meaningful if (dst_addr). */
 };
 
+typedef bool (*addr_info_f)(struct sockaddr*);
+typedef void (*async_resolution_f)(knot_dname_t*, enum knot_rr_type);
+typedef array_t(union inaddr) inaddr_array_t;
+
 /**
  * Name resolution request.
  *
@@ -210,7 +212,7 @@ struct kr_request {
        } qsource;
        struct {
                unsigned rtt;                  /**< Current upstream RTT */
-               const struct sockaddr *addr;   /**< Current upstream address */
+               const struct kr_transport *transport;   /**< Current upstream transport */
        } upstream;                        /**< Upstream information, valid only in consume() phase */
        struct kr_qflags options;
        int state;
@@ -235,6 +237,12 @@ struct kr_request {
        int vars_ref; /**< Reference to per-request variable table. LUA_NOREF if not set. */
        knot_mm_t pool;
        unsigned int uid; /**< for logging purposes only */
+       struct {
+               addr_info_f is_tls_capable;
+               addr_info_f is_tcp_connected;
+               addr_info_f is_tcp_waiting;
+               inaddr_array_t forwarding_targets; /**< When forwarding, possible targets are put here */
+       } selection_context;
        unsigned int count_no_nsaddr;
        unsigned int count_fail_row;
        alloc_wire_f alloc_wire_cb; /**< CB to allocate answer wire (can be NULL). */
@@ -281,7 +289,7 @@ knot_pkt_t * kr_request_ensure_answer(struct kr_request *request);
  * @return         any state
  */
 KR_EXPORT
-int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet);
+int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet);
 
 /**
  * Produce either next additional query or finish.
@@ -297,7 +305,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
  * @return         any state
  */
 KR_EXPORT
-int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet);
+int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet);
 
 /**
  * Finalises the outbound query packet with the knowledge of the IP addresses.
@@ -313,7 +321,7 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t
  */
 KR_EXPORT
 int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
-                        struct sockaddr *dst, int type, knot_pkt_t *packet);
+                        struct kr_transport *transport, knot_pkt_t *packet);
 
 /**
  * Finish resolution and commit results if the state is DONE.
@@ -343,4 +351,3 @@ struct kr_rplan *kr_resolve_plan(struct kr_request *request);
  */
 KR_EXPORT KR_PURE
 knot_mm_t *kr_resolve_pool(struct kr_request *request);
-
index 18dc6b8276c0b3186b3ebd130d881b8a643a4a72..02e32d3e469b5e54b1c09d189c8178785ad40806 100644 (file)
@@ -159,22 +159,13 @@ static struct kr_query *kr_rplan_push_query(struct kr_rplan *rplan,
        qry->flags = rplan->request->options;
        qry->parent = parent;
        qry->request = rplan->request;
-       qry->ns.ctx = rplan->request->ctx;
-       qry->ns.addr[0].ip.sa_family = AF_UNSPEC;
+
        gettimeofday(&qry->timestamp, NULL);
        qry->timestamp_mono = kr_now();
        qry->creation_time_mono = parent ? parent->creation_time_mono : qry->timestamp_mono;
        kr_zonecut_init(&qry->zone_cut, (const uint8_t *)"", rplan->pool);
        qry->reorder = qry->flags.REORDER_RR ? kr_rand_bytes(sizeof(qry->reorder)) : 0;
 
-       /* When forwarding, keep the nameserver addresses. */
-       if (parent && parent->flags.FORWARD && qry->flags.FORWARD) {
-               ret = kr_nsrep_copy_set(&qry->ns, &parent->ns);
-               if (ret) {
-                       query_free(rplan->pool, qry);
-                       return NULL;
-               }
-       }
 
        assert((rplan->pending.len == 0 && rplan->resolved.len == 0)
                == (rplan->initial == NULL));
index e69a3f86c689990b87d582913864c68dd6e07b10..c3d28a263d52717790d481cfe98355372db901ab 100644 (file)
@@ -8,9 +8,9 @@
 #include <libknot/dname.h>
 #include <libknot/codes.h>
 
+#include "lib/selection.h"
 #include "lib/cache/api.h"
 #include "lib/zonecut.h"
-#include "lib/nsrep.h"
 
 /** Query flags */
 struct kr_qflags {
@@ -101,8 +101,7 @@ struct kr_query {
        struct kr_query *cname_parent;
        struct kr_request *request; /**< Parent resolution request. */
        kr_stale_cb stale_cb; /**< See the type */
-       /* Beware: this must remain the last, because of lua bindings. */
-       struct kr_nsrep ns;
+       struct kr_server_selection server_selection;
 };
 
 /** @cond internal Array of queries. */
diff --git a/lib/selection.c b/lib/selection.c
new file mode 100644 (file)
index 0000000..b9997e9
--- /dev/null
@@ -0,0 +1,596 @@
+#include <libknot/dname.h>
+
+#include "lib/selection.h"
+#include "lib/selection_forward.h"
+#include "lib/selection_iter.h"
+#include "lib/generic/pack.h"
+#include "lib/generic/trie.h"
+#include "lib/rplan.h"
+#include "lib/cache/api.h"
+#include "lib/resolve.h"
+
+#include "daemon/worker.h"
+#include "daemon/tls.h"
+
+#include "lib/utils.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__)
+
+/** @internal Macro to set address structure. */
+#define ADDR_SET(sa, family, addr, len, port) do {\
+               memcpy(&sa ## _addr, (addr), (len)); \
+               sa ## _family = (family); \
+       sa ## _port = htons(port); \
+} while (0)
+
+#define DEFAULT_TIMEOUT 800
+#define MAX_TIMEOUT 10000
+#define MAX_BACKOFF 5
+#define MINIMAL_TIMEOUT_ADDITION 20
+
+/* After TCP_TIMEOUT_THRESHOLD timeouts on one transport, we'll switch to TCP. */
+#define TCP_TIMEOUT_THRESHOLD 2
+/* If the expected RTT is over TCP_RTT_THRESHOLD we switch to TCP instead. */
+#define TCP_RTT_THRESHOLD 2000
+
+/* Define ε for ε-greedy algorithm (see select_transport)
+ * as ε=EPSILON_NOMIN/EPSILON_DENOM */
+#define EPSILON_NOMIN 1
+#define EPSILON_DENOM 20
+
+/* Simple cache interface follows */
+
+#define KEY_PREFIX 'S'
+
+void *prefix_key(const uint8_t *ip, size_t len)
+{
+       void *key = malloc(len + 1);
+       *(char *)key = KEY_PREFIX;
+       memcpy((uint8_t *)key + 1, ip, len);
+       return key;
+}
+
+#undef PREFIX
+
+/* First value of timeout will be calculated as SRTT+4*VARIANCE
+ * by calc_timeout(), so it'll be equal to DEFAULT_TIMEOUT. */
+static const struct rtt_state default_rtt_state = { .srtt = 0,
+                                                   .variance =
+                                                           DEFAULT_TIMEOUT / 4,
+                                                   .consecutive_timeouts = 0,
+                                                   .dead_since = 0 };
+
+/* Note that this opens a cache transaction, which is usually closed by calling
+ * `put_rtt_state` i.e. caller is responsible for its closing
+ * (e.g. calling kr_cache_commit). */
+struct rtt_state get_rtt_state(const uint8_t *ip, size_t len,
+                              struct kr_cache *cache)
+{
+       struct rtt_state state;
+       knot_db_val_t value;
+       knot_db_t *db = cache->db;
+       struct kr_cdb_stats *stats = &cache->stats;
+       uint8_t *prefixed_ip = prefix_key(ip, len);
+
+       knot_db_val_t key = { .len = len + 1, .data = prefixed_ip };
+
+       if (cache->api->read(db, stats, &key, &value, 1)) {
+               state = default_rtt_state;
+       } else {
+               assert(value.len == sizeof(struct rtt_state));
+               state = *(struct rtt_state *)value.data;
+       }
+
+       free(prefixed_ip);
+       return state;
+}
+
+int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state,
+                 struct kr_cache *cache)
+{
+       knot_db_t *db = cache->db;
+       struct kr_cdb_stats *stats = &cache->stats;
+       uint8_t *prefixed_ip = prefix_key(ip, len);
+
+       knot_db_val_t key = { .len = len + 1, .data = prefixed_ip };
+       knot_db_val_t value = { .len = sizeof(struct rtt_state),
+                               .data = &state };
+
+       int ret = cache->api->write(db, stats, &key, &value, 1);
+       cache->api->commit(db, stats);
+
+       free(prefixed_ip);
+       return ret;
+}
+
+void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst)
+{
+       switch (len) {
+       case sizeof(struct in_addr):
+               ADDR_SET(dst->ip4.sin, AF_INET, bytes, len, 0);
+               break;
+       case sizeof(struct in6_addr):
+               ADDR_SET(dst->ip6.sin6, AF_INET6, bytes, len, 0);
+               break;
+       default:
+               assert(0);
+       }
+}
+
+uint8_t *ip_to_bytes(const union inaddr *src, size_t len)
+{
+       switch (len) {
+       case sizeof(struct in_addr):
+               return (uint8_t *)&src->ip4.sin_addr;
+       case sizeof(struct in6_addr):
+               return (uint8_t *)&src->ip6.sin6_addr;
+       default:
+               assert(0);
+       }
+}
+
+static bool no_rtt_info(struct rtt_state s)
+{
+       return s.srtt == 0 && s.consecutive_timeouts == 0;
+}
+
+static unsigned back_off_timeout(uint32_t to, int pow)
+{
+       if (pow > MAX_BACKOFF) {
+               to *= 1 << MAX_BACKOFF;
+       } else {
+               to *= (1 << pow);
+       }
+       if (to > MAX_TIMEOUT) {
+               to = MAX_TIMEOUT;
+       }
+       return to;
+}
+
+/* This is verbatim (minus the default timeout value and minimal variance)
+ * RFC6298, sec. 2. */
+static unsigned calc_timeout(struct rtt_state state)
+{
+       int32_t timeout =
+               state.srtt + MAX(4 * state.variance, MINIMAL_TIMEOUT_ADDITION);
+       return back_off_timeout(timeout, state.consecutive_timeouts);
+}
+
+/* This is verbatim RFC6298, sec. 2. */
+static struct rtt_state calc_rtt_state(struct rtt_state old, unsigned new_rtt)
+{
+       if (no_rtt_info(old)) {
+               return (struct rtt_state){ new_rtt, new_rtt / 2, 0 };
+       }
+
+       struct rtt_state ret;
+
+       ret.srtt = (int32_t)(0.75 * old.srtt + 0.25 * new_rtt);
+       ret.variance = (int32_t)(0.875 * old.variance +
+                                0.125 * abs(old.srtt - (int32_t)new_rtt));
+       ret.consecutive_timeouts = 0;
+
+       return ret;
+}
+
+/**
+ * @internal Invalidate addresses which should be considered dead
+ */
+static void invalidate_dead_upstream(struct address_state *state,
+                                    unsigned int retry_timeout)
+{
+       if (kr_now() - state->rtt_state.dead_since < retry_timeout) {
+               state->generation = -1;
+       }
+}
+
+/**
+ * @internal Check if IP address is TLS capable.
+ *
+ * @p req has to have the selection_context properly initialized.
+ */
+static void check_tls_capable(struct address_state *address_state,
+                             struct kr_request *req, struct sockaddr *address)
+{
+       address_state->tls_capable =
+               req->selection_context.is_tls_capable ?
+                             req->selection_context.is_tls_capable(address) :
+                             false;
+}
+
+#if 0
+/* TODO: uncomment these once we actually use the information it collects. */
+/**
+ * Check if there is an existing TCP connection to this address.
+ * 
+ * @p req has to have the selection_context properly initialized.
+ */
+void check_tcp_connections(struct address_state *address_state, struct kr_request *req, struct sockaddr *address) {
+       address_state->tcp_connected = req->selection_context.is_tcp_connected ? req->selection_context.is_tcp_connected(address) : false;
+       address_state->tcp_waiting = req->selection_context.is_tcp_waiting ? req->selection_context.is_tcp_waiting(address) : false;
+}
+#endif
+
+/**
+ * @internal Invalidate address if the respective IP version is disabled.
+ */
+static void check_network_settings(struct address_state *address_state,
+                                  size_t address_len, bool no_ipv4, bool no_ipv6)
+{
+       if (no_ipv4 && address_len == sizeof(struct in_addr)) {
+               address_state->generation = -1;
+       }
+       if (no_ipv6 && address_len == sizeof(struct in6_addr)) {
+               address_state->generation = -1;
+       }
+}
+
+void update_address_state(struct address_state *state, uint8_t *address,
+                         size_t address_len, struct kr_query *qry)
+{
+       union inaddr tmp_address;
+       bytes_to_ip(address, address_len, &tmp_address);
+       check_tls_capable(state, qry->request, &tmp_address.ip);
+       /* TODO: uncomment this once we actually use the information it collects
+       check_tcp_connections(address_state, qry->request, &tmp_address.ip);
+       */
+       check_network_settings(state, address_len, qry->flags.NO_IPV4,
+                              qry->flags.NO_IPV6);
+       state->rtt_state =
+               get_rtt_state(address, address_len, &qry->request->ctx->cache);
+       invalidate_dead_upstream(
+               state, qry->request->ctx->cache_rtt_tout_retry_interval);
+#ifdef SELECTION_CHOICE_LOGGING
+       // This is sometimes useful for debugging, but usually too verbose
+       WITH_VERBOSE(qry)
+       {
+               const char *ns_str = kr_straddr(&tmp_address.ip);
+               VERBOSE_MSG(qry, "rtt of %s is %d, variance is %d\n", ns_str,
+                           state->rtt_state.srtt, state->rtt_state.variance);
+       }
+#endif
+}
+
+static int cmp_choices(const void *a, const void *b)
+{
+       struct choice *a_ = (struct choice *)a;
+       struct choice *b_ = (struct choice *)b;
+
+       int diff;
+       /* Address with no RTT information is better than address
+        * with some information. */
+       if ((diff = no_rtt_info(b_->address_state->rtt_state) -
+                   no_rtt_info(a_->address_state->rtt_state))) {
+               return diff;
+       }
+       /* Address with fewer errors is better. */
+       if ((diff = a_->address_state->error_count -
+                   b_->address_state->error_count)) {
+               return diff;
+       }
+       /* Address with smaller expected timeout is better. */
+       if ((diff = calc_timeout(a_->address_state->rtt_state) -
+                   calc_timeout(b_->address_state->rtt_state))) {
+               return diff;
+       }
+       return 0;
+}
+
+/* Fisher-Yates shuffle of the choices */
+static void shuffle_choices(struct choice choices[], int choices_len)
+{
+       struct choice tmp;
+       for (int i = choices_len - 1; i > 0; i--) {
+               int j = kr_rand_bytes(1) % (i + 1);
+               tmp = choices[i];
+               choices[i] = choices[j];
+               choices[j] = tmp;
+       }
+}
+
+/* Performs the actual selection (currently variation on epsilon-greedy). */
+struct kr_transport *select_transport(struct choice choices[], int choices_len,
+                                     struct to_resolve unresolved[],
+                                     int unresolved_len, int timeouts,
+                                     struct knot_mm *mempool, bool tcp,
+                                     size_t *choice_index)
+{
+       if (!choices_len && !unresolved_len) {
+               /* There is nothing to choose from */
+               return NULL;
+       }
+
+       struct kr_transport *transport =
+               mm_alloc(mempool, sizeof(struct kr_transport));
+       memset(transport, 0, sizeof(struct kr_transport));
+
+       int choice = 0;
+       if (kr_rand_coin(EPSILON_NOMIN, EPSILON_DENOM) || choices_len == 0) {
+               /* "EXPLORE":
+                * randomly choose some option
+                * (including resolution of some new name). */
+               int index = kr_rand_bytes(1) % (choices_len + unresolved_len);
+               if (index < unresolved_len) {
+                       // We will resolve a new NS name
+                       *transport = (struct kr_transport){
+                               .protocol = unresolved[index].type,
+                               .ns_name = unresolved[index].name
+                       };
+                       return transport;
+               } else {
+                       choice = index - unresolved_len;
+               }
+       } else {
+               /* "EXPLOIT":
+                * choose a resolved address which seems best right now. */
+               shuffle_choices(choices, choices_len);
+               /* If there are some addresses with no rtt_info we try them
+                * first (see cmp_choices). So unknown servers are chosen
+                * *before* the best known server. This ensures that every option
+                * is tried before going back to some that was tried before. */
+               qsort(choices, choices_len, sizeof(struct choice), cmp_choices);
+               choice = 0;
+       }
+
+       struct choice *chosen = &choices[choice];
+
+       /* Don't try the same server again when there are other choices to be explored */
+       if (chosen->address_state->error_count && unresolved_len) {
+               int index = kr_rand_bytes(1) % unresolved_len;
+               *transport = (struct kr_transport){
+                       .ns_name = unresolved[index].name,
+                       .protocol = unresolved[index].type,
+               };
+               return transport;
+       }
+
+       unsigned timeout;
+       if (no_rtt_info(chosen->address_state->rtt_state)) {
+               /* Exponential back-off when retrying after timeout and choosing
+                * an unknown server. */
+               timeout = back_off_timeout(DEFAULT_TIMEOUT, timeouts);
+       } else {
+               timeout = calc_timeout(chosen->address_state->rtt_state);
+       }
+
+       enum kr_transport_protocol protocol;
+       if (chosen->address_state->tls_capable) {
+               protocol = KR_TRANSPORT_TLS;
+       } else if (tcp ||
+                  chosen->address_state->errors[KR_SELECTION_QUERY_TIMEOUT] >= TCP_TIMEOUT_THRESHOLD ||
+                  timeout > TCP_RTT_THRESHOLD) {
+               protocol = KR_TRANSPORT_TCP;
+       } else {
+               protocol = KR_TRANSPORT_UDP;
+       }
+
+       *transport = (struct kr_transport){
+               .ns_name = chosen->address_state->ns_name,
+               .protocol = protocol,
+               .timeout = timeout,
+               .safe_mode =
+                       chosen->address_state->errors[KR_SELECTION_FORMERROR],
+       };
+
+       int port;
+       if (!(port = chosen->port)) {
+               switch (transport->protocol) {
+               case KR_TRANSPORT_TLS:
+                       port = KR_DNS_TLS_PORT;
+                       break;
+               case KR_TRANSPORT_UDP:
+               case KR_TRANSPORT_TCP:
+                       port = KR_DNS_PORT;
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       switch (chosen->address_len) {
+       case sizeof(struct in_addr):
+               ADDR_SET(transport->address.ip4.sin, AF_INET, chosen->address,
+                        chosen->address_len, port);
+               transport->address_len = chosen->address_len;
+               break;
+       case sizeof(struct in6_addr):
+               ADDR_SET(transport->address.ip6.sin6, AF_INET6, chosen->address,
+                        chosen->address_len, port);
+               transport->address_len = chosen->address_len;
+               break;
+       default:
+               assert(0);
+               break;
+       }
+
+       if (choice_index) {
+               *choice_index = chosen->address_state->choice_array_index;
+       }
+
+       return transport;
+}
+
+void update_rtt(struct kr_query *qry, struct address_state *addr_state,
+               const struct kr_transport *transport, unsigned rtt)
+{
+       if (!transport || !addr_state) {
+               /* Answers from cache have NULL transport, ignore them. */
+               return;
+       }
+
+       struct kr_cache *cache = &qry->request->ctx->cache;
+
+       uint8_t *address =
+               ip_to_bytes(&transport->address, transport->address_len);
+       /* This construct is a bit racy since the global state may change
+        * between calls to `get_rtt_state` and `put_rtt_state`  but we don't
+        * care that much since it is rare and we only risk slightly suboptimal
+        * transport choice. */
+       struct rtt_state cur_rtt_state =
+               get_rtt_state(address, transport->address_len, cache);
+       struct rtt_state new_rtt_state = calc_rtt_state(cur_rtt_state, rtt);
+       put_rtt_state(address, transport->address_len, new_rtt_state, cache);
+
+       WITH_VERBOSE(qry)
+       {
+       KR_DNAME_GET_STR(ns_name, transport->ns_name);
+       KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+       const char *ns_str = kr_straddr(&transport->address.ip);
+
+       VERBOSE_MSG(
+               qry,
+               "=> id: '%05u' updating: '%s'@'%s' zone cut: '%s' with rtt %u to srtt: %d and variance: %d \n",
+               qry->id, ns_name, ns_str ? ns_str : "", zonecut_str,
+               rtt, new_rtt_state.srtt, new_rtt_state.variance);
+       }
+}
+
+static void cache_timeout(const struct kr_transport *transport,
+                         struct address_state *addr_state, struct kr_cache *cache)
+{
+       if (transport->deduplicated) {
+               /* Transport was chosen by a different query, that one will
+                * cache the result. */
+               return;
+       }
+
+       uint8_t *address =
+               ip_to_bytes(&transport->address, transport->address_len);
+       struct rtt_state old_state = addr_state->rtt_state;
+       struct rtt_state cur_state =
+               get_rtt_state(address, transport->address_len, cache);
+
+       /* We could lose some update from some other process by doing this,
+        * but at least timeout count can't blow up. */
+       if (cur_state.consecutive_timeouts == old_state.consecutive_timeouts) {
+               if (++cur_state.consecutive_timeouts >=
+                   KR_NS_TIMEOUT_ROW_DEAD) {
+                       cur_state.dead_since = kr_now();
+               }
+               put_rtt_state(address, transport->address_len, cur_state,
+                             cache);
+       } else {
+               /* `get_rtt_state` opens a cache transaction, we have to end it. */
+               kr_cache_commit(cache);
+       }
+}
+
+/**
+ * Common part of the error feedback mechanism.
+ *
+ * Notes @p sel_error in the per-query state of the address @p transport
+ * pointed to. Timeouts are additionally counted in the query's local state
+ * and, for transports chosen by this very query, noted in the cache.
+ * A truncated answer over UDP only sets the `truncated` flag of the query
+ * and is not counted against the address.
+ *
+ * @param qry        query the transport was used for
+ * @param addr_state local state of the address; NULL turns the call into a no-op
+ * @param transport  transport the error relates to; NULL turns the call into a no-op
+ * @param sel_error  error to note; has to be below KR_SELECTION_NUMBER_OF_ERRORS
+ */
+void error(struct kr_query *qry, struct address_state *addr_state,
+          const struct kr_transport *transport,
+          enum kr_selection_error sel_error)
+{
+       if (!transport || !addr_state) {
+               /* Answers from cache have NULL transport, ignore them. */
+               return;
+       }
+
+       if ((unsigned)sel_error >= (unsigned)KR_SELECTION_NUMBER_OF_ERRORS) {
+               /* An out-of-range value must never be used as an index into
+                * the arrays below, not even when assertions are compiled out. */
+               assert(0);
+               return;
+       }
+
+       if (sel_error == KR_SELECTION_QUERY_TIMEOUT) {
+               qry->server_selection.local_state->timeouts++;
+               /* Make sure the transport was chosen by this query. */
+               if (!transport->deduplicated) {
+                       cache_timeout(transport, addr_state,
+                                     &qry->request->ctx->cache);
+               }
+       }
+
+       if (sel_error == KR_SELECTION_TRUNCATED &&
+           transport->protocol == KR_TRANSPORT_UDP) {
+               /* Don't punish the server that told us to switch to TCP. */
+               qry->server_selection.local_state->truncated = true;
+       } else {
+               if (sel_error == KR_SELECTION_TRUNCATED) {
+                       /* TRUNCATED over TCP/TLS, upstream is broken. */
+                       addr_state->unrecoverable_errors++;
+               }
+
+               if (UNRECOVERABLE_ERRORS[sel_error]) {
+                       addr_state->unrecoverable_errors++;
+               }
+
+               /* FORMERR is counted as unrecoverable only when the transport
+                * already had safe mode turned on. */
+               if (sel_error == KR_SELECTION_FORMERROR && transport->safe_mode) {
+                       addr_state->unrecoverable_errors++;
+               }
+
+               addr_state->errors[sel_error]++;
+               addr_state->error_count++;
+       }
+
+       WITH_VERBOSE(qry)
+       {
+       KR_DNAME_GET_STR(ns_name, transport->ns_name);
+       KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+       const char *ns_str = kr_straddr(&transport->address.ip);
+
+       VERBOSE_MSG(
+               qry,
+               "=> id: '%05u' noting selection error: '%s'@'%s' zone cut: '%s' error no.:%d\n",
+               qry->id, ns_name, ns_str ? ns_str : "", zonecut_str,
+               sel_error);
+       }
+}
+
+/**
+ * Set up the server selection API of @p qry.
+ *
+ * Queries flagged FORWARD or STUB get the forwarding implementation, all
+ * other queries the iterative one. The zeroed local state and the private
+ * state of the chosen implementation come from the request's mempool.
+ */
+void kr_server_selection_init(struct kr_query *qry)
+{
+       struct knot_mm *mempool = &qry->request->pool;
+       const bool forwarding = qry->flags.FORWARD || qry->flags.STUB;
+
+       struct local_state *state = mm_alloc(mempool, sizeof(struct local_state));
+       memset(state, 0, sizeof(struct local_state));
+
+       struct kr_server_selection selection = {
+               .initialized = true,
+               .local_state = state,
+       };
+       if (forwarding) {
+               selection.choose_transport = forward_choose_transport;
+               selection.update_rtt = forward_update_rtt;
+               selection.error = forward_error;
+       } else {
+               selection.choose_transport = iter_choose_transport;
+               selection.update_rtt = iter_update_rtt;
+               selection.error = iter_error;
+       }
+       qry->server_selection = selection;
+
+       /* The implementation-specific part is allocated last, as before. */
+       if (forwarding) {
+               forward_local_state_alloc(mempool, &state->private,
+                                         qry->request);
+       } else {
+               iter_local_state_alloc(mempool, &state->private);
+       }
+}
+
+/**
+ * Add a forwarding target to the request.
+ *
+ * @param req  request whose `selection_context.forwarding_targets` array
+ *             has to be set up already
+ * @param sock address of the target (AF_INET or AF_INET6)
+ * @return kr_ok() on success,
+ *         kr_error(EINVAL) when the target array is missing, the address
+ *         family is unsupported or disabled by NO_IPV4/NO_IPV6,
+ *         kr_error(ENOMEM) when the target could not be stored
+ */
+int kr_forward_add_target(struct kr_request *req, const struct sockaddr *sock)
+{
+       if (!req->selection_context.forwarding_targets.at) {
+               return kr_error(EINVAL);
+       }
+
+       union inaddr address;
+
+       switch (sock->sa_family) {
+       case AF_INET:
+               if (req->options.NO_IPV4)
+                       return kr_error(EINVAL);
+               address.ip4 = *(const struct sockaddr_in *)sock;
+               break;
+       case AF_INET6:
+               if (req->options.NO_IPV6)
+                       return kr_error(EINVAL);
+               address.ip6 = *(const struct sockaddr_in6 *)sock;
+               break;
+       default:
+               return kr_error(EINVAL);
+       }
+
+       /* array_push_mm() yields a negative value when the array could not be
+        * grown; don't report success for a target that was not stored. */
+       if (array_push_mm(req->selection_context.forwarding_targets, address,
+                         kr_memreserve, &req->pool) < 0) {
+               return kr_error(ENOMEM);
+       }
+       return kr_ok();
+}
diff --git a/lib/selection.h b/lib/selection.h
new file mode 100644 (file)
index 0000000..f8e2730
--- /dev/null
@@ -0,0 +1,233 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+/**
+ * @file selection.h
+ * Provides server selection API (see `kr_server_selection`) and functions common to both implementations.
+ */
+
+#include "lib/cache/api.h"
+
+/* After KR_NS_TIMEOUT_ROW_DEAD consecutive timeouts, we consider the upstream IP dead for KR_NS_TIMEOUT_RETRY_INTERVAL ms */
+#define KR_NS_TIMEOUT_ROW_DEAD 4
+#define KR_NS_TIMEOUT_RETRY_INTERVAL 1000
+
+/**
+ * These errors are to be reported as feedback to server selection.
+ * See `kr_server_selection::error` for more details.
+ *
+ * The values are also used as array indices (see `address_state::errors`,
+ * which is sized by KR_SELECTION_NUMBER_OF_ERRORS).
+ */
+enum kr_selection_error {
+       KR_SELECTION_OK = 0,
+
+       // Network errors
+       KR_SELECTION_QUERY_TIMEOUT,
+       KR_SELECTION_TLS_HANDSHAKE_FAILED,
+       KR_SELECTION_TCP_CONNECT_FAILED,
+       KR_SELECTION_TCP_CONNECT_TIMEOUT,
+
+       // RCODEs
+       KR_SELECTION_REFUSED,
+       KR_SELECTION_SERVFAIL,
+       KR_SELECTION_FORMERROR,
+       KR_SELECTION_NOTIMPL,
+       KR_SELECTION_OTHER_RCODE,
+
+       // DNS errors
+       KR_SELECTION_TRUNCATED,
+       KR_SELECTION_DNSSEC_ERROR,
+       KR_SELECTION_LAME_DELEGATION,
+       /** Too long chain, or cycle. */
+       KR_SELECTION_BAD_CNAME,
+
+       /** Leave this last, as it is used as array size. */
+       KR_SELECTION_NUMBER_OF_ERRORS 
+};
+
+enum kr_transport_protocol {
+       /** Selected name with no IPv4 address, it has to be resolved first. */
+       KR_TRANSPORT_RESOLVE_A,
+       /** Selected name with no IPv6 address, it has to be resolved first. */
+       KR_TRANSPORT_RESOLVE_AAAA,
+       KR_TRANSPORT_UDP, /**< Selected an address to be queried over UDP. */
+       KR_TRANSPORT_TCP, /**< Selected an address to be queried over TCP. */
+       KR_TRANSPORT_TLS, /**< Selected an address to be queried over TLS. */
+};
+
+/**
+ * Output of the selection algorithm.
+ */
+struct kr_transport {
+       knot_dname_t *ns_name; /**< Set to "." for forwarding targets.*/
+       union inaddr address;
+       size_t address_len; /**< Length of the raw address inside `address`. */
+       enum kr_transport_protocol protocol;
+       unsigned timeout; /**< Timeout in ms to be set for UDP transmission. */
+       /** True iff transport was set in worker.c:subreq_finalize,
+        * that means it may be different from the one originally chosen. */
+       bool deduplicated;
+       bool safe_mode; /**< Turn on SAFEMODE for this transport */
+};
+
+struct local_state {
+       int timeouts; /**< Number of timeouts that occurred resolving this query.*/
+       bool truncated; /**< Query was truncated, switch to TCP. */
+       void *private; /**< Inner state of the implementation (iteration or forwarding).*/
+};
+
+/**
+ * Specifies an API for selecting transports and giving feedback on the choices.
+ *
+ * The function pointers are to be used throughout resolver when some information about
+ * the transport is obtained. E.g. RTT in `worker.c` or RCODE in `iterate.c`,…
+ */
+struct kr_server_selection {
+       bool initialized; /**< Set by `kr_server_selection_init`. */
+       /**
+        * Puts a pointer to next transport of @p qry to @p transport .
+        *
+        * Allocates new kr_transport in request's mempool, chooses transport to be used for this query.
+        * Selection may fail, so @p transport can be set to NULL.
+        *
+        * @param transport to be filled with pointer to the chosen transport or NULL on failure
+        */
+       void (*choose_transport)(struct kr_query *qry,
+                                struct kr_transport **transport);
+       /** Report back the RTT of network operation for transport in ms. */
+       void (*update_rtt)(struct kr_query *qry,
+                          const struct kr_transport *transport, unsigned rtt);
+       /** Report back error encountered with the chosen transport. See `enum kr_selection_error` */
+       void (*error)(struct kr_query *qry,
+                     const struct kr_transport *transport,
+                     enum kr_selection_error error);
+
+       struct local_state *local_state; /**< Per-query state, allocated in the request's mempool. */
+};
+
+/**
+ * @brief Initialize the server selection API for @p qry.
+ *
+ * The implementation is to be chosen based on qry->flags.
+ */
+KR_EXPORT
+void kr_server_selection_init(struct kr_query *qry);
+
+/**
+ * @brief Add forwarding target to request.
+ *
+ * This is exposed to Lua in order to add forwarding targets to request.
+ * These are then shared by all the queries in said request.
+ */
+KR_EXPORT
+int kr_forward_add_target(struct kr_request *req, const struct sockaddr *sock);
+
+/**
+ * To be held per IP address in the global LMDB cache
+ */
+struct rtt_state {
+       int32_t srtt; /**< presumably the smoothed RTT estimate -- TODO confirm unit */
+       int32_t variance; /**< presumably the RTT variance estimate -- TODO confirm unit */
+       int32_t consecutive_timeouts; /**< Compared against KR_NS_TIMEOUT_ROW_DEAD. */
+       /** Timestamp of pronouncing this IP bad based on KR_NS_TIMEOUT_ROW_DEAD */
+       uint64_t dead_since;
+};
+
+/**
+ * @brief To be held per IP address and locally "inside" query.
+ */
+struct address_state {
+       /** Used to distinguish old and valid records in local_state. */
+       unsigned int generation;
+       struct rtt_state rtt_state; /**< Local copy of the RTT state of this address. */
+       knot_dname_t *ns_name; /**< NS name the address was found under; empty for forwarding targets. */
+       bool tls_capable : 1;
+       /* TODO: uncomment these once we actually use this information in selection
+       bool tcp_waiting : 1;
+       bool tcp_connected : 1;
+       */
+       int choice_array_index; /**< Forwarding stores the index of the target here. */
+       int error_count; /**< Number of errors counted against this address. */
+       int unrecoverable_errors; /**< Non-zero removes the address from the choices of iteration. */
+       int errors[KR_SELECTION_NUMBER_OF_ERRORS]; /**< Counters indexed by `enum kr_selection_error`. */
+};
+
+/**
+ * @brief Array of these is one of inputs for the actual selection algorithm (`select_transport`)
+ */
+struct choice {
+       uint8_t *address; /**< Raw address bytes (see `ip_to_bytes`). */
+       size_t address_len; /**< Length of `address` in bytes. */
+       struct address_state *address_state; /**< Local state kept for this address. */
+       /** used to overwrite the port number;
+        * if zero, `select_transport` determines it. */
+       uint16_t port;
+};
+
+/**
+ * @brief Array of these is description of names to be resolved (i.e. name without some address)
+ */
+struct to_resolve {
+       knot_dname_t *name; /**< NS name lacking an address of the type below. */
+       /** Either KR_TRANSPORT_RESOLVE_A or KR_TRANSPORT_RESOLVE_AAAA is valid here. */
+       enum kr_transport_protocol type;
+};
+
+/**
+ * @brief Based on passed choices, choose the next transport.
+ *
+ * Common function to both implementations (iteration and forwarding).
+ * The `*_choose_transport` functions from `selection_*.h` preprocess the input for this one.
+ *
+ * @param choices Options to choose from, see struct above
+ * @param unresolved Array of names that can be resolved (i.e. no A/AAAA record)
+ * @param timeouts Number of timeouts that occurred in this query (used for exponential backoff)
+ * @param mempool Memory context of current request
+ * @param tcp Force TCP as transport protocol
+ * @param[out] choice_index Optionally, the index of the chosen transport in the @p choices array is stored here.
+ * @return Chosen transport or NULL when no choice is viable
+ */
+struct kr_transport *select_transport(struct choice choices[], int choices_len,
+                                     struct to_resolve unresolved[],
+                                     int unresolved_len, int timeouts,
+                                     struct knot_mm *mempool, bool tcp,
+                                     size_t *choice_index);
+
+/**
+ * Common part of RTT feedback mechanism. Notes RTT to global cache.
+ */
+void update_rtt(struct kr_query *qry, struct address_state *addr_state,
+               const struct kr_transport *transport, unsigned rtt);
+
+/**
+ * Common part of error feedback mechanism.
+ */
+void error(struct kr_query *qry, struct address_state *addr_state,
+          const struct kr_transport *transport,
+          enum kr_selection_error sel_error);
+
+/**
+ * Get RTT state from cache. Returns `default_rtt_state` on unknown addresses.
+ */
+struct rtt_state get_rtt_state(const uint8_t *ip, size_t len,
+                              struct kr_cache *cache);
+
+int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state,
+                 struct kr_cache *cache);
+
+/**
+ * @internal Helper function for conversion between different IP representations.
+ */
+void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst);
+
+/**
+ * @internal Helper function for conversion between different IP representations.
+ */
+uint8_t *ip_to_bytes(const union inaddr *src, size_t len);
+
+/**
+ * @internal Fetch per-address information from various sources.
+ */
+void update_address_state(struct address_state *state, uint8_t *address,
+                         size_t address_len, struct kr_query *qry);
diff --git a/lib/selection_forward.c b/lib/selection_forward.c
new file mode 100644 (file)
index 0000000..2f85bcd
--- /dev/null
@@ -0,0 +1,129 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#include "lib/selection_forward.h"
+#include "lib/resolve.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__)
+
+#define FORWARDING_TIMEOUT 2000
+
+struct forward_local_state {
+       inaddr_array_t *targets; /**< Forwarding targets of the request (not owned here). */
+       struct address_state *addr_states; /**< One state per entry of `targets`. */
+       /** Index of last choice in the targets array, used for error and RTT reporting. */
+       size_t last_choice_index;
+};
+
+/**
+ * Allocate the zeroed private state of the forwarding implementation.
+ *
+ * The state refers to the forwarding targets of @p req and carries one
+ * zeroed `struct address_state` per target; everything comes from @p mm.
+ */
+void forward_local_state_alloc(struct knot_mm *mm, void **local_state,
+                              struct kr_request *req)
+{
+       assert(req->selection_context.forwarding_targets.at);
+
+       struct forward_local_state *state =
+               mm_alloc(mm, sizeof(struct forward_local_state));
+       *local_state = state;
+       memset(state, 0, sizeof(struct forward_local_state));
+       state->targets = &req->selection_context.forwarding_targets;
+
+       const size_t states_size =
+               sizeof(struct address_state) * state->targets->len;
+       state->addr_states = mm_alloc(mm, states_size);
+       memset(state->addr_states, 0, states_size);
+}
+
+/**
+ * Choose the next transport among the forwarding targets of the request.
+ *
+ * Builds one `struct choice` per usable target, lets `select_transport` pick
+ * and then overrides the timeout of the result with FORWARDING_TIMEOUT.
+ *
+ * @param qry       query to choose the transport for
+ * @param transport set to the chosen transport, or to whatever
+ *                  `select_transport` returns when nothing is viable
+ */
+void forward_choose_transport(struct kr_query *qry,
+                             struct kr_transport **transport)
+{
+       struct forward_local_state *local_state =
+               qry->server_selection.local_state->private;
+       const size_t targets_len = local_state->targets->len;
+       /* The size of a variable-length array has to be greater than zero. */
+       struct choice choices[targets_len ? targets_len : 1];
+       int valid = 0;
+
+       for (size_t i = 0; i < targets_len; i++) {
+               union inaddr *address = &local_state->targets->at[i];
+               size_t addr_len;
+               uint16_t port;
+               switch (address->ip.sa_family) {
+               case AF_INET:
+                       port = ntohs(address->ip4.sin_port);
+                       addr_len = sizeof(struct in_addr);
+                       break;
+               case AF_INET6:
+                       port = ntohs(address->ip6.sin6_port);
+                       addr_len = sizeof(struct in6_addr);
+                       break;
+               default:
+                       /* Targets of any other family can't be added; if one
+                        * shows up anyway, skip it instead of working with an
+                        * uninitialized port and length. */
+                       assert(0);
+                       continue;
+               }
+
+               struct address_state *addr_state = &local_state->addr_states[i];
+               addr_state->ns_name = (knot_dname_t *)"";
+
+               update_address_state(addr_state, ip_to_bytes(address, addr_len),
+                                    addr_len, qry);
+
+               /* NOTE(review): the all-ones generation presumably marks an
+                * address that is not to be used -- confirm against
+                * `update_address_state`. */
+               if (addr_state->generation == (unsigned int)-1) {
+                       continue;
+               }
+               addr_state->choice_array_index = (int)i;
+
+               choices[valid++] = (struct choice){
+                       .address = ip_to_bytes(address, addr_len),
+                       .address_len = addr_len,
+                       .address_state = addr_state,
+                       .port = port,
+               };
+       }
+
+       bool tcp =
+               qry->flags.TCP || qry->server_selection.local_state->truncated;
+       *transport =
+               select_transport(choices, valid, NULL, 0,
+                                qry->server_selection.local_state->timeouts,
+                                &qry->request->pool, tcp,
+                                &local_state->last_choice_index);
+       if (*transport) {
+               /* Set static timeout for forwarding; there is no point in this
+                * being dynamic since the RTT of a packet to forwarding target
+                * says nothing about the network RTT of said target, since
+                * it is doing resolution upstream. */
+               (*transport)->timeout = FORWARDING_TIMEOUT;
+               /* We need to propagate this to flags since it's used in other
+                * parts of the resolver (e.g. logging and stats). */
+               qry->flags.TCP = tcp;
+       }
+}
+
+/**
+ * Error feedback of the forwarding implementation: the error is noted for
+ * the address state at the index of the last choice.
+ */
+void forward_error(struct kr_query *qry, const struct kr_transport *transport,
+                  enum kr_selection_error sel_error)
+{
+       struct kr_server_selection *selection = &qry->server_selection;
+       if (!selection->initialized) {
+               return;
+       }
+
+       struct forward_local_state *fwd_state = selection->local_state->private;
+       const size_t last = fwd_state->last_choice_index;
+       error(qry, &fwd_state->addr_states[last], transport, sel_error);
+}
+
+/**
+ * RTT feedback of the forwarding implementation: the RTT is noted for the
+ * address state at the index of the last choice. Does nothing before the
+ * selection is initialized or without a transport.
+ */
+void forward_update_rtt(struct kr_query *qry,
+                       const struct kr_transport *transport, unsigned rtt)
+{
+       if (!qry->server_selection.initialized || !transport) {
+               return;
+       }
+
+       struct forward_local_state *fwd_state =
+               qry->server_selection.local_state->private;
+       const size_t last = fwd_state->last_choice_index;
+
+       update_rtt(qry, &fwd_state->addr_states[last], transport, rtt);
+}
\ No newline at end of file
diff --git a/lib/selection_forward.h b/lib/selection_forward.h
new file mode 100644 (file)
index 0000000..e66274f
--- /dev/null
@@ -0,0 +1,17 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "lib/selection.h"
+#include "lib/resolve.h"
+
+void forward_local_state_alloc(struct knot_mm *mm, void **local_state,
+                              struct kr_request *req);
+void forward_choose_transport(struct kr_query *qry,
+                             struct kr_transport **transport);
+void forward_error(struct kr_query *qry, const struct kr_transport *transport,
+                  enum kr_selection_error sel_error);
+void forward_update_rtt(struct kr_query *qry,
+                       const struct kr_transport *transport, unsigned rtt);
\ No newline at end of file
diff --git a/lib/selection_iter.c b/lib/selection_iter.c
new file mode 100644 (file)
index 0000000..24fd3ce
--- /dev/null
@@ -0,0 +1,369 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#include "lib/selection_iter.h"
+#include "lib/selection.h"
+
+#include "lib/generic/trie.h"
+#include "lib/generic/pack.h"
+#include "lib/zonecut.h"
+#include "lib/resolve.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "slct", __VA_ARGS__)
+
+/* To be held per query and locally.
+ * Both tries are created lazily on the first transport choice. */
+struct iter_local_state {
+       trie_t *names; /**< NS name -> `struct iter_name_state` */
+       trie_t *addresses; /**< raw address bytes -> `struct address_state` */
+       knot_dname_t *zonecut; /**< Name of the zone cut the state was last unpacked from. */
+       /** Used to distinguish old and valid records in tries. */
+       unsigned int generation;
+       enum kr_selection_error last_error; /**< Error passed to the latest `iter_error` call. */
+       unsigned int no_ns_addr_count; /**< Choices to resolve an NS address; reset when the zone cut changes. */
+};
+
+enum record_state { RECORD_UNKNOWN, RECORD_RESOLVED, RECORD_TRIED }; /* no address known / address present in zone cut / resolution already chosen */
+
+// To be held per NS name and locally
+struct iter_name_state {
+       unsigned int generation; /**< Generation of the local state that last saw this name. */
+       enum record_state a_state; /**< State of the A record(s) of the name. */
+       enum record_state aaaa_state; /**< State of the AAAA record(s) of the name. */
+};
+
+/** Allocate the zeroed private state of the iterative implementation from @p mm. */
+void iter_local_state_alloc(struct knot_mm *mm, void **local_state)
+{
+       struct iter_local_state *state =
+               mm_alloc(mm, sizeof(struct iter_local_state));
+       *local_state = state;
+       memset(state, 0, sizeof(struct iter_local_state));
+}
+
+/**
+ * Look up the local state of the address @p transport points to.
+ *
+ * @return the state, or NULL when there is no transport or the address is
+ *         not known to this query; the latter is expected only for
+ *         deduplicated transports
+ */
+static struct address_state *get_address_state(struct iter_local_state *local_state,
+                                               const struct kr_transport *transport)
+{
+       if (!transport) {
+               return NULL;
+       }
+
+       trie_t *addresses = local_state->addresses;
+       uint8_t *address =
+               ip_to_bytes(&transport->address, transport->address_len);
+
+       trie_val_t *address_state = trie_get_try(addresses, (char *)address,
+                                                transport->address_len);
+
+       if (!address_state) {
+               /* Transport was chosen by a different query; anything else is
+                * a bug. Either way there is nothing to dereference, so return
+                * NULL even when assertions are compiled out. */
+               assert(transport->deduplicated);
+               return NULL;
+       }
+       return (struct address_state *)*address_state;
+}
+
+/** Tell whether the two zone cut names differ according to knot_dname_cmp(). */
+static bool zonecut_changed(knot_dname_t *new, knot_dname_t *old)
+{
+       const int order = knot_dname_cmp(old, new);
+       return order != 0;
+}
+
+/**
+ * Bring the local tries of names and addresses in sync with the zone cut
+ * of @p qry.
+ *
+ * Every name and address present in the zone cut is stamped with a new
+ * generation, so records of previous calls can be told apart from valid ones.
+ */
+static void unpack_state_from_zonecut(struct iter_local_state *local_state,
+                                     struct kr_query *qry)
+{
+       struct kr_zonecut *cut = &qry->zone_cut;
+       struct knot_mm *pool = &qry->request->pool;
+       bool cut_moved = false;
+
+       if (local_state->names != NULL && local_state->addresses != NULL) {
+               cut_moved = zonecut_changed(cut->name, local_state->zonecut);
+       } else {
+               /* Local state initialization. */
+               memset(local_state, 0, sizeof(struct iter_local_state));
+               local_state->names = trie_create(pool);
+               local_state->addresses = trie_create(pool);
+       }
+       local_state->zonecut = cut->name;
+       local_state->generation++;
+       const unsigned int generation = local_state->generation;
+
+       if (cut_moved) {
+               local_state->no_ns_addr_count = 0;
+       }
+
+       trie_it_t *it = trie_it_begin(cut->nsset);
+       for (; !trie_it_finished(it); trie_it_next(it)) {
+               knot_dname_t *ns = (knot_dname_t *)trie_it_key(it, NULL);
+               pack_t *ns_addrs = (pack_t *)*trie_it_val(it);
+
+               trie_val_t *name_slot = trie_get_ins(local_state->names, (char *)ns,
+                                                    knot_dname_size(ns));
+               if (!*name_slot) {
+                       /* We encountered this name for the first time. */
+                       *name_slot = mm_alloc(pool, sizeof(struct iter_name_state));
+                       memset(*name_slot, 0, sizeof(struct iter_name_state));
+               }
+               struct iter_name_state *ns_state = *name_slot;
+               ns_state->generation = generation;
+
+               if (cut_moved) {
+                       /* Set name as unresolved as they might have fallen out
+                        * of cache (TTL expired). */
+                       ns_state->a_state = RECORD_UNKNOWN;
+                       ns_state->aaaa_state = RECORD_UNKNOWN;
+               }
+
+               if (ns_addrs->len == 0) {
+                       continue;
+               }
+
+               /* We have some addresses to work with, let's iterate over them. */
+               uint8_t *obj = pack_head(*ns_addrs);
+               while (obj != pack_tail(*ns_addrs)) {
+                       uint8_t *addr = pack_obj_val(obj);
+                       size_t addr_len = pack_obj_len(obj);
+                       trie_val_t *addr_slot = trie_get_ins(local_state->addresses,
+                                                            (char *)addr, addr_len);
+                       if (!*addr_slot) {
+                               /* We have not seen this address before. */
+                               *addr_slot = mm_alloc(pool, sizeof(struct address_state));
+                               memset(*addr_slot, 0, sizeof(struct address_state));
+                       }
+                       struct address_state *addr_state = *addr_slot;
+                       addr_state->generation = generation;
+                       addr_state->ns_name = ns;
+
+                       /* The length of the address tells its family. */
+                       if (addr_len == sizeof(struct in_addr)) {
+                               ns_state->a_state = RECORD_RESOLVED;
+                       } else if (addr_len == sizeof(struct in6_addr)) {
+                               ns_state->aaaa_state = RECORD_RESOLVED;
+                       }
+                       update_address_state(addr_state, addr, addr_len, qry);
+
+                       obj = pack_obj_next(obj);
+               }
+       }
+       trie_it_free(it);
+}
+
+/**
+ * Fill @p choices with the addresses of the current generation that have
+ * no unrecoverable error noted.
+ *
+ * @return number of entries written to @p choices
+ */
+static int get_valid_addresses(struct iter_local_state *local_state,
+                               struct choice choices[])
+{
+       unsigned found = 0;
+       trie_it_t *it = trie_it_begin(local_state->addresses);
+       while (!trie_it_finished(it)) {
+               size_t key_len;
+               uint8_t *key = (uint8_t *)trie_it_key(it, &key_len);
+               struct address_state *state =
+                       (struct address_state *)*trie_it_val(it);
+               const bool current = state->generation == local_state->generation;
+               if (current && state->unrecoverable_errors == 0) {
+                       choices[found].address = key;
+                       choices[found].address_len = key_len;
+                       choices[found].address_state = state;
+                       /* The port stays zero, as in a compound literal. */
+                       choices[found].port = 0;
+                       found++;
+               }
+               trie_it_next(it);
+       }
+       trie_it_free(it);
+       return found;
+}
+
+/**
+ * Fill @p resolvable with the names of the current generation whose A or
+ * AAAA record is still unknown and may be resolved by @p qry.
+ *
+ * @return number of entries written to @p resolvable
+ */
+static int get_resolvable_names(struct iter_local_state *local_state,
+                               struct to_resolve resolvable[], struct kr_query *qry)
+{
+       const bool dnskey_query = qry->stype == KNOT_RRTYPE_DNSKEY;
+
+       /* Further resolution is not possible until we get `. DNSKEY` record;
+        * we have to choose one of the known addresses here. */
+       if (qry->sname[0] == '\0' && dnskey_query) {
+               return 0;
+       }
+
+       unsigned found = 0;
+       trie_it_t *it = trie_it_begin(local_state->names);
+       for (; !trie_it_finished(it); trie_it_next(it)) {
+               struct iter_name_state *state =
+                       *(struct iter_name_state **)trie_it_val(it);
+               if (state->generation != local_state->generation) {
+                       continue;
+               }
+
+               knot_dname_t *name = (knot_dname_t *)trie_it_key(it, NULL);
+               if (dnskey_query &&
+                   knot_dname_in_bailiwick(name, qry->sname) > 0) {
+                       /* Resolving `domain. DNSKEY` can't trigger the
+                        * resolution of `sub.domain. A/AAAA` since it
+                        * will cause a cycle. */
+                       continue;
+               }
+
+               /* FIXME: kr_rplan_satisfies(qry,…) should have been here, but
+                * this leads to failures on iter_ns_badip.rpl, because the test
+                * requires the resolver to switch to parent side after a record
+                * in cache expires. The only way to do this in the current
+                * zonecut setup is to requery the same query twice in a row.
+                * So we have to allow that and only check the rplan from
+                * parent upwards.
+                */
+               const bool a_planned = kr_rplan_satisfies(
+                       qry->parent, name, KNOT_CLASS_IN, KNOT_RRTYPE_A);
+               const bool aaaa_planned = kr_rplan_satisfies(
+                       qry->parent, name, KNOT_CLASS_IN, KNOT_RRTYPE_AAAA);
+
+               const bool want_a = state->a_state == RECORD_UNKNOWN &&
+                                   !qry->flags.NO_IPV4 && !a_planned;
+               const bool want_aaaa = state->aaaa_state == RECORD_UNKNOWN &&
+                                      !qry->flags.NO_IPV6 && !aaaa_planned;
+
+               if (want_a) {
+                       resolvable[found].name = name;
+                       resolvable[found].type = KR_TRANSPORT_RESOLVE_A;
+                       found++;
+               }
+               if (want_aaaa) {
+                       resolvable[found].name = name;
+                       resolvable[found].type = KR_TRANSPORT_RESOLVE_AAAA;
+                       found++;
+               }
+       }
+       trie_it_free(it);
+       return found;
+}
+
+/**
+ * Mark the A or AAAA record of @p name (picked by @p type) as tried.
+ * Names missing from @p names are ignored.
+ */
+static void update_name_state(knot_dname_t *name, enum kr_transport_protocol type,
+                             trie_t *names)
+{
+       trie_val_t *slot =
+               trie_get_try(names, (char *)name, knot_dname_size(name));
+       if (slot == NULL) {
+               return;
+       }
+
+       struct iter_name_state *state = (struct iter_name_state *)*slot;
+       if (type == KR_TRANSPORT_RESOLVE_A) {
+               state->a_state = RECORD_TRIED;
+       } else if (type == KR_TRANSPORT_RESOLVE_AAAA) {
+               state->aaaa_state = RECORD_TRIED;
+       } else {
+               /* Only the two "resolve" pseudo-protocols make sense here. */
+               assert(0);
+       }
+}
+
+/**
+ * Choose the next transport for an iterative query.
+ *
+ * The local state is first synchronized with the zone cut of @p qry, then
+ * the usable addresses and the resolvable names are handed over to
+ * `select_transport`.
+ *
+ * @param qry       query to choose the transport for
+ * @param transport set to the chosen transport, or to NULL when nothing is
+ *                  viable or when the limit on resolving NS addresses
+ *                  (KR_COUNT_NO_NSADDR_LIMIT) was exceeded
+ */
+void iter_choose_transport(struct kr_query *qry,
+                          struct kr_transport **transport)
+{
+       struct knot_mm *mempool = &qry->request->pool;
+       struct iter_local_state *local_state =
+               (struct iter_local_state *)
+                       qry->server_selection.local_state->private;
+
+       unpack_state_from_zonecut(local_state, qry);
+
+       /* The tries may be empty, but the size of a variable-length array
+        * has to be greater than zero. */
+       const size_t addr_count = trie_weight(local_state->addresses);
+       const size_t name_count = trie_weight(local_state->names);
+       struct choice choices[addr_count ? addr_count : 1];
+       /* We may try to resolve A and AAAA record for each name, so therefore
+        * 2*trie_weight(…) is here. */
+       struct to_resolve resolvable[name_count ? 2 * name_count : 1];
+
+       // Filter valid addresses and names from the tries
+       int choices_len = get_valid_addresses(local_state, choices);
+       int resolvable_len = get_resolvable_names(local_state, resolvable, qry);
+
+       if (choices_len || resolvable_len) {
+               bool tcp = qry->flags.TCP ||
+                          qry->server_selection.local_state->truncated;
+               *transport = select_transport(
+                       choices, choices_len, resolvable, resolvable_len,
+                       qry->server_selection.local_state->timeouts, mempool,
+                       tcp, NULL);
+               if (*transport) {
+                       switch ((*transport)->protocol) {
+                       case KR_TRANSPORT_RESOLVE_A:
+                       case KR_TRANSPORT_RESOLVE_AAAA:
+                               /* Note that we tried resolving this name to not try it again. */
+                               update_name_state((*transport)->ns_name,
+                                                 (*transport)->protocol,
+                                                 local_state->names);
+                               break;
+                       case KR_TRANSPORT_TLS:
+                       case KR_TRANSPORT_TCP:
+                               /* We need to propagate this to flags since it's used in
+                                * other parts of the resolver. */
+                               qry->flags.TCP = true;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+       } else {
+               *transport = NULL;
+               /* Last selected server had broken DNSSEC and now we have no more
+                * servers to ask. We signal this to the rest of resolver by
+                * setting DNSSEC_BOGUS flag. */
+               if (local_state->last_error == KR_SELECTION_DNSSEC_ERROR) {
+                       qry->flags.DNSSEC_BOGUS = true;
+               }
+       }
+
+       /* Limit how many times resolving an NS address may be chosen within
+        * one zone cut. */
+       bool nxnsattack_mitigation = false;
+       enum kr_transport_protocol proto =
+               *transport ? (*transport)->protocol : -1;
+       if (proto == KR_TRANSPORT_RESOLVE_A || proto == KR_TRANSPORT_RESOLVE_AAAA) {
+               if (++local_state->no_ns_addr_count > KR_COUNT_NO_NSADDR_LIMIT) {
+                       *transport = NULL;
+                       nxnsattack_mitigation = true;
+               }
+       }
+
+       WITH_VERBOSE(qry)
+       {
+       KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+       if (*transport) {
+               KR_DNAME_GET_STR(ns_name, (*transport)->ns_name);
+               const char *ns_str = kr_straddr(&(*transport)->address.ip);
+               const char *ip_version;
+               switch (proto)
+               {
+               case KR_TRANSPORT_RESOLVE_A:
+               case KR_TRANSPORT_RESOLVE_AAAA:
+                       ip_version = (proto == KR_TRANSPORT_RESOLVE_A) ? "A" : "AAAA";
+                       VERBOSE_MSG(qry, "=> id: '%05u' choosing to resolve %s: '%s' zone cut: '%s'\n",
+                                   qry->id, ip_version, ns_name, zonecut_str);
+                       break;
+               default:
+                       VERBOSE_MSG(qry, "=> id: '%05u' choosing: '%s'@'%s' with timeout %u ms zone cut: '%s'%s\n",
+                                   qry->id, ns_name, ns_str ? ns_str : "", (*transport)->timeout, zonecut_str,
+                                   (*transport)->safe_mode ? " SAFEMODE" : "");
+                       break;
+               }
+       } else {
+               VERBOSE_MSG(qry, "=> id: '%05u' no suitable transport, zone cut: '%s'%s\n",
+                       qry->id, zonecut_str, nxnsattack_mitigation ? " (stopped due to mitigation for NXNSAttack CVE-2020-12667)" : "");
+       }
+       }
+}
+
+/**
+ * Error feedback of the iterative implementation: remembers the error as
+ * the last one and notes it for the address of @p transport.
+ */
+void iter_error(struct kr_query *qry, const struct kr_transport *transport,
+               enum kr_selection_error sel_error)
+{
+       struct kr_server_selection *selection = &qry->server_selection;
+       if (!selection->initialized) {
+               return;
+       }
+
+       struct iter_local_state *iter_state = selection->local_state->private;
+       struct address_state *state = get_address_state(iter_state, transport);
+       iter_state->last_error = sel_error;
+
+       error(qry, state, transport, sel_error);
+}
+
+/**
+ * RTT feedback of the iterative implementation: passes the RTT on together
+ * with the local state of the address of @p transport.
+ */
+void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport,
+                    unsigned rtt)
+{
+       struct kr_server_selection *selection = &qry->server_selection;
+       if (!selection->initialized) {
+               return;
+       }
+
+       struct iter_local_state *iter_state = selection->local_state->private;
+       update_rtt(qry, get_address_state(iter_state, transport), transport, rtt);
+}
diff --git a/lib/selection_iter.h b/lib/selection_iter.h
new file mode 100644 (file)
index 0000000..f1c798b
--- /dev/null
@@ -0,0 +1,34 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "lib/selection.h"
+
+/**
+ * Table indexed by the KR_SELECTION_* error constants; if an error set to true is encountered, there is no point in asking this server again.
+ */
+static const bool UNRECOVERABLE_ERRORS[] = {
+       [KR_SELECTION_QUERY_TIMEOUT] = false,
+       [KR_SELECTION_TLS_HANDSHAKE_FAILED] = false,
+       [KR_SELECTION_TCP_CONNECT_FAILED] = false,
+       [KR_SELECTION_TCP_CONNECT_TIMEOUT] = false,
+       [KR_SELECTION_REFUSED] = true,
+       [KR_SELECTION_SERVFAIL] = true,
+       [KR_SELECTION_FORMERROR] = false,
+       [KR_SELECTION_NOTIMPL] = true,
+       [KR_SELECTION_OTHER_RCODE] = true,
+       [KR_SELECTION_TRUNCATED] = false,
+       [KR_SELECTION_DNSSEC_ERROR] = true,
+       [KR_SELECTION_LAME_DELEGATION] = true,
+       [KR_SELECTION_BAD_CNAME] = true,
+};
+
+void iter_local_state_alloc(struct knot_mm *mm, void **local_state);
+void iter_choose_transport(struct kr_query *qry,
+                          struct kr_transport **transport);
+void iter_error(struct kr_query *qry, const struct kr_transport *transport,
+               enum kr_selection_error sel_error);
+void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport,
+                    unsigned rtt);
\ No newline at end of file
index ea10f2925a62b13be95a09d30e9966829b707ad7..f2027a90d39d5dbbcc82f636311d9c4318004a53 100644 (file)
@@ -11,7 +11,7 @@
 #include "lib/defines.h"
 #include "lib/generic/array.h"
 #include "lib/module.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/resolve.h"
 
 #include <gnutls/gnutls.h>
index 022ae301e054d87efae2c264a849ebce24be38a1..b4f92fa63037dc6c233d2bbf0a8b7ebd45a3d50c 100644 (file)
@@ -118,6 +118,7 @@ static inline void free_const(const void *what)
        free((void *)what);
 }
 
+// Use this for allocations with mm.
 static inline void *mm_alloc(knot_mm_t *mm, size_t size)
 {
        if (mm) return mm->alloc(mm->ctx, size);
@@ -137,6 +138,7 @@ KR_EXPORT
 void *mm_realloc(knot_mm_t *mm, void *what, size_t size, size_t prev_size);
 
 /** Trivial malloc() wrapper. */
+// Use mm_alloc for allocations into a mempool
 void *mm_malloc(void *ctx, size_t n);
 /** posix_memalign() wrapper. */
 void *mm_malloc_aligned(void *ctx, size_t n);
index 5839b9743d496cde98af372075b44a6e870fb845..7227bf9c0982f613e3e788e1a16a1bb4f9ef8ab0 100644 (file)
@@ -337,17 +337,8 @@ static addrset_info_t fetch_addr(pack_t *addrs, const knot_dname_t *ns, uint16_t
                                        (int)rd->len, (int)rrtype);
                        continue;
                }
-               /* Check RTT cache - whether the IP is usable or not. */
-               kr_nsrep_rtt_lru_entry_t *rtt_e = ctx->cache_rtt
-                       ? lru_get_try(ctx->cache_rtt, (const char *)rd->data, rd->len)
-                       : NULL;
-               const bool unusable = rtt_e && rtt_e->score >= KR_NS_TIMEOUT
-                       && qry->creation_time_mono
-                          < rtt_e->tout_timestamp + ctx->cache_rtt_tout_retry_interval;
-               if (!unusable) {
-                       result = AI_OK;
-                       ++usable_cnt;
-               }
+               result = AI_OK;
+               ++usable_cnt;
 
                ret = pack_obj_push(addrs, rd->data, rd->len);
                assert(!ret); /* didn't fit because of incorrectly reserved memory */
@@ -413,16 +404,10 @@ static int fetch_ns(struct kr_context *ctx, struct kr_zonecut *cut,
                pack_init(**pack);
 
                addrset_info_t infos[2];
+
                /* Fetch NS reputation and decide whether to prefetch A/AAAA records. */
-               unsigned *cached = lru_get_try(ctx->cache_rep,
-                                       (const char *)ns_name, ns_size);
-               unsigned reputation = (cached) ? *cached : 0;
-               infos[0] = (reputation & KR_NS_NOIP4) || qry->flags.NO_IPV4
-                       ? AI_REPUT
-                       : fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry);
-               infos[1] = (reputation & KR_NS_NOIP6) || qry->flags.NO_IPV6
-                       ? AI_REPUT
-                       : fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry);
+               infos[0] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry);
+               infos[1] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry);
 
                #if 0 /* rather unlikely to be useful unless changing some zcut code */
                WITH_VERBOSE(qry) {
index 3bf1203eaffc7d13acfc8781cfef32839eb48003..4054fd22e12a732a82e5cddf10377d60aea4f247 100644 (file)
@@ -11,7 +11,7 @@ function check_stats(got)
        local expected = {
                [1] = {
                        ['type'] = 'DNSKEY',
-                       ['count'] = 2,
+                       ['count'] = 8,  -- This is a trade-off for not hard-failing on DNSSEC errors
                        ['name'] = '.',
                }
        }
index 85c524ad38255f5c33c7416fc5bac99b918b01f7..d209f846c01dbb12133ed394b78025ae05ad3d6d 100644 (file)
@@ -332,6 +332,11 @@ Actions :func:`policy.FORWARD`, :func:`policy.TLS_FORWARD` and :func:`policy.STU
           policy.STUB('192.0.2.1@5353'),
           {todname('1.168.192.in-addr.arpa')}))
 
+.. note:: Forwarding targets must support
+   `EDNS <https://en.wikipedia.org/wiki/Extension_mechanisms_for_DNS>`_ and
+   `0x20 randomization <https://tools.ietf.org/html/draft-vixie-dnsext-dns0x20-00>`_.
+
+
 .. _tls-forwarding:
 
 Forwarding over TLS protocol (DNS-over-TLS)
index e19d7146684699c5f323c05aaa022836c960ea06..b13ee798d0f32252e64d001b3eb5c9ff75b6bac1 100644 (file)
@@ -77,18 +77,13 @@ function policy.MIRROR(target)
 end
 
 -- Override the list of nameservers (forwarders)
-local function set_nslist(qry, list)
+local function set_nslist(req, list)
        local ns_i = 0
        for _, ns in ipairs(list) do
-               -- kr_nsrep_set() can return kr_error(ENOENT), it's OK
-               if ffi.C.kr_nsrep_set(qry, ns_i, ns) == 0 then
+               if ffi.C.kr_forward_add_target(req, ns) == 0 then
                        ns_i = ns_i + 1
                end
        end
-       -- If less than maximum NSs, insert guard to terminate the list
-       if ns_i < 3 then
-               assert(ffi.C.kr_nsrep_set(qry, ns_i, nil) == 0);
-       end
        if ns_i == 0 then
                -- would use assert() but don't want to compose the message if not triggered
                error('no usable address in NS set (check net.ipv4 and '
@@ -102,7 +97,6 @@ function policy.STUB(target)
        if type(target) == 'table' then
                for _, v in pairs(target) do
                        table.insert(list, addr2sock(v, 53))
-                       assert(#list <= 4, 'at most 4 STUB targets are supported')
                end
        else
                table.insert(list, addr2sock(target, 53))
@@ -112,7 +106,7 @@ function policy.STUB(target)
                -- Switch mode to stub resolver, do not track origin zone cut since it's not real authority NS
                qry.flags.STUB = true
                qry.flags.ALWAYS_CUT = false
-               set_nslist(qry, list)
+               set_nslist(req, list)
                return state
        end
 end
@@ -123,7 +117,6 @@ function policy.FORWARD(target)
        if type(target) == 'table' then
                for _, v in pairs(target) do
                        table.insert(list, addr2sock(v, 53))
-                       assert(#list <= 4, 'at most 4 FORWARD targets are supported')
                end
        else
                table.insert(list, addr2sock(target, 53))
@@ -136,7 +129,7 @@ function policy.FORWARD(target)
                qry.flags.ALWAYS_CUT = false
                qry.flags.NO_MINIMIZE = true
                qry.flags.AWAIT_CUT = true
-               set_nslist(qry, list)
+               set_nslist(req, list)
                return state
        end
 end
@@ -145,8 +138,6 @@ end
 function policy.TLS_FORWARD(targets)
        if type(targets) ~= 'table' or #targets < 1 then
                error('TLS_FORWARD argument must be a non-empty table')
-       elseif #targets > 4 then
-               error('TLS_FORWARD supports at most four targets (in a single call)')
        end
 
        local sockaddr_c_set = {}
@@ -182,7 +173,7 @@ function policy.TLS_FORWARD(targets)
                qry.flags.AWAIT_CUT = true
                req.options.TCP = true
                qry.flags.TCP = true
-               set_nslist(qry, nslist)
+               set_nslist(req, nslist)
                return state
        end
 end
index 132c05c496982cb096a980494edbdbf02d295440..f9c47ba82b22de2c96b306493e821f15b7584405 100644 (file)
@@ -147,7 +147,7 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt)
 {
        struct kr_request *req = ctx->req;
        struct kr_query *qry = req->current_query;
-       if (qry->flags.CACHED || !req->upstream.addr) {
+       if (qry->flags.CACHED || !req->upstream.transport) {
                return ctx->state;
        }
 
@@ -158,11 +158,11 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt)
        /* Socket address is encoded into sockaddr_in6 struct that
         * unions with sockaddr_in and differ in sa_family */
        struct sockaddr_in6 *e = &data->upstreams.q.at[data->upstreams.head];
-       const struct sockaddr *src = req->upstream.addr;
-       switch (src->sa_family) {
-       case AF_INET:  memcpy(e, src, sizeof(struct sockaddr_in)); break;
-       case AF_INET6: memcpy(e, src, sizeof(struct sockaddr_in6)); break;
-       default: return ctx->state;
+       const union inaddr *src = &req->upstream.transport->address;
+       switch (src->ip.sa_family) {
+               case AF_INET:  memcpy(e, &src->ip4, sizeof(src->ip4)); break;
+               case AF_INET6: memcpy(e, &src->ip6, sizeof(src->ip6)); break;
+               default: return ctx->state;
        }
        /* Replace port number with the RTT information (cap is UINT16_MAX milliseconds) */
        e->sin6_rtt = req->upstream.rtt;
index 93a937708c65e6e1ae8faab1d24e45908b68f222..4389293b2b5a16267c67cc17fb3ea728ce915d2d 100644 (file)
@@ -111,7 +111,7 @@ function M.check_answer(desc, qname, qtype, expected_rcode, expected_rdata)
                end
        )
 
-       for delay = 0.1, 4, 0.5 do -- total max 14.9s in 8 steps
+       for delay = 0.1, 5, 0.5 do -- total max 23.5s in 10 steps
                if done then return end
                worker.sleep(delay)
        end