]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
first buildable version of server selection rewrite
authorŠtěpán Balážik <stepan.balazik@nic.cz>
Fri, 20 Mar 2020 18:43:11 +0000 (19:43 +0100)
committerŠtěpán Balážik <stepan.balazik@nic.cz>
Thu, 15 Oct 2020 11:22:21 +0000 (13:22 +0200)
28 files changed:
bench/bench_lru.c
daemon/bindings/cache.c
daemon/engine.c
daemon/lua/kres-gen.lua
daemon/lua/kres-gen.sh
daemon/worker.c
daemon/worker.h
daemon/zimport.c
lib/layer/iterate.c
lib/layer/validate.c
lib/meson.build
lib/nsrep.c [deleted file]
lib/nsrep.h [deleted file]
lib/resolve.c
lib/resolve.h
lib/rplan.c
lib/rplan.h
lib/selection.c [new file with mode: 0644]
lib/selection.h [new file with mode: 0644]
lib/selection_forward.c [new file with mode: 0644]
lib/selection_forward.h [new file with mode: 0644]
lib/selection_iter.c [new file with mode: 0644]
lib/selection_iter.h [new file with mode: 0644]
lib/utils.c
lib/utils.h
lib/zonecut.c
modules/policy/policy.lua
modules/stats/stats.c

index a885c2915250ebed524577baa0e14d4143d5fb0d..9fbf0d6fcc189c1f6bbe39d4f74df517c0d5c473 100644 (file)
@@ -11,7 +11,7 @@
 
 #include "contrib/ucw/lib.h"
 #include "daemon/engine.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 
 typedef kr_nsrep_lru_t lru_bench_t;
 
index 7b08374e364822b29e6c5837bd21a3145a05b1e4..4a8afd931843f9680236f7591bdba90627099e08 100644 (file)
@@ -268,8 +268,6 @@ static int cache_clear_everything(lua_State *L)
 
        /* Clear reputation tables */
        struct kr_context *ctx = &the_worker->engine->resolver;
-       lru_reset(ctx->cache_rtt);
-       lru_reset(ctx->cache_rep);
        lru_reset(ctx->cache_cookie);
        lua_pushboolean(L, true);
        return 1;
index 52894f891c6a424ffa1e0e3af14d8dc96c959728..7801936bbb92f43fd5a1c0268a61974c1e0963c3 100644 (file)
@@ -21,8 +21,7 @@
 #include "kresconfig.h"
 #include "daemon/engine.h"
 #include "daemon/ffimodule.h"
-#include "daemon/worker.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/cache/api.h"
 #include "lib/defines.h"
 #include "lib/cache/cdb_lmdb.h"
@@ -439,7 +438,6 @@ static int init_resolver(struct engine *engine)
        engine->resolver.negative_anchors = map_make(NULL);
        engine->resolver.pool = engine->pool;
        engine->resolver.modules = &engine->modules;
-       engine->resolver.cache_rtt_tout_retry_interval = KR_NS_TIMEOUT_RETRY_INTERVAL;
        /* Create OPT RR */
        engine->resolver.downstream_opt_rr = mm_alloc(engine->pool, sizeof(knot_rrset_t));
        engine->resolver.upstream_opt_rr = mm_alloc(engine->pool, sizeof(knot_rrset_t));
@@ -452,9 +450,6 @@ static int init_resolver(struct engine *engine)
        engine->resolver.tls_padding = -1;
        /* Empty init; filled via ./lua/postconfig.lua */
        kr_zonecut_init(&engine->resolver.root_hints, (const uint8_t *)"", engine->pool);
-       /* Open NS rtt + reputation cache */
-       lru_create(&engine->resolver.cache_rtt, LRU_RTT_SIZE, NULL, NULL);
-       lru_create(&engine->resolver.cache_rep, LRU_REP_SIZE, NULL, NULL);
        lru_create(&engine->resolver.cache_cookie, LRU_COOKIES_SIZE, NULL, NULL);
 
        /* Load basic modules */
@@ -638,8 +633,6 @@ void engine_deinit(struct engine *engine)
        kr_cache_close(&engine->resolver.cache);
 
        /* The LRUs are currently malloc-ated and need to be freed. */
-       lru_free(engine->resolver.cache_rtt);
-       lru_free(engine->resolver.cache_rep);
        lru_free(engine->resolver.cache_cookie);
 
        network_deinit(&engine->net);
index a689b46ddd8391298d2bf411b093bbbd5a11ec3e..9afe512bb65b8dd758ea91dcd7322ecca3f2e3fb 100644 (file)
@@ -7,6 +7,13 @@ typedef struct knot_dump_style knot_dump_style_t;
 extern const knot_dump_style_t KNOT_DUMP_STYLE_DEFAULT;
 struct kr_cdb_api {};
 struct lru {};
+typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t;
+typedef struct {
+       uint16_t pos;
+       uint16_t flags;
+       uint16_t compress_ptr[16];
+} knot_rrinfo_t;
+typedef unsigned char knot_dname_t;
 
 typedef struct knot_mm {
        void *ctx, *alloc, *free;
@@ -16,13 +23,8 @@ typedef void *(*map_alloc_f)(void *, size_t);
 typedef void (*map_free_f)(void *baton, void *ptr);
 typedef void (*trace_log_f) (const struct kr_request *, const char *);
 typedef void (*trace_callback_f)(struct kr_request *);
-typedef enum {KNOT_ANSWER, KNOT_AUTHORITY, KNOT_ADDITIONAL} knot_section_t;
-typedef struct {
-       uint16_t pos;
-       uint16_t flags;
-       uint16_t compress_ptr[16];
-} knot_rrinfo_t;
-typedef unsigned char knot_dname_t;
+typedef bool (*addr_info_f)(struct sockaddr*);
+typedef void (*async_resolution_f)(knot_dname_t*, enum knot_rr_type);
 typedef struct {
        knot_dname_t *_owner;
        uint32_t _ttl;
@@ -175,7 +177,7 @@ struct kr_request {
        } qsource;
        struct {
                unsigned int rtt;
-               const struct sockaddr *addr;
+               const struct kr_transport *transport;
        } upstream;
        struct kr_qflags options;
        int state;
@@ -190,6 +192,14 @@ struct kr_request {
        trace_callback_f trace_finish;
        int vars_ref;
        knot_mm_t pool;
+       struct {
+               addr_info_f is_tls_capable;
+               addr_info_f is_tcp_connected;
+               addr_info_f is_tcp_waiting;
+               async_resolution_f async_ns_resolution;
+               union inaddr *forwarding_targets;
+               size_t forward_targets_num;
+       } selection_context;
        unsigned int uid;
        unsigned int count_no_nsaddr;
        unsigned int count_fail_row;
@@ -259,19 +269,20 @@ struct kr_module {
        void *lib;
        void *data;
 };
+struct kr_server_selection {
+       _Bool initialized;
+       void (*choose_transport)(struct kr_query *, struct kr_transport **);
+       void (*success)(struct kr_query *, const struct kr_transport *);
+       void (*update_rtt)(struct kr_query *, const struct kr_transport *, unsigned int);
+       void (*error)(struct kr_query *, const struct kr_transport *, enum kr_selection_error);
+       void *local_state;
+};
 kr_layer_t kr_layer_t_static;
 typedef int32_t (*kr_stale_cb)(int32_t ttl, const knot_dname_t *owner, uint16_t type,
                                const struct kr_query *qry);
 
 void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner,
                        uint16_t type, uint16_t rclass, uint32_t ttl);
-struct kr_nsrep {
-       unsigned int score;
-       unsigned int reputation;
-       const knot_dname_t *name;
-       struct kr_context *ctx;
-       /* beware: hidden stub, to avoid hardcoding sockaddr lengths */
-};
 struct kr_query {
        struct kr_query *parent;
        knot_dname_t *sname;
@@ -292,7 +303,7 @@ struct kr_query {
        struct kr_query *cname_parent;
        struct kr_request *request;
        kr_stale_cb stale_cb;
-       struct kr_nsrep ns;
+       struct kr_server_selection server_selection;
 };
 struct kr_context {
        struct kr_qflags options;
@@ -302,8 +313,13 @@ struct kr_context {
        map_t negative_anchors;
        struct kr_zonecut root_hints;
        struct kr_cache cache;
+       unsigned int cache_rtt_tout_retry_interval;
        char _stub[];
 };
+struct kr_transport {
+       knot_dname_t *name;
+       /* beware: hidden stub, to avoid hardcoding sockaddr lengths */
+};
 const char *knot_strerror(int);
 knot_dname_t *knot_dname_copy(const knot_dname_t *, knot_mm_t *);
 knot_dname_t *knot_dname_from_str(uint8_t *, const char *, size_t);
@@ -332,7 +348,7 @@ struct kr_query *kr_rplan_push(struct kr_rplan *, struct kr_query *, const knot_
 int kr_rplan_pop(struct kr_rplan *, struct kr_query *);
 struct kr_query *kr_rplan_resolved(struct kr_rplan *);
 struct kr_query *kr_rplan_last(struct kr_rplan *);
-int kr_nsrep_set(struct kr_query *, size_t, const struct sockaddr *);
+int kr_forward_add_target(struct kr_request *, size_t, const struct sockaddr *);
 void kr_log_req(const struct kr_request * const, uint32_t, const unsigned int, const char *, const char *, ...);
 void kr_log_q(const struct kr_query * const, const char *, const char *, ...);
 int kr_make_query(struct kr_query *, knot_pkt_t *);
index bd5c1fd4618f1caa250a88581c8492a0f129ce26..256d4fb225ce90fff8aa383ebf7129df99cddba9 100755 (executable)
@@ -60,6 +60,14 @@ struct kr_cdb_api {};
 struct lru {};
 "
 
+${CDEFS} ${LIBKRES} types <<-EOF
+       knot_section_t
+       knot_rrinfo_t
+       knot_dname_t
+       #knot_rdata_t
+       #knot_rdataset_t
+EOF
+
 # The generator doesn't work well with typedefs of functions.
 printf "
 typedef struct knot_mm {
@@ -70,16 +78,10 @@ typedef void *(*map_alloc_f)(void *, size_t);
 typedef void (*map_free_f)(void *baton, void *ptr);
 typedef void (*trace_log_f) (const struct kr_request *, const char *);
 typedef void (*trace_callback_f)(struct kr_request *);
+typedef bool (*addr_info_f)(struct sockaddr*);
+typedef void (*async_resolution_f)(knot_dname_t*, enum knot_rr_type);
 "
 
-${CDEFS} ${LIBKRES} types <<-EOF
-       knot_section_t
-       knot_rrinfo_t
-       knot_dname_t
-       #knot_rdata_t
-       #knot_rdataset_t
-EOF
-
 genResType() {
        echo "$1" | ${CDEFS} ${LIBKRES} types
 }
@@ -123,6 +125,7 @@ ${CDEFS} ${LIBKRES} types <<-EOF
        # lib/module.h
        struct kr_prop
        struct kr_module
+       struct kr_server_selection
 EOF
 
 # a static variable; the line might not be simple to generate
@@ -138,14 +141,15 @@ void kr_rrset_init(knot_rrset_t *rrset, knot_dname_t *owner,
 
 ## Some definitions would need too many deps, so shorten them.
 
-genResType "struct kr_nsrep" | sed '/union/,$ d'
-printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
-
 genResType "struct kr_query"
 
-genResType "struct kr_context" | sed '/kr_nsrep_rtt_lru_t/,$ d'
+genResType "struct kr_context" | sed '/module_array_t/,$ d'
 printf "\tchar _stub[];\n};\n"
 
+
+echo "struct kr_transport" | ${CDEFS} ${KRESD} types | sed '/union /,$ d'
+printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
+
 ## libknot API
 ${CDEFS} libknot functions <<-EOF
 # Utils
@@ -186,8 +190,8 @@ ${CDEFS} ${LIBKRES} functions <<-EOF
        kr_rplan_pop
        kr_rplan_resolved
        kr_rplan_last
-# Nameservers
-       kr_nsrep_set
+# Forwarding
+       kr_forward_add_target
 # Utils
        kr_log_req
        kr_log_q
@@ -274,6 +278,7 @@ printf "\t/* beware: hidden stub, to avoid hardcoding sockaddr lengths */\n};\n"
 echo "struct qr_task" | ${CDEFS} ${KRESD} types | sed '/pktbuf/,$ d'
 printf "\t/* beware: hidden stub, to avoid qr_tasklist_t */\n};\n"
 
+
 ${CDEFS} ${KRESD} functions <<-EOF
        worker_resolve_exec
        worker_resolve_mk_pkt
index 796e2cdb0fe21ef6fe26a02acfe54a137d61bcb9..171a425a15ebbb66ac4444cdb8f18e6062f0dc54 100644 (file)
@@ -74,15 +74,18 @@ struct qr_task
        qr_tasklist_t waiting;
        struct session *pending[MAX_PENDING];
        uint16_t pending_count;
-       uint16_t addrlist_count;
-       uint16_t addrlist_turn;
+       // uint16_t addrlist_count;
+       // uint16_t addrlist_turn;
        uint16_t timeouts;
        uint16_t iter_count;
-       struct sockaddr *addrlist;
+       // struct sockaddr *addrlist;
        uint32_t refs;
        bool finished : 1;
        bool leading  : 1;
        uint64_t creation_time;
+       uint64_t send_time;
+       uint64_t recv_time;
+       struct kr_transport *transport;
 };
 
 
@@ -111,15 +114,15 @@ static int qr_task_send(struct qr_task *task, struct session *session,
                        const struct sockaddr *addr, knot_pkt_t *pkt);
 static int qr_task_finalize(struct qr_task *task, int state);
 static void qr_task_complete(struct qr_task *task);
-static struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
                                                 const struct sockaddr *addr);
 static int worker_add_tcp_waiting(struct worker_ctx *worker,
                                  const struct sockaddr *addr,
                                  struct session *session);
-static struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
                                               const struct sockaddr *addr);
 static void on_tcp_connect_timeout(uv_timer_t *timer);
-static void on_retransmit(uv_timer_t *req);
+static void on_udp_timeout(uv_timer_t *req);
 static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt);
 
 
@@ -257,6 +260,28 @@ static int subreq_key(char *dst, knot_pkt_t *pkt)
                        knot_pkt_qtype(pkt), knot_pkt_qtype(pkt));
 }
 
+/* Helper functions for transport selection */
+/** Tell whether a TLS client configuration entry exists for `address`.
+ *
+ * Looks the address up in the worker's `net.tls_client_params` table and
+ * returns true iff an entry was found (non-NULL pointer converted to bool). */
+static inline bool is_tls_capable(struct sockaddr *address) {
+       tls_client_param_t *tls_entry = tls_client_param_get(the_worker->engine->net.tls_client_params, address);
+       return tls_entry;
+}
+
+/** Tell whether worker_find_tcp_connected() finds a session for `address`
+ * on the global worker (true iff the returned session pointer is non-NULL). */
+static inline bool is_tcp_connected(struct sockaddr *address) {
+       return worker_find_tcp_connected(the_worker, address);
+}
+
+/** Tell whether worker_find_tcp_waiting() finds a session for `address`
+ * on the global worker (true iff the returned session pointer is non-NULL). */
+static inline bool is_tcp_waiting(struct sockaddr *address) {
+       return worker_find_tcp_waiting(the_worker, address);
+}
+
+/** Fire-and-forget resolution of `name`/`type` in class IN.
+ *
+ * Builds a query packet with all-zero query flags, starts a resolution task
+ * for it and executes that task; the caller does not wait for the result.
+ * Does nothing if the packet or the task cannot be created.
+ *
+ * @param name  wire-format domain name to resolve
+ * @param type  RR type to ask for
+ */
+void async_ns_resolution(knot_dname_t *name, enum knot_rr_type type) {
+    struct kr_qflags flags;
+    memset(&flags, 0, sizeof(struct kr_qflags));
+    knot_pkt_t* pkt = worker_resolve_mk_pkt_dname(name, type, KNOT_CLASS_IN, &flags);
+    if (!pkt) {
+        return; /* packet could not be built, nothing to start */
+    }
+    struct qr_task *task = worker_resolve_start(pkt, flags);
+    if (task) {
+        /* NOTE(review): worker_resolve_start() only creates the task; it has
+         * to be driven by worker_resolve_exec() or the query never runs. */
+        worker_resolve_exec(task, pkt);
+    }
+    /* The packet comes from knot_pkt_new(), which owns further allocations
+     * besides the struct itself; plain free() would leak them. */
+    knot_pkt_free(pkt);
+}
+
 /** Create and initialize a request_ctx (on a fresh mempool).
  *
  * handle and addr point to the source of the request, and they are NULL
@@ -310,6 +335,10 @@ static struct request_ctx *request_create(struct worker_ctx *worker,
                req->qsource.addr = &ctx->source.addr.ip;
        }
 
+       req->selection_context.is_tls_capable = is_tls_capable;
+       req->selection_context.is_tcp_connected = is_tcp_connected;
+       req->selection_context.is_tcp_waiting = is_tcp_waiting;
+
        worker->stats.rconcurrent += 1;
 
        return ctx;
@@ -492,7 +521,6 @@ static void qr_task_complete(struct qr_task *task)
 /* This is called when we send subrequest / answer */
 int qr_task_on_send(struct qr_task *task, uv_handle_t *handle, int status)
 {
-
        if (task->finished) {
                assert(task->leading == false);
                qr_task_complete(task);
@@ -505,26 +533,17 @@ int qr_task_on_send(struct qr_task *task, uv_handle_t *handle, int status)
        assert(s);
 
        if (handle->type == UV_UDP && session_flags(s)->outgoing) {
-               /* Start the timeout timer for UDP here, since this is the closest
-                * to the wire we can get. */
-               struct kr_request *req = &task->ctx->req;
-               /* Check current query NSLIST */
-               struct kr_query *qry = array_tail(req->rplan.pending);
+               // This should ensure that we are only dealing with our question to upstream
+               assert(!knot_wire_get_qr(task->pktbuf->wire));
+               // start the timer
+               struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
                assert(qry != NULL);
-               /* Retransmit at default interval, or more frequently if the mean
-                * RTT of the server is better. If the server is glued, use default rate. */
-               size_t timeout = qry->ns.score;
-               if (timeout > KR_NS_GLUED) {
-                       /* We don't have information about variance in RTT, expect +10ms */
-                       timeout = MIN(qry->ns.score + 10, KR_CONN_RETRY);
-               } else {
-                       timeout = KR_CONN_RETRY;
-               }
 
-               int ret = session_timer_start(s, on_retransmit, timeout, 0);
+               size_t timeout = task->transport->timeout;
+               int ret = session_timer_start(s, on_udp_timeout, timeout, 0);
                /* Start next step with timeout, fatal if can't start a timer. */
                if (ret != 0) {
-                       subreq_finalize(task, &qry->ns.addr->ip, task->pktbuf);
+                       subreq_finalize(task, &task->transport->address.ip, task->pktbuf);
                        qr_task_finalize(task, KR_STATE_FAIL);
                }
        }
@@ -614,6 +633,9 @@ static int qr_task_send(struct qr_task *task, struct session *session,
        qr_task_ref(task);
 
        struct worker_ctx *worker = ctx->worker;
+       /* Note time for upstream RTT */
+       task->send_time = kr_now();
+       task->recv_time = 0; // task structure is being reused so we have to zero this out here
        /* Send using given protocol */
        assert(!session_flags(session)->closing);
        if (session_flags(session)->has_http) {
@@ -726,11 +748,9 @@ static int session_tls_hs_cb(struct session *session, int status)
        if (status) {
                struct qr_task *task = session_waitinglist_get(session);
                if (task) {
-                       struct kr_qflags *options = &task->ctx->req.options;
-                       unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(NULL, peer, score,
-                                           the_worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
+                       // TLS handshake failed, report it to server selection
+                       struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
+                       qry->server_selection.error(qry, task->transport, KR_SELECTION_TLS_HANDSHAKE_FAILED);
                }
 #ifndef NDEBUG
                else {
@@ -905,14 +925,8 @@ static void on_connect(uv_connect_t *req, int status)
                worker_del_tcp_waiting(worker, peer);
                struct qr_task *task = session_waitinglist_get(session);
                if (task && status != UV_ETIMEDOUT) {
-                       /* Penalize upstream.
-                        * In case of UV_ETIMEDOUT upstream has been
-                        * already penalized in on_tcp_connect_timeout() */
-                       struct kr_qflags *options = &task->ctx->req.options;
-                       unsigned score = options->FORWARD || options->STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(NULL, peer, score,
-                                           worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
+                       struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
+                       qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED);
                }
                assert(session_tasklist_is_empty(session));
                session_waitinglist_retry(session, false);
@@ -994,10 +1008,7 @@ static void on_tcp_connect_timeout(uv_timer_t *timer)
                            peer_str ? peer_str : "");
        }
 
-       unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-       kr_nsrep_update_rtt(NULL, peer, score,
-                           worker->engine->resolver.cache_rtt,
-                           KR_NS_UPDATE_NORESET);
+       qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_TIMEOUT);
 
        worker->stats.timeout += session_waitinglist_get_len(session);
        session_waitinglist_retry(session, true);
@@ -1022,34 +1033,28 @@ static void on_udp_timeout(uv_timer_t *timer)
 
        uv_timer_stop(timer);
 
-       /* Penalize all tried nameservers with a timeout. */
        struct qr_task *task = session_tasklist_get_first(session);
        struct worker_ctx *worker = task->ctx->worker;
+
        if (task->leading && task->pending_count > 0) {
                struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
-               struct sockaddr_in6 *addrlist = (struct sockaddr_in6 *)task->addrlist;
-               for (uint16_t i = 0; i < MIN(task->pending_count, task->addrlist_count); ++i) {
-                       struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]);
-                       WITH_VERBOSE(qry) {
-                               char *addr_str = kr_straddr(choice);
-                               VERBOSE_MSG(qry, "=> server: '%s' flagged as 'bad'\n", addr_str ? addr_str : "");
-                       }
-                       unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-                       kr_nsrep_update_rtt(&qry->ns, choice, score,
-                                           worker->engine->resolver.cache_rtt,
-                                           KR_NS_UPDATE_NORESET);
-               }
+               qry->server_selection.error(qry, task->transport, KR_SELECTION_TIMEOUT);
        }
+
        task->timeouts += 1;
        worker->stats.timeout += 1;
        qr_task_step(task, NULL, NULL);
 }
 
-static uv_handle_t *retransmit(struct qr_task *task)
+static uv_handle_t *transmit(struct qr_task *task)
 {
        uv_handle_t *ret = NULL;
-       if (task && task->addrlist && task->addrlist_count > 0) {
-               struct sockaddr_in6 *choice = &((struct sockaddr_in6 *)task->addrlist)[task->addrlist_turn];
+
+       if (task) {
+               struct kr_transport* transport = task->transport;
+
+               struct sockaddr_in6 *choice = (struct sockaddr_in6 *)&transport->address;
+
                if (!choice) {
                        return ret;
                }
@@ -1058,7 +1063,7 @@ static uv_handle_t *retransmit(struct qr_task *task)
                }
                /* Checkout answer before sending it */
                struct request_ctx *ctx = task->ctx;
-               if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)choice, SOCK_DGRAM, task->pktbuf) != 0) {
+               if (kr_resolve_checkout(&ctx->req, NULL, transport, task->pktbuf) != 0) {
                        return ret;
                }
                ret = ioreq_spawn(ctx->worker, SOCK_DGRAM, choice->sin6_family, false, false);
@@ -1077,31 +1082,12 @@ static uv_handle_t *retransmit(struct qr_task *task)
                } else {
                        task->pending[task->pending_count] = session;
                        task->pending_count += 1;
-                       task->addrlist_turn = (task->addrlist_turn + 1) %
-                                              task->addrlist_count; /* Round robin */
                        session_start_read(session); /* Start reading answer */
                }
        }
        return ret;
 }
 
-static void on_retransmit(uv_timer_t *req)
-{
-       struct session *session = req->data;
-       assert(session_tasklist_get_len(session) == 1);
-
-       uv_timer_stop(req);
-       struct qr_task *task = session_tasklist_get_first(session);
-       if (retransmit(task) == NULL) {
-               /* Not possible to spawn request, start timeout timer with remaining deadline. */
-               struct kr_qflags *options = &task->ctx->req.options;
-               uint64_t timeout = options->FORWARD || options->STUB ? KR_NS_FWD_TIMEOUT / 2 :
-                                  KR_CONN_RTT_MAX - task->pending_count * KR_CONN_RETRY;
-               uv_timer_start(req, on_udp_timeout, timeout, 0);
-       } else {
-               uv_timer_start(req, on_retransmit, KR_CONN_RETRY, 0);
-       }
-}
 
 static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt)
 {
@@ -1129,6 +1115,7 @@ static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_
                        struct kr_query *qry = array_tail(follower->ctx->req.rplan.pending);
                        qry->id = leader_qry->id;
                        qry->secret = leader_qry->secret;
+                       follower->transport = task->transport;
                        leader_qry->secret = 0; /* Next will be already decoded */
                }
                qr_task_step(follower, packet_source, pkt);
@@ -1248,7 +1235,7 @@ static int udp_task_step(struct qr_task *task,
                return kr_ok(); /* Will be notified when outgoing query finishes. */
        }
        /* Start transmitting */
-       uv_handle_t *handle = retransmit(task);
+       uv_handle_t *handle = transmit(task);
        if (handle == NULL) {
                subreq_finalize(task, packet_source, packet);
                return qr_task_finalize(task, KR_STATE_FAIL);
@@ -1396,15 +1383,7 @@ static int tcp_task_make_connection(struct qr_task *task, const struct sockaddr
                worker_del_tcp_waiting(worker, addr);
                free(conn);
                session_close(session);
-               unsigned score = qry->flags.FORWARD || qry->flags.STUB ? KR_NS_FWD_DEAD : KR_NS_DEAD;
-               kr_nsrep_update_rtt(NULL, peer, score,
-                                   worker->engine->resolver.cache_rtt,
-                                   KR_NS_UPDATE_NORESET);
-               WITH_VERBOSE (qry) {
-                       const char *peer_str = kr_straddr(peer);
-                       kr_log_verbose( "[wrkr]=> connect to '%s' failed (%s), flagged as 'bad'\n",
-                                       peer_str ? peer_str : "", uv_strerror(ret));
-               }
+               qry->server_selection.error(qry, task->transport, KR_SELECTION_TCP_CONNECT_FAILED);
                return kr_error(EAGAIN);
        }
 
@@ -1428,7 +1407,7 @@ static int tcp_task_step(struct qr_task *task,
        assert(task->pending_count == 0);
 
        /* target */
-       const struct sockaddr *addr = task->addrlist;
+       const struct sockaddr *addr = &task->transport->address.ip;
        if (addr->sa_family == AF_UNSPEC) {
                /* Target isn't defined. Finalize task with SERVFAIL.
                 * Although task->pending_count is zero, there are can be followers,
@@ -1438,8 +1417,7 @@ static int tcp_task_step(struct qr_task *task,
        }
        /* Checkout task before connecting */
        struct request_ctx *ctx = task->ctx;
-       if (kr_resolve_checkout(&ctx->req, NULL, (struct sockaddr *)addr,
-                               SOCK_STREAM, task->pktbuf) != 0) {
+       if (kr_resolve_checkout(&ctx->req, NULL, task->transport, task->pktbuf) != 0) {
                subreq_finalize(task, packet_source, packet);
                return qr_task_finalize(task, KR_STATE_FAIL);
        }
@@ -1488,10 +1466,6 @@ static int qr_task_step(struct qr_task *task,
        assert(ctx);
        struct kr_request *req = &ctx->req;
        struct worker_ctx *worker = ctx->worker;
-       int sock_type = -1;
-       task->addrlist = NULL;
-       task->addrlist_count = 0;
-       task->addrlist_turn = 0;
 
        if (worker->too_many_open) {
                /* */
@@ -1502,22 +1476,29 @@ static int qr_task_step(struct qr_task *task,
                } else {
                        if (packet && kr_rplan_empty(rplan)) {
                                /* new query; TODO - make this detection more obvious */
-                               kr_resolve_consume(req, packet_source, packet);
+                               kr_resolve_consume(req, &task->transport, packet);
                        }
                        return qr_task_finalize(task, KR_STATE_FAIL);
                }
        }
 
-       int state = kr_resolve_consume(req, packet_source, packet);
+       // Report network RTT back to server selection
+       if (task->send_time && task->recv_time) {
+               struct kr_query *qry = array_tail(req->rplan.pending);
+               qry->server_selection.update_rtt(qry, task->transport, task->recv_time - task->send_time);
+       }
+
+       int state = kr_resolve_consume(req, &task->transport, packet);
+
+       task->transport = NULL;
        while (state == KR_STATE_PRODUCE) {
-               state = kr_resolve_produce(req, &task->addrlist,
-                                          &sock_type, task->pktbuf);
+               state = kr_resolve_produce(req, &task->transport, task->pktbuf);
                if (unlikely(++task->iter_count > KR_ITER_LIMIT ||
                             task->timeouts >= KR_TIMEOUT_LIMIT)) {
 
                        #ifndef NOVERBOSELOG
                        struct kr_rplan *rplan = &req->rplan;
-                       struct kr_query *last  = kr_rplan_last(rplan);
+                       struct kr_query *last = kr_rplan_last(rplan);
                        if (task->iter_count > KR_ITER_LIMIT) {
                                VERBOSE_MSG(last, "canceling query due to exceeded iteration count limit of %d\n", KR_ITER_LIMIT);
                        }
@@ -1533,47 +1514,22 @@ static int qr_task_step(struct qr_task *task,
        /* We're done, no more iterations needed */
        if (state & (KR_STATE_DONE|KR_STATE_FAIL)) {
                return qr_task_finalize(task, state);
-       } else if (!task->addrlist || sock_type < 0) {
+       } else if (!task->transport->protocol) {
                return qr_task_step(task, NULL, NULL);
        }
 
-       /* Count available address choices */
-       struct sockaddr_in6 *choice = (struct sockaddr_in6 *)task->addrlist;
-       for (size_t i = 0; i < KR_NSREP_MAXADDR && choice->sin6_family != AF_UNSPEC; ++i) {
-               task->addrlist_count += 1;
-               choice += 1;
-       }
-
-       /* Upgrade to TLS if the upstream address is configured as DoT capable. */
-       if (task->addrlist_count > 0 && kr_inaddr_port(task->addrlist) == KR_DNS_PORT) {
-               /* TODO if there are multiple addresses (task->addrlist_count > 1)
-                * check all of them. */
-               struct network *net = &worker->engine->net;
-               /* task->addrlist has to contain TLS port before tls_client_param_get() call */
-               kr_inaddr_set_port(task->addrlist, KR_DNS_TLS_PORT);
-               tls_client_param_t *tls_entry =
-                       tls_client_param_get(net->tls_client_params, task->addrlist);
-               if (tls_entry) {
-                       packet_source = NULL;
-                       sock_type = SOCK_STREAM;
-                       /* TODO in this case in tcp_task_make_connection() will be performed
-                        * redundant map_get() call. */
-               } else {
-                       /* The function is fairly cheap, so we just change there and back. */
-                       kr_inaddr_set_port(task->addrlist, KR_DNS_PORT);
-               }
-       }
-
-       int ret = 0;
-       if (sock_type == SOCK_DGRAM) {
-               /* Start fast retransmit with UDP. */
-               ret = udp_task_step(task, packet_source, packet);
-       } else {
-               /* TCP. Connect to upstream or send the query if connection already exists. */
-               assert (sock_type == SOCK_STREAM);
-               ret = tcp_task_step(task, packet_source, packet);
+       switch (task->transport->protocol)
+       {
+       case KR_TRANSPORT_UDP:
+               return udp_task_step(task, packet_source, packet);
+               break;
+       case KR_TRANSPORT_TCP: // fall through
+       case KR_TRANSPORT_TLS:
+               return tcp_task_step(task, packet_source, packet);
+       default:
+               assert(0);
+               break;
        }
-       return ret;
 }
 
 static int parse_packet(knot_pkt_t *query)
@@ -1667,12 +1623,15 @@ int worker_submit(struct session *session, const struct sockaddr *peer, knot_pkt
                }
                assert(!session_flags(session)->closing);
                addr = peer;
+               /* Note receive time for RTT calculation */
+               task->recv_time = kr_now();
        }
        assert(uv_is_closing(session_get_handle(session)) == false);
 
        /* Packet was successfully parsed.
         * Task was created (found). */
        session_touch(session);
+
        /* Consume input and produce next message */
        return qr_task_step(task, addr, pkt);
 }
@@ -1727,7 +1686,7 @@ int worker_del_tcp_connected(struct worker_ctx *worker,
        return map_del_tcp_session(&worker->tcp_connected, addr);
 }
 
-static struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
                                                 const struct sockaddr* addr)
 {
        return map_find_tcp_session(&worker->tcp_connected, addr);
@@ -1753,7 +1712,7 @@ int worker_del_tcp_waiting(struct worker_ctx *worker,
        return map_del_tcp_session(&worker->tcp_waiting, addr);
 }
 
-static struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
                                               const struct sockaddr* addr)
 {
        return map_find_tcp_session(&worker->tcp_waiting, addr);
@@ -1827,12 +1786,9 @@ int worker_end_tcp(struct session *session)
        return kr_ok();
 }
 
-knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass,
+knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass,
                                   const struct kr_qflags *options)
 {
-       uint8_t qname[KNOT_DNAME_MAXLEN];
-       if (!knot_dname_from_str(qname, qname_str, sizeof(qname)))
-               return NULL;
        knot_pkt_t *pkt = knot_pkt_new(NULL, KNOT_EDNS_MAX_UDP_PAYLOAD, NULL);
        if (!pkt)
                return NULL;
@@ -1867,6 +1823,15 @@ knot_pkt_t * worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16
        return pkt;
 }
 
+knot_pkt_t *worker_resolve_mk_pkt(const char *qname_str, uint16_t qtype, uint16_t qclass,
+                                  const struct kr_qflags *options)
+{
+       /* Parse the textual name into a local buffer; a parse failure yields NULL and no packet. */
+       uint8_t qname[KNOT_DNAME_MAXLEN];
+       return knot_dname_from_str(qname, qname_str, sizeof(qname))
+               ? worker_resolve_mk_pkt_dname(qname, qtype, qclass, options) : NULL;
+}
+
 struct qr_task *worker_resolve_start(knot_pkt_t *query, struct kr_qflags options)
 {
        struct worker_ctx *worker = the_worker;
index 0d7f3f82edec8c250708c27405ed9d209b0abbcb..1c64e1bae4fa89162588a0b5ff101d8be1598f69 100644 (file)
@@ -44,6 +44,9 @@ int worker_submit(struct session *session, const struct sockaddr *peer, knot_pkt
  */
 int worker_end_tcp(struct session *session);
 
+KR_EXPORT knot_pkt_t *worker_resolve_mk_pkt_dname(knot_dname_t *qname, uint16_t qtype, uint16_t qclass,
+                                  const struct kr_qflags *options);
+
 /**
  * Create a packet suitable for worker_resolve_start().  All in malloc() memory.
  */
@@ -93,6 +96,10 @@ int worker_del_tcp_connected(struct worker_ctx *worker,
                             const struct sockaddr *addr);
 int worker_del_tcp_waiting(struct worker_ctx *worker,
                           const struct sockaddr* addr);
+struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
+                                              const struct sockaddr* addr);
+struct session* worker_find_tcp_connected(struct worker_ctx *worker,
+                                              const struct sockaddr* addr);
 knot_pkt_t *worker_task_get_pktbuf(const struct qr_task *task);
 
 struct request_ctx *worker_task_get_request(struct qr_task *task);
@@ -133,7 +140,7 @@ struct worker_stats {
 /** @cond internal */
 
 /** Number of request within timeout window. */
-#define MAX_PENDING KR_NSREP_MAXADDR
+#define MAX_PENDING 4
 
 /** Maximum response time from TCP upstream, milliseconds */
 #define MAX_TCP_INACTIVITY (KR_RESOLVE_TIME_LIMIT + KR_CONN_RTT_MAX)
index b6c5f276d05256f8f3959f6c042cf57850c1d3a5..b51f2edfdef4fe126a4def5e2f7bffb05f7f0902 100644 (file)
@@ -33,6 +33,7 @@
  */
 
 #include <inttypes.h> /* PRIu64 */
+#include <limits.h>
 #include <stdlib.h>
 #include <uv.h>
 #include <ucw/mempool.h>
index 8b8bc2d207bb805adba3e359b7f26a922b49b694..6f6c975d711571c3ac1aa712c4f6489142a31389 100644 (file)
@@ -29,7 +29,7 @@
 #include "lib/resolve.h"
 #include "lib/rplan.h"
 #include "lib/defines.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/module.h"
 #include "lib/dnssec/ta.h"
 
@@ -716,6 +716,7 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
        if (!is_authoritative(pkt, query)) {
                if (!(query->flags.FORWARD) &&
                    pkt_class & (PKT_NXDOMAIN|PKT_NODATA)) {
+                       query->server_selection.error(query, req->upstream.transport, KR_SELECTION_LAME_DELEGATION);
                        VERBOSE_MSG("<= lame response: non-auth sent negative response\n");
                        return KR_STATE_FAIL;
                }
@@ -772,12 +773,6 @@ static int process_answer(knot_pkt_t *pkt, struct kr_request *req)
 
                if (query->flags.FORWARD) {
                        next->forward_flags.CNAME = true;
-                       if (query->parent == NULL) {
-                               state = kr_nsrep_copy_set(&next->ns, &query->ns);
-                               if (state != kr_ok()) {
-                                       return KR_STATE_FAIL;
-                               }
-                       }
                }
                next->cname_parent = query;
                /* Want DNSSEC if and only if it's posible to secure
@@ -1026,7 +1021,7 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
        } else if (!is_paired_to_query(pkt, query)) {
                WITH_VERBOSE(query) {
                        const char *ns_str =
-                               req->upstream.addr ? kr_straddr(req->upstream.addr) : "(internal)";
+                               req->upstream.transport ? kr_straddr(&req->upstream.transport->address.ip) : "(internal)";
                        VERBOSE_MSG("<= ignoring mismatching response from %s\n",
                                        ns_str ? ns_str : "(kr_straddr failed)");
                }
@@ -1038,11 +1033,12 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
                VERBOSE_MSG("<= truncated response, failover to TCP\n");
                if (query) {
                        /* Fail if already on TCP. */
-                       if (query->flags.TCP) {
+                       if (req->upstream.transport->protocol != KR_TRANSPORT_UDP) {
                                VERBOSE_MSG("<= TC=1 with TCP, bailing out\n");
+                               query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED);
                                return resolve_error(pkt, req);
                        }
-                       query->flags.TCP = true;
+                       query->server_selection.error(query, req->upstream.transport, KR_SELECTION_TRUNCATED);
                }
                return KR_STATE_CONSUME;
        }
@@ -1055,6 +1051,10 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
        const knot_lookup_t *rcode = knot_lookup_by_id(knot_rcode_names, knot_wire_get_rcode(pkt->wire));
 #endif
 
+       // We can't return directly from the switch because we have to give feedback to server selection first
+       int ret = 0;
+       int selection_error = -1;
+
        /* Check response code. */
        switch(knot_wire_get_rcode(pkt->wire)) {
        case KNOT_RCODE_NOERROR:
@@ -1064,19 +1064,51 @@ static int resolve(kr_layer_t *ctx, knot_pkt_t *pkt)
                knot_wire_set_rcode(req->answer->wire, KNOT_RCODE_YXDOMAIN);
                break;
        case KNOT_RCODE_REFUSED:
+               if (query->flags.STUB) {
+                        /* just pass answer through if in stub mode */
+                       break;
+               }
+               selection_error = KR_SELECTION_REFUSED;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_SERVFAIL:
                if (query->flags.STUB) {
                         /* just pass answer through if in stub mode */
                        break;
                }
-               /* fall through */
+               selection_error = KR_SELECTION_SERVFAIL;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_FORMERR:
+               selection_error = KR_SELECTION_FORMERROR;
+               VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        case KNOT_RCODE_NOTIMPL:
+               selection_error = KR_SELECTION_NOTIMPL;
                VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
-               return resolve_badmsg(pkt, req, query);
+               ret = resolve_badmsg(pkt, req, query);
+               break;
        default:
+               selection_error = KR_SELECTION_OTHER_RCODE;
                VERBOSE_MSG("<= rcode: %s\n", rcode ? rcode->name : "??");
-               return resolve_error(pkt, req);
+               ret = resolve_error(pkt, req);
+               break;
+       }
+
+       if (query->server_selection.initialized) {
+               if (selection_error != -1) {
+                       query->server_selection.error(query, req->upstream.transport, selection_error);
+               } else {
+                       // Is this even true? Is this necessary?
+                       query->server_selection.success(query, req->upstream.transport);
+               }
+       }
+
+       if (ret) {
+               return ret;
        }
 
        int state;
@@ -1106,7 +1138,7 @@ rrarray_finalize:
        (void)0;
        ranked_rr_array_t *selected[] = kr_request_selected(req);
        for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
-               int ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool);
+               ret = kr_ranked_rrarray_finalize(selected[i], query->uid, &req->pool);
                if (unlikely(ret)) {
                        return KR_STATE_FAIL;
                }
index cbbc0738a8aaa2e2d8940632492740e4cd916c0e..aa8aa8be40ff01cef805155b6ac69d821628b8ec 100644 (file)
@@ -23,6 +23,7 @@
 #include "lib/utils.h"
 #include "lib/defines.h"
 #include "lib/module.h"
+#include "lib/selection.h"
 
 #define VERBOSE_MSG(qry, ...) QRVERBOSE(qry, "vldr", __VA_ARGS__)
 
@@ -349,7 +350,7 @@ static knot_rrset_t *update_ds(struct kr_zonecut *cut, const knot_pktsection_t *
                        return NULL;
                }
        }
-       return new_ds;  
+       return new_ds;
 }
 
 static void mark_insecure_parents(const struct kr_query *qry)
@@ -1190,11 +1191,22 @@ static int hide_bogus(kr_layer_t *ctx) {
        return ctx->state;
 }
 
+static int validate_wrapper(kr_layer_t *ctx, knot_pkt_t *pkt) {
+       // Thin wrapper: run validate(), then report KR_SELECTION_DNSSEC_ERROR to server selection when the result has KR_STATE_FAIL set and the query is flagged DNSSEC_BOGUS.
+       int ret = validate(ctx, pkt);
+       struct kr_request *req = ctx->req;
+       struct kr_query *qry = req->current_query;
+       if (ret & KR_STATE_FAIL && qry->flags.DNSSEC_BOGUS)
+               qry->server_selection.error(qry, req->upstream.transport, KR_SELECTION_DNSSEC_ERROR);
+       return ret;
+}
+
+
 /** Module implementation. */
 int validate_init(struct kr_module *self)
 {
        static const kr_layer_api_t layer = {
-               .consume = &validate,
+               .consume = &validate_wrapper,
                .answer_finalize = &hide_bogus,
        };
        self->layer = &layer;
index 03f747691766017489acdfa9e6cdcc4aa0e1bb8a..73262bd8502b416b0325430c047fa2ff00a08c1a 100644 (file)
@@ -24,9 +24,11 @@ libkres_src = files([
   'layer/iterate.c',
   'layer/validate.c',
   'module.c',
-  'nsrep.c',
   'resolve.c',
   'rplan.c',
+  'selection.c',
+  'selection_forward.c',
+  'selection_iter.c',
   'utils.c',
   'zonecut.c',
 ])
@@ -52,9 +54,11 @@ libkres_headers = files([
   'layer.h',
   'layer/iterate.h',
   'module.h',
-  'nsrep.h',
   'resolve.h',
   'rplan.h',
+  'selection.h',
+  'selection_forward.h',
+  'selection_iter.h',
   'utils.h',
   'zonecut.h',
 ])
diff --git a/lib/nsrep.c b/lib/nsrep.c
deleted file mode 100644 (file)
index c49f406..0000000
+++ /dev/null
@@ -1,570 +0,0 @@
-/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
- *  SPDX-License-Identifier: GPL-3.0-or-later
- */
-
-#include <assert.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netdb.h>
-
-#include <arpa/inet.h>
-
-#include "lib/nsrep.h"
-#include "lib/rplan.h"
-#include "lib/resolve.h"
-#include "lib/defines.h"
-#include "lib/generic/pack.h"
-#include "contrib/ucw/lib.h"
-
-/** Some built-in unfairness ... */
-#ifndef FAVOUR_IPV6
-#define FAVOUR_IPV6 20 /* 20ms bonus for v6 */
-#endif
-
-/** @internal Macro to set address structure. */
-#define ADDR_SET(sa, family, addr, len, port) do {\
-       memcpy(&sa ## _addr, (addr), (len)); \
-       sa ## _family = (family); \
-       sa ## _port = htons(port); \
-} while (0)
-
-/** Update nameserver representation with current name/address pair. */
-static void update_nsrep(struct kr_nsrep *ns, size_t pos, uint8_t *addr, size_t addr_len, int port)
-{
-       if (addr == NULL) {
-               ns->addr[pos].ip.sa_family = AF_UNSPEC;
-               return;
-       }
-
-       /* Rotate previous addresses to the right. */
-       memmove(ns->addr + pos + 1, ns->addr + pos, (KR_NSREP_MAXADDR - pos - 1) * sizeof(ns->addr[0]));
-
-       switch(addr_len) {
-       case sizeof(struct in_addr):
-               ADDR_SET(ns->addr[pos].ip4.sin, AF_INET, addr, addr_len, port); break;
-       case sizeof(struct in6_addr):
-               ADDR_SET(ns->addr[pos].ip6.sin6, AF_INET6, addr, addr_len, port); break;
-       default: assert(0); break;
-       }
-}
-
-static void update_nsrep_set(struct kr_nsrep *ns, const knot_dname_t *name, uint8_t *addr[], unsigned score)
-{
-       /* NSLIST is not empty, empty NS cannot be a leader. */
-       if (!addr[0] && ns->addr[0].ip.sa_family != AF_UNSPEC) {
-               return;
-       }
-       /* Set new NS leader */
-       ns->name = name;
-       ns->score = score;
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               if (addr[i]) {
-                       void *addr_val = pack_obj_val(addr[i]);
-                       size_t len = pack_obj_len(addr[i]);
-                       update_nsrep(ns, i, addr_val, len, KR_DNS_PORT);
-               } else {
-                       break;
-               }
-       }
-}
-
-#undef ADDR_SET
-
-/**
- * \param addr_set pack with one IP address per element */
-static unsigned eval_addr_set(const pack_t *addr_set, struct kr_context *ctx,
-                             struct kr_qflags opts, unsigned score, uint8_t *addr[])
-{
-       kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
-       kr_nsrep_rtt_lru_entry_t *rtt_cache_entry_ptr[KR_NSREP_MAXADDR] = { NULL, };
-       assert (KR_NSREP_MAXADDR >= 2);
-       unsigned rtt_cache_entry_score[KR_NSREP_MAXADDR] = { score, KR_NS_MAX_SCORE + 1, };
-       uint64_t now = kr_now();
-
-       /* Name server is better candidate if it has address record. */
-       for (uint8_t *it = pack_head(*addr_set); it != pack_tail(*addr_set);
-                                               it = pack_obj_next(it)) {
-               void *val = pack_obj_val(it);
-               size_t len = pack_obj_len(it);
-               unsigned favour = 0;
-               bool is_valid = false;
-               /* Check if the address isn't disabled. */
-               if (len == sizeof(struct in6_addr)) {
-                       is_valid = !(opts.NO_IPV6);
-                       favour = FAVOUR_IPV6;
-               } else if (len == sizeof(struct in_addr)) {
-                       is_valid = !(opts.NO_IPV4);
-               } else {
-                       assert(!EINVAL);
-                       is_valid = false;
-               }
-
-               if (!is_valid) {
-                       continue;
-               }
-
-               /* Get score for the current address. */
-               kr_nsrep_rtt_lru_entry_t *cached = rtt_cache ?
-                                                  lru_get_try(rtt_cache, val, len) :
-                                                  NULL;
-               unsigned cur_addr_score = KR_NS_GLUED;
-               if (cached) {
-                       cur_addr_score = cached->score;
-                       if (cached->score >= KR_NS_TIMEOUT) {
-                               /* If NS once was marked as "timeouted",
-                                * it won't participate in NS elections
-                                * at least ctx->cache_rtt_tout_retry_interval milliseconds. */
-                               uint64_t elapsed = now - cached->tout_timestamp;
-                               elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-                               if (elapsed > ctx->cache_rtt_tout_retry_interval) {
-                                       /* Select this NS for probing in this particular query,
-                                        * but don't change the cached score.
-                                        * For other queries this NS will remain "timeouted". */
-                                       cur_addr_score = KR_NS_LONG - 1;
-                               }
-                       }
-               }
-
-               /* We can't always use favour.  If these conditions held:
-                *
-                * rtt_cache_entry_score[i] < KR_NS_TIMEOUT
-                * rtt_cache_entry_score[i] + favour > KR_NS_TIMEOUT
-                * cur_addr_score < rtt_cache_entry_score[i] + favour
-                *
-                * we would prefer "certainly dead" cur_addr_score
-                * instead of "almost dead but alive" rtt_cache_entry_score[i]
-                */
-               const unsigned cur_favour = cur_addr_score < KR_NS_TIMEOUT ? favour : 0;
-               for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-                       if (cur_addr_score >= rtt_cache_entry_score[i] + cur_favour)
-                               continue;
-
-                       /* Shake down previous contenders */
-                       for (size_t j = KR_NSREP_MAXADDR - 1; j > i; --j) {
-                               addr[j] = addr[j - 1];
-                               rtt_cache_entry_ptr[j] = rtt_cache_entry_ptr[j - 1];
-                               rtt_cache_entry_score[j] = rtt_cache_entry_score[j - 1];
-                       }
-                       addr[i] = it;
-                       rtt_cache_entry_score[i] = cur_addr_score;
-                       rtt_cache_entry_ptr[i] = cached;
-                       break;
-               }
-       }
-
-       /* At this point, rtt_cache_entry_ptr contains up to KR_NSREP_MAXADDR
-        * pointers to the rtt cache entries with the best scores for the given addr_set.
-        * Check if there are timeouted NS. */
-
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               if (rtt_cache_entry_ptr[i] == NULL)
-                       continue;
-               if (rtt_cache_entry_ptr[i]->score < KR_NS_TIMEOUT)
-                       continue;
-
-               uint64_t elapsed = now - rtt_cache_entry_ptr[i]->tout_timestamp;
-               elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-               if (elapsed <= ctx->cache_rtt_tout_retry_interval)
-                       continue;
-
-               /* rtt_cache_entry_ptr[i] points to "timeouted" rtt cache entry.
-                * The period of the ban on participation in elections has expired. */
-
-               if (VERBOSE_STATUS) {
-                       void *val = pack_obj_val(addr[i]);
-                       size_t len = pack_obj_len(addr[i]);
-                       char sa_str[INET6_ADDRSTRLEN];
-                       int af = (len == sizeof(struct in6_addr)) ? AF_INET6 : AF_INET;
-                       inet_ntop(af, val, sa_str, sizeof(sa_str));
-                       kr_log_verbose("[     ][nsre] probing timeouted NS: %s, score %i\n",
-                                      sa_str, rtt_cache_entry_ptr[i]->score);
-               }
-
-               rtt_cache_entry_ptr[i]->tout_timestamp = now;
-       }
-
-       return rtt_cache_entry_score[0];
-}
-
-static int eval_nsrep(const knot_dname_t *owner, const pack_t *addr_set, struct kr_query *qry)
-{
-       struct kr_nsrep *ns = &qry->ns;
-       struct kr_context *ctx = ns->ctx;
-       unsigned score = KR_NS_MAX_SCORE;
-       unsigned reputation = 0;
-       uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
-
-       /* Fetch NS reputation */
-       if (ctx->cache_rep) {
-               unsigned *cached = lru_get_try(ctx->cache_rep, (const char *)owner,
-                                              knot_dname_size(owner));
-               if (cached) {
-                       reputation = *cached;
-               }
-       }
-
-       /* Favour nameservers with unknown addresses to probe them,
-        * otherwise discover the current best address for the NS. */
-       if (addr_set->len == 0) {
-               score = KR_NS_UNKNOWN;
-               /* If the server doesn't have IPv6, give it disadvantage. */
-               if (reputation & KR_NS_NOIP6) {
-                       score += FAVOUR_IPV6;
-                       /* If the server is unknown but has rep record, treat it as timeouted */
-                       if (reputation & KR_NS_NOIP4) {
-                               score = KR_NS_UNKNOWN;
-                               /* Try to start with clean slate */
-                               if (!(qry->flags.NO_IPV6)) {
-                                       reputation &= ~KR_NS_NOIP6;
-                               }
-                               if (!(qry->flags.NO_IPV4)) {
-                                       reputation &= ~KR_NS_NOIP4;
-                               }
-                       }
-               }
-       } else {
-               score = eval_addr_set(addr_set, ctx, qry->flags, score, addr_choice);
-       }
-
-       /* Probabilistic bee foraging strategy (naive).
-        * The fastest NS is preferred by workers until it is depleted (timeouts or degrades),
-        * at the same time long distance scouts probe other sources (low probability).
-        * Servers on TIMEOUT will not have probed at all.
-        * Servers with score above KR_NS_LONG will have periodically removed from
-        * reputation cache, so that kresd can reprobe them. */
-       if (score >= KR_NS_TIMEOUT) {
-               return kr_ok();
-       } else if (score <= ns->score &&
-          (score < KR_NS_LONG  || qry->flags.NO_THROTTLE)) {
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-       } else if (kr_rand_coin(1, 10) &&
-                  !kr_rand_coin(score, KR_NS_MAX_SCORE)) {
-               /* With 10% chance probe server with a probability
-                * given by its RTT / MAX_RTT. */
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-               return 1; /* Stop evaluation */
-       } else if (ns->score > KR_NS_MAX_SCORE) {
-               /* Check if any server was already selected.
-                * If no, pick current server and continue evaluation. */
-               update_nsrep_set(ns, owner, addr_choice, score);
-               ns->reputation = reputation;
-       }
-
-       return kr_ok();
-}
-
-int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock)
-{
-       if (!qry) {
-               return kr_error(EINVAL);
-       }
-       if (index >= KR_NSREP_MAXADDR) {
-               return kr_error(ENOSPC);
-       }
-
-       if (!sock) {
-               qry->ns.name = (const uint8_t *)"";
-               qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
-               return kr_ok();
-       }
-
-       switch (sock->sa_family) {
-       case AF_INET:
-               if (qry->flags.NO_IPV4) {
-                       return kr_error(ENOENT);
-               }
-               qry->ns.addr[index].ip4 = *(const struct sockaddr_in *)sock;
-               break;
-       case AF_INET6:
-               if (qry->flags.NO_IPV6) {
-                       return kr_error(ENOENT);
-               }
-               qry->ns.addr[index].ip6 = *(const struct sockaddr_in6 *)sock;
-               break;
-       default:
-               qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
-               return kr_error(EINVAL);
-       }
-
-       qry->ns.name = (const uint8_t *)"";
-       /* Reset score on first entry */
-       if (index == 0) {
-               qry->ns.score = KR_NS_UNKNOWN;
-               qry->ns.reputation = 0;
-       }
-
-       /* Retrieve RTT from cache */
-       struct kr_context *ctx = qry->ns.ctx;
-       kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = ctx
-               ? lru_get_try(ctx->cache_rtt, kr_inaddr(sock), kr_family_len(sock->sa_family))
-               : NULL;
-       if (rtt_cache_entry) {
-               qry->ns.score = MIN(qry->ns.score, rtt_cache_entry->score);
-       }
-
-       return kr_ok();
-}
-
-#define ELECT_INIT(ns, ctx_) do { \
-       (ns)->ctx = (ctx_); \
-       (ns)->addr[0].ip.sa_family = AF_UNSPEC; \
-       (ns)->reputation = 0; \
-       (ns)->score = KR_NS_MAX_SCORE + 1; \
-} while (0)
-
-int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx)
-{
-       if (!qry || !ctx) {
-               //assert(!EINVAL);
-               return kr_error(EINVAL);
-       }
-
-       // First we dump the nsset into a temporary array
-       const int nsset_len = trie_weight(qry->zone_cut.nsset);
-       struct {
-               const knot_dname_t *name;
-               const pack_t *addrs;
-       } nsset[nsset_len];
-
-       trie_it_t *it;
-       int i = 0;
-       for (it = trie_it_begin(qry->zone_cut.nsset); !trie_it_finished(it);
-                                                       trie_it_next(it), ++i) {
-               /* we trust it's a correct dname */
-               nsset[i].name = (const knot_dname_t *)trie_it_key(it, NULL);
-               nsset[i].addrs = (const pack_t *)*trie_it_val(it);
-       }
-       trie_it_free(it);
-       assert(i == nsset_len);
-
-       // Now we sort it randomly, by select-sort.
-       for (i = 0; i < nsset_len - 1; ++i) {
-               // The winner for position i will be uniformly chosen from indices >= i
-               const int j = i + kr_rand_bytes(1) % (nsset_len - i);
-               // Now we swap the winner with index i
-               if (i == j) continue;
-               __typeof__((nsset[i])) tmp = nsset[i];
-               nsset[i] = nsset[j];
-               nsset[j] = tmp;
-       }
-
-       // Finally we run the original algorithm, in this randomized order.
-       struct kr_nsrep *ns = &qry->ns;
-       ELECT_INIT(ns, ctx);
-       int ret = kr_ok();
-       for (i = 0; i < nsset_len; ++i) {
-               ret = eval_nsrep(nsset[i].name, nsset[i].addrs, qry);
-               if (ret) break;
-       }
-
-       if (qry->ns.score <= KR_NS_MAX_SCORE && qry->ns.score >= KR_NS_LONG) {
-               /* This is a low-reliability probe,
-                * go with TCP to get ICMP reachability check. */
-               qry->flags.TCP = true;
-       }
-       return ret;
-}
-
-int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx)
-{
-       if (!qry || !ctx) {
-               //assert(!EINVAL);
-               return kr_error(EINVAL);
-       }
-
-       /* Get address list for this NS */
-       struct kr_nsrep *ns = &qry->ns;
-       ELECT_INIT(ns, ctx);
-       pack_t *addr_set = kr_zonecut_find(&qry->zone_cut, ns->name);
-       if (!addr_set) {
-               return kr_error(ENOENT);
-       }
-       /* Evaluate addr list */
-       uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
-       unsigned score = eval_addr_set(addr_set, ctx, qry->flags, ns->score, addr_choice);
-       update_nsrep_set(ns, ns->name, addr_choice, score);
-       return kr_ok();
-}
-
-#undef ELECT_INIT
-
-int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
-                       unsigned score, kr_nsrep_rtt_lru_t *cache, int umode)
-{
-       if (!cache || umode > KR_NS_MAX || umode < 0) {
-               return kr_error(EINVAL);
-       }
-
-       /* Get `addr`, and later its raw string. */
-       if (addr) {
-               /* Caller provided specific address, OK. */
-       } else if (ns != NULL) {
-               addr = &ns->addr[0].ip;
-       } else {
-               assert(false && "kr_nsrep_update_rtt: don't know what address to update");
-               return kr_error(EINVAL);
-       }
-       const char *addr_in = kr_inaddr(addr);
-       size_t addr_len = kr_inaddr_len(addr);
-       if (!addr_in || addr_len <= 0) {
-               assert(false && "kr_nsrep_update_rtt: incorrect address");
-               return kr_error(EINVAL);
-       }
-
-       bool is_new_entry = false;
-       kr_nsrep_rtt_lru_entry_t  *cur = lru_get_new(cache, addr_in, addr_len,
-                                                    (&is_new_entry));
-       if (!cur) {
-               return kr_ok();
-       }
-       if (score <= KR_NS_GLUED) {
-               score = KR_NS_GLUED + 1;
-       }
-       /* If there's nothing to update, we reset it unless KR_NS_UPDATE_NORESET
-        * mode was requested.  New items are zeroed by LRU automatically. */
-       if (is_new_entry && umode != KR_NS_UPDATE_NORESET) {
-               umode = KR_NS_RESET;
-       }
-       unsigned new_score = 0;
-       /* Update score, by default smooth over last two measurements. */
-       switch (umode) {
-       case KR_NS_UPDATE:
-       case KR_NS_UPDATE_NORESET:
-               new_score = (cur->score + score) / 2; break;
-       case KR_NS_RESET:  new_score = score; break;
-       case KR_NS_ADD:    new_score = MIN(KR_NS_MAX_SCORE - 1, cur->score + score); break;
-       case KR_NS_MAX:    new_score = MAX(cur->score, score); break;
-       default:           return kr_error(EINVAL);
-       }
-       /* Score limits */
-       if (new_score > KR_NS_MAX_SCORE) {
-               new_score = KR_NS_MAX_SCORE;
-       }
-       if (new_score >= KR_NS_TIMEOUT && cur->score < KR_NS_TIMEOUT) {
-               /* Set the timestamp only when NS became "timeouted" */
-               cur->tout_timestamp = kr_now();
-       }
-       cur->score = new_score;
-       return kr_ok();
-}
-
-int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache)
-{
-       if (!ns || !cache ) {
-               return kr_error(EINVAL);
-       }
-
-       /* Store in the struct */
-       ns->reputation = reputation;
-       /* Store reputation in the LRU cache */
-       unsigned *cur = lru_get_new(cache, (const char *)ns->name,
-                                   knot_dname_size(ns->name), NULL);
-       if (cur) {
-               *cur = reputation;
-       }
-       return kr_ok();
-}
-
-int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src)
-{
-       if (!dst || !src ) {
-               return kr_error(EINVAL);
-       }
-
-       memcpy(dst, src, sizeof(struct kr_nsrep));
-       dst->name = (const uint8_t *)"";
-       dst->score = KR_NS_UNKNOWN;
-       dst->reputation = 0;
-
-       return kr_ok();
-}
-
-int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx)
-{
-       if (!ns || !ctx) {
-               assert(false);
-               return kr_error(EINVAL);
-       }
-
-       kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
-
-       ns->reputation = 0;
-       ns->score = KR_NS_MAX_SCORE + 1;
-
-       if (ns->addr[0].ip.sa_family == AF_UNSPEC) {
-               return kr_error(EINVAL);
-       }
-
-       /* Compute the scores.  Unfortunately there's no space for scores
-        * along the addresses. */
-       unsigned scores[KR_NSREP_MAXADDR];
-       int i;
-       bool timeouted_address_is_already_selected = false;
-       for (i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               const struct sockaddr *sa = &ns->addr[i].ip;
-               if (sa->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = lru_get_try(rtt_cache,
-                                                                       kr_inaddr(sa),
-                                                                       kr_family_len(sa->sa_family));
-               if (!rtt_cache_entry) {
-                       scores[i] = 1; /* prefer unknown to probe RTT */
-               } else if (rtt_cache_entry->score < KR_NS_FWD_TIMEOUT) {
-                       /* some probability to bump bad ones up for re-probe */
-                       scores[i] = rtt_cache_entry->score;
-                       /* The lower the rtt, the more likely it will be selected. */
-                       if (!kr_rand_coin(rtt_cache_entry->score, KR_NS_FWD_TIMEOUT)) {
-                               scores[i] = 1;
-                       }
-               } else {
-                       uint64_t now = kr_now();
-                       uint64_t elapsed = now - rtt_cache_entry->tout_timestamp;
-                       scores[i] = KR_NS_MAX_SCORE + 1;
-                       elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-                       if (elapsed > ctx->cache_rtt_tout_retry_interval &&
-                           !timeouted_address_is_already_selected) {
-                               scores[i] = 1;
-                               rtt_cache_entry->tout_timestamp = now;
-                               timeouted_address_is_already_selected = true;
-                       }
-               }
-
-               /* Give advantage to IPv6. */
-               if (scores[i] <= KR_NS_MAX_SCORE && sa->sa_family == AF_INET) {
-                       scores[i] += FAVOUR_IPV6;
-               }
-
-               if (VERBOSE_STATUS) {
-                       kr_log_verbose("[     ][nsre] score %d for %s;\t cached RTT: %d\n",
-                                       scores[i], kr_straddr(sa),
-                                       rtt_cache_entry ? rtt_cache_entry->score : -1);
-               }
-       }
-
-       /* Select-sort the addresses. */
-       const int count = i;
-       for (i = 0; i < count - 1; ++i) {
-               /* find min from i onwards */
-               int min_i = i;
-               for (int j = i + 1; j < count; ++j) {
-                       if (scores[j] < scores[min_i]) {
-                               min_i = j;
-                       }
-               }
-               /* swap the indices */
-               if (min_i != i) {
-                       SWAP(scores[min_i], scores[i]);
-                       SWAP(ns->addr[min_i], ns->addr[i]);
-               }
-       }
-
-       if (count > 0) {
-               ns->score = scores[0];
-               ns->reputation = 0;
-       }
-
-       return kr_ok();
-}
diff --git a/lib/nsrep.h b/lib/nsrep.h
deleted file mode 100644 (file)
index 57aecc8..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
- *  SPDX-License-Identifier: GPL-3.0-or-later
- */
-
-#pragma once
-
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <libknot/dname.h>
-#include <limits.h>
-
-#include "lib/defines.h"
-#include "lib/generic/lru.h"
-
-struct kr_query;
-
-/**
-  * NS RTT score (special values).
-  * @note RTT is measured in milliseconds.
-  */
-enum kr_ns_score {
-       KR_NS_MAX_SCORE     = 20 * KR_CONN_RTT_MAX, /* max possible value */
-       KR_NS_FWD_TIMEOUT   = (95 * 10000) / 100, /* timeout for upstream recursor,
-                                                  * 95 percents from max resolution time */
-       KR_NS_TIMEOUT       = (95 * KR_CONN_RTT_MAX) / 100, /* timeout for upstream auth */
-       KR_NS_LONG          = (3 * KR_NS_TIMEOUT) / 4,
-       KR_NS_UNKNOWN       = KR_NS_TIMEOUT / 2,
-       KR_NS_PENALTY       = 100,
-       KR_NS_GLUED         = 10
-};
-
-/**
- *  See kr_nsrep_update_rtt()
- */
-#define KR_NS_DEAD (((KR_NS_TIMEOUT * 4) + 3) / 3)
-#define KR_NS_FWD_DEAD (((KR_NS_FWD_TIMEOUT * 4) + 3) / 3)
-
-/** If once NS was marked as "timeouted", it won't participate in NS elections
- * at least KR_NS_TIMEOUT_RETRY_INTERVAL milliseconds (now: one second). */
-#define KR_NS_TIMEOUT_RETRY_INTERVAL 1000
-
-/**
- * NS QoS flags.
- */
-enum kr_ns_rep {
-       KR_NS_NOIP4  = 1 << 0, /**< NS has no IPv4 */
-       KR_NS_NOIP6  = 1 << 1, /**< NS has no IPv6 */
-       KR_NS_NOEDNS = 1 << 2  /**< NS has no EDNS support */
-};
-
-/**
- * NS RTT update modes.
- * First update is always KR_NS_RESET unless
- * KR_NS_UPDATE_NORESET mode had choosen.
- */
-enum kr_ns_update_mode {
-       KR_NS_UPDATE = 0,     /**< Update as smooth over last two measurements */
-       KR_NS_UPDATE_NORESET, /**< Same as KR_NS_UPDATE, but disable fallback to
-                              *   KR_NS_RESET on newly added entries.
-                              *   Zero is used as initial value. */
-       KR_NS_RESET,          /**< Set to given value */
-       KR_NS_ADD,            /**< Increment current value */
-       KR_NS_MAX             /**< Set to maximum of current/proposed value. */
-};
-
-struct kr_nsrep_rtt_lru_entry {
-       unsigned score;           /* combined rtt */
-       uint64_t tout_timestamp;  /* The time when score became
-                                  * greater or equal then KR_NS_TIMEOUT.
-                                  * Is meaningful only when score >= KR_NS_TIMEOUT */
-};
-
-typedef struct kr_nsrep_rtt_lru_entry kr_nsrep_rtt_lru_entry_t;
-
-/**
- * NS QoS tracking.
- */
-typedef lru_t(kr_nsrep_rtt_lru_entry_t) kr_nsrep_rtt_lru_t;
-
-/**
- * NS reputation tracking.
- */
-typedef lru_t(unsigned) kr_nsrep_lru_t;
-
-/* Maximum count of addresses probed in one go (last is left empty) */
-#define KR_NSREP_MAXADDR 4
-
-/**
- * Name server representation.
- * Contains extra information about the name server, e.g. score
- * or other metadata.
- */
-struct kr_nsrep
-{
-       unsigned score;                  /**< NS score */
-       unsigned reputation;             /**< NS reputation */
-       const knot_dname_t *name;        /**< NS name */
-       struct kr_context *ctx;          /**< Resolution context */
-       union inaddr addr[KR_NSREP_MAXADDR];        /**< NS address(es) */
-};
-
-/**
- * Set given NS address.  (Very low-level access to the list.)
- * @param  qry      updated query
- * @param  index    index of the updated target
- * @param  sock     socket address to use (sockaddr_in or sockaddr_in6 or NULL)
- * @return          0 or an error code, in particular kr_error(ENOENT) for net.ipvX
- */
-KR_EXPORT
-int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock);
-
-/**
- * Elect best nameserver/address pair from the nsset.
- * @param  qry          updated query
- * @param  ctx          resolution context
- * @return              0 or an error code
- */
-KR_EXPORT
-int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx);
-
-/**
- * Elect best nameserver/address pair from the nsset.
- * @param  qry          updated query
- * @param  ctx          resolution context
- * @return              0 or an error code
- */
-KR_EXPORT
-int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx);
-
-/**
- * Update NS address RTT information.
- *
- * @brief In KR_NS_UPDATE mode reputation is smoothed over last N measurements.
- * 
- * @param  ns           updated NS representation
- * @param  addr         chosen address (NULL for first)
- * @param  score        new score (i.e. RTT), see enum kr_ns_score
- * @param  cache        RTT LRU cache
- * @param  umode        update mode (KR_NS_UPDATE or KR_NS_RESET or KR_NS_ADD)
- * @return              0 on success, error code on failure
- */
-KR_EXPORT
-int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
-                       unsigned score, kr_nsrep_rtt_lru_t *cache, int umode);
-
-/**
- * Update NSSET reputation information.
- * 
- * @param  ns           updated NS representation
- * @param  reputation   combined reputation flags, see enum kr_ns_rep
- * @param  cache        LRU cache
- * @return              0 on success, error code on failure
- */
-KR_EXPORT
-int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache);
-/**
- * Copy NSSET reputation information and resets score.
- *
- * @param  dst          updated NS representation
- * @param  src          source NS representation
- * @return              0 on success, error code on failure
- */
-int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src);
-
-/**
- * Sort addresses in the query nsrep list by cached RTT.
- * if RTT is greater then KR_NS_TIMEOUT, address will placed at the beginning of the
- * nsrep list once in cache.ns_tout() milliseconds. Otherwise it will be sorted
- * as if it has cached RTT equal to KR_NS_MAX_SCORE + 1.
- * @param  ns           updated kr_nsrep
- * @param  ctx          name resolution context.
- * @return              0 or an error code
- * @note   ns reputation is zeroed and score is set to KR_NS_MAX_SCORE + 1.
- */
-KR_EXPORT
-int kr_nsrep_sort(struct kr_nsrep *ns,  struct kr_context *ctx);
index 9828cbfa5e1fae3061c902a7a7a7b0de360c4654..4d1b297543e43002d5628bf9c03aaafbba848f92 100644 (file)
@@ -11,6 +11,7 @@
 #include <libknot/rrtype/rdname.h>
 #include <libknot/descriptor.h>
 #include <ucw/mempool.h>
+#include <sys/socket.h>
 #include "kresconfig.h"
 #include "lib/resolve.h"
 #include "lib/layer.h"
@@ -109,8 +110,11 @@ static int answer_finalize_yield(kr_layer_t *ctx) { return kr_ok(); }
                if (mod->layer) { \
                        struct kr_layer layer = {.state = (r)->state, .api = mod->layer, .req = (r)}; \
                        if (layer.api && layer.api->func) { \
+                               /*printf("%s %s\n", STRINGIFY(func), (mod->name));*/ \
                                (r)->state = layer.api->func(&layer, ##__VA_ARGS__); \
+                               /*printf("%s %s %x\n", STRINGIFY(func), (mod->name), (r->state));*/ \
                                if ((r)->state == KR_STATE_YIELD) { \
+                                       /*printf("%s_yield %s\n", STRINGIFY(func), (mod->name));*/ \
                                        func ## _yield(&layer, ##__VA_ARGS__); \
                                        break; \
                                } \
@@ -147,7 +151,7 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret
                return;
        }
        assert(qname);
-       const int len = knot_dname_size(qname) - 2; /* Skip first, last label. */
+       const int len = knot_dname_size(qname) - 2; /* Skip first, last label. First is length, last is always root */
        for (int i = 0; i < len; ++i) {
                /* Note: this relies on the fact that correct label lengths
                 * can't pass the isletter() test (by "luck"). */
@@ -157,23 +161,6 @@ static void randomized_qname_case(knot_dname_t * restrict qname, uint32_t secret
        }
 }
 
-/** Invalidate current NS/addr pair. */
-static int invalidate_ns(struct kr_rplan *rplan, struct kr_query *qry)
-{
-       if (qry->ns.addr[0].ip.sa_family != AF_UNSPEC) {
-               const char *addr = kr_inaddr(&qry->ns.addr[0].ip);
-               int addr_len = kr_inaddr_len(&qry->ns.addr[0].ip);
-               int ret = kr_zonecut_del(&qry->zone_cut, qry->ns.name, addr, addr_len);
-               /* Also remove it from the qry->ns.addr array.
-                * That's useful at least for STUB and FORWARD modes. */
-               memmove(qry->ns.addr, qry->ns.addr + 1,
-                       sizeof(qry->ns.addr[0]) * (KR_NSREP_MAXADDR - 1));
-               return ret;
-       } else {
-               return kr_zonecut_del_all(&qry->zone_cut, qry->ns.name);
-       }
-}
-
 /** This turns of QNAME minimisation if there is a non-terminal between current zone cut, and name target.
  *  It save several minimization steps, as the zone cut is likely final one.
  */
@@ -310,71 +297,6 @@ static int ns_fetch_cut(struct kr_query *qry, const knot_dname_t *requested_name
        return KR_STATE_PRODUCE;
 }
 
-static int ns_resolve_addr(struct kr_query *qry, struct kr_request *req)
-{
-       struct kr_rplan *rplan = &req->rplan;
-       struct kr_context *ctx = req->ctx;
-
-
-       /* Start NS queries from root, to avoid certain cases
-        * where a NS drops out of cache and the rest is unavailable,
-        * this would lead to dependency loop in current zone cut.
-        * Prefer IPv6 and continue with IPv4 if not available.
-        */
-       uint16_t next_type = 0;
-       if (!(qry->flags.AWAIT_IPV6) &&
-           !(ctx->options.NO_IPV6)) {
-               next_type = KNOT_RRTYPE_AAAA;
-               qry->flags.AWAIT_IPV6 = true;
-       } else if (!(qry->flags.AWAIT_IPV4) &&
-                  !(ctx->options.NO_IPV4)) {
-               next_type = KNOT_RRTYPE_A;
-               qry->flags.AWAIT_IPV4 = true;
-               /* Hmm, no useable IPv6 then. */
-               qry->ns.reputation |= KR_NS_NOIP6;
-               kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep);
-       }
-       /* Bail out if the query is already pending or dependency loop. */
-       if (!next_type || kr_rplan_satisfies(qry->parent, qry->ns.name, KNOT_CLASS_IN, next_type)) {
-               /* Fall back to SBELT if root server query fails. */
-               if (!next_type && qry->zone_cut.name[0] == '\0') {
-                       VERBOSE_MSG(qry, "=> fallback to root hints\n");
-                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut);
-                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
-                       return kr_error(EAGAIN);
-               }
-               /* No IPv4 nor IPv6, flag server as unusable. */
-               ++req->count_no_nsaddr;
-               VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out (counter: %u)\n",
-                               req->count_no_nsaddr);
-               qry->ns.reputation |= KR_NS_NOIP4 | KR_NS_NOIP6;
-               kr_nsrep_update_rep(&qry->ns, qry->ns.reputation, ctx->cache_rep);
-               invalidate_ns(rplan, qry);
-               return kr_error(EHOSTUNREACH);
-       }
-       /* Push new query to the resolution plan */
-       struct kr_query *next =
-               kr_rplan_push(rplan, qry, qry->ns.name, KNOT_CLASS_IN, next_type);
-       if (!next) {
-               return kr_error(ENOMEM);
-       }
-       next->flags.NONAUTH = true;
-
-       /* At the root level with no NS addresses, add SBELT subrequest. */
-       int ret = 0;
-       if (qry->zone_cut.name[0] == '\0') {
-               ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut);
-               if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */
-                       kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut);
-                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */
-                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
-               }
-       } else {
-               next->flags.AWAIT_CUT = true;
-       }
-       return ret;
-}
-
 static int edns_put(knot_pkt_t *pkt, bool reclaim)
 {
        if (!pkt->opt_rr) {
@@ -392,6 +314,8 @@ static int edns_put(knot_pkt_t *pkt, bool reclaim)
        return knot_pkt_put(pkt, KNOT_COMPR_HINT_NONE, pkt->opt_rr, KNOT_PF_FREE);
 }
 
+
+
 /** Removes last EDNS OPT RR written to the packet. */
 static int edns_erase_and_reserve(knot_pkt_t *pkt)
 {
@@ -808,84 +732,6 @@ static int resolve_query(struct kr_request *request, const knot_pkt_t *packet)
        return request->state;
 }
 
-KR_PURE static bool kr_inaddr_equal(const struct sockaddr *a, const struct sockaddr *b)
-{
-       const int a_len = kr_inaddr_len(a);
-       const int b_len = kr_inaddr_len(b);
-       return a_len == b_len && memcmp(kr_inaddr(a), kr_inaddr(b), a_len) == 0;
-}
-
-static void update_nslist_rtt(struct kr_context *ctx, struct kr_query *qry, const struct sockaddr *src)
-{
-       /* Do not track in safe mode. */
-       if (qry->flags.SAFEMODE) {
-               return;
-       }
-
-       /* Calculate total resolution time from the time the query was generated. */
-       uint64_t elapsed = kr_now() - qry->timestamp_mono;
-       elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
-
-       /* NSs in the preference list prior to the one who responded will be penalised
-        * with the RETRY timer interval. This is because we know they didn't respond
-        * for N retries, so their RTT must be at least N * RETRY.
-        * The NS in the preference list that responded will have RTT relative to the
-        * time when the query was sent out, not when it was originated.
-        */
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               const struct sockaddr *addr = &qry->ns.addr[i].ip;
-               if (addr->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               /* If this address is the source of the answer, update its RTT */
-               if (kr_inaddr_equal(src, addr)) {
-                       kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_UPDATE);
-                       WITH_VERBOSE(qry) {
-                               char addr_str[INET6_ADDRSTRLEN];
-                               inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str));
-                               VERBOSE_MSG(qry, "<= server: '%s' rtt: %"PRIu64" ms\n",
-                                               addr_str, elapsed);
-                       }
-               } else {
-                       /* Response didn't come from this IP, but we know the RTT must be at least
-                        * several RETRY timer tries, e.g. if we have addresses [a, b, c] and we have
-                        * tried [a, b] when the answer from 'a' came after 350ms, then we know
-                        * that 'b' didn't respond for at least 350 - (1 * 300) ms. We can't say that
-                        * its RTT is 50ms, but we can say that its score shouldn't be less than 50. */
-                        kr_nsrep_update_rtt(&qry->ns, addr, elapsed, ctx->cache_rtt, KR_NS_MAX);
-                        WITH_VERBOSE(qry) {
-                               char addr_str[INET6_ADDRSTRLEN];
-                               inet_ntop(addr->sa_family, kr_inaddr(addr), addr_str, sizeof(addr_str));
-                               VERBOSE_MSG(qry, "<= server: '%s' rtt: >= %"PRIu64" ms\n",
-                                               addr_str, elapsed);
-                        }
-               }
-               /* Subtract query start time from elapsed time */
-               if (elapsed < KR_CONN_RETRY) {
-                       break;
-               }
-               elapsed = elapsed - KR_CONN_RETRY;
-       }
-}
-
-static void update_nslist_score(struct kr_request *request, struct kr_query *qry, const struct sockaddr *src, knot_pkt_t *packet)
-{
-       struct kr_context *ctx = request->ctx;
-       /* On successful answer, update preference list RTT and penalise timer  */
-       if (!(request->state & KR_STATE_FAIL)) {
-               /* Update RTT information for preference list */
-               update_nslist_rtt(ctx, qry, src);
-               /* Do not complete NS address resolution on soft-fail. */
-               const int rcode = packet ? knot_wire_get_rcode(packet->wire) : 0;
-               if (rcode != KNOT_RCODE_SERVFAIL && rcode != KNOT_RCODE_REFUSED) {
-                       qry->flags.AWAIT_IPV6 = false;
-                       qry->flags.AWAIT_IPV4 = false;
-               } else { /* Penalize SERVFAILs. */
-                       kr_nsrep_update_rtt(&qry->ns, src, KR_NS_PENALTY, ctx->cache_rtt, KR_NS_ADD);
-               }
-       }
-}
-
 static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now)
 {
        uint64_t resolving_time = now - qry->creation_time_mono;
@@ -898,7 +744,7 @@ static bool resolution_time_exceeded(struct kr_query *qry, uint64_t now)
        return false;
 }
 
-int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet)
+int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet)
 {
        struct kr_rplan *rplan = &request->rplan;
 
@@ -918,11 +764,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
        }
        bool tried_tcp = (qry->flags.TCP);
        if (!packet || packet->size == 0) {
-               if (tried_tcp) {
-                       request->state = KR_STATE_FAIL;
-               } else {
-                       qry->flags.TCP = true;
-               }
+               return KR_STATE_PRODUCE;
        } else {
                /* Packet cleared, derandomize QNAME. */
                knot_dname_t *qname_raw = knot_pkt_qname(packet);
@@ -935,41 +777,14 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
                } else {
                        /* Fill in source and latency information. */
                        request->upstream.rtt = kr_now() - qry->timestamp_mono;
-                       request->upstream.addr = src;
+                       request->upstream.transport = transport ? *transport : NULL;
                        ITERATE_LAYERS(request, qry, consume, packet);
                        /* Clear temporary information */
-                       request->upstream.addr = NULL;
+                       request->upstream.transport = NULL;
                        request->upstream.rtt = 0;
                }
        }
 
-       /* Track RTT for iterative answers */
-       if (src && !(qry->flags.CACHED)) {
-               update_nslist_score(request, qry, src, packet);
-       }
-       /* Resolution failed, invalidate current NS. */
-       if (request->state & KR_STATE_FAIL) {
-               invalidate_ns(rplan, qry);
-               qry->flags.RESOLVED = false;
-       }
-
-       /* For multiple errors in a row; invalidate_ns() is not enough. */
-       if (!qry->flags.CACHED) {
-               if (request->state & KR_STATE_FAIL) {
-                       if (++request->count_fail_row > KR_CONSUME_FAIL_ROW_LIMIT) {
-                               if (VERBOSE_STATUS || kr_log_rtrace_enabled(request)) {
-                                       kr_log_req(request, 0, 2, "resl",
-                                               "=> too many failures in a row, "
-                                               "bail out (mitigation for NXNSAttack "
-                                               "CVE-2020-12667)\n");
-                               }
-                               return KR_STATE_FAIL;
-                       }
-               } else {
-                       request->count_fail_row = 0;
-               }
-       }
-
        /* Pop query if resolved. */
        if (request->state == KR_STATE_YIELD) {
                return KR_STATE_PRODUCE; /* Requery */
@@ -1340,17 +1155,83 @@ static int zone_cut_check(struct kr_request *request, struct kr_query *qry, knot
        return trust_chain_check(request, qry);
 }
 
-int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet)
+/** Plan a sub-query for the AAAA (tried first) or A record of transport->name; returns 0, kr_error(EAGAIN), kr_error(EHOSTUNREACH), kr_error(ENOMEM) or the result of kr_zonecut_set_sbelt(). */
+int ns_resolve_addr(struct kr_query *qry, struct kr_request *param, struct kr_transport *transport)
+{
+       struct kr_rplan *rplan = &param->rplan;
+       struct kr_context *ctx = param->ctx;
+
+
+       /* Start NS queries from root, to avoid certain cases
+        * where a NS drops out of cache and the rest is unavailable,
+        * this would lead to dependency loop in current zone cut.
+        * Prefer IPv6 and continue with IPv4 if not available.
+        */
+       uint16_t next_type = 0;
+       if (!(qry->flags.AWAIT_IPV6) &&
+           !(ctx->options.NO_IPV6)) {
+               next_type = KNOT_RRTYPE_AAAA;
+               qry->flags.AWAIT_IPV6 = true;
+       } else if (!(qry->flags.AWAIT_IPV4) &&
+                  !(ctx->options.NO_IPV4)) {
+               next_type = KNOT_RRTYPE_A;
+               qry->flags.AWAIT_IPV4 = true;
+       }
+       /* Bail out if the query is already pending or dependency loop. */
+       if (!next_type || kr_rplan_satisfies(qry->parent, transport->name, KNOT_CLASS_IN, next_type)) {
+               /* Fall back to SBELT if root server query fails. */
+               if (!next_type && qry->zone_cut.name[0] == '\0') {
+                       VERBOSE_MSG(qry, "=> fallback to root hints\n");
+                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut);
+                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
+                       return kr_error(EAGAIN);
+               }
+               /* No IPv4 nor IPv6, flag server as unusable. */
+               VERBOSE_MSG(qry, "=> unresolvable NS address, bailing out\n");
+               kr_zonecut_del_all(&qry->zone_cut, transport->name);
+               return kr_error(EHOSTUNREACH);
+       }
+       /* Push new query to the resolution plan */
+       struct kr_query *next =
+               kr_rplan_push(rplan, qry, transport->name, KNOT_CLASS_IN, next_type);
+       if (!next) {
+               return kr_error(ENOMEM);
+       }
+       next->flags.NONAUTH = true;
+
+       /* At the root level with no NS addresses, add SBELT subrequest. */
+       int ret = 0;
+       if (qry->zone_cut.name[0] == '\0') {
+               ret = kr_zonecut_set_sbelt(ctx, &next->zone_cut);
+               if (ret == 0) { /* Copy TA and key since it's the same cut to avoid lookup. */
+                       kr_zonecut_copy_trust(&next->zone_cut, &qry->zone_cut);
+                       kr_zonecut_set_sbelt(ctx, &qry->zone_cut); /* Add SBELT to parent in case query fails. */
+                       qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
+               }
+       } else {
+               next->flags.AWAIT_CUT = true;
+       }
+
+       return ret;
+}
+
+int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet)
 {
        struct kr_rplan *rplan = &request->rplan;
-       unsigned ns_election_iter = 0;
 
        /* No query left for resolution */
        if (kr_rplan_empty(rplan)) {
                return KR_STATE_FAIL;
        }
-       /* If we have deferred answers, resume them. */
+
        struct kr_query *qry = array_tail(rplan->pending);
+
+       /* Initialize server selection */
+       if (!qry->server_selection.initialized) {
+               kr_server_selection_init(qry);
+       }
+
+       /* If we have deferred answers, resume them. */
        if (qry->deferred != NULL) {
                /* @todo: Refactoring validator, check trust chain before resuming. */
                int state = 0;
@@ -1428,65 +1309,29 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t
                }
        }
 
-ns_election:
-
-       if (unlikely(request->count_no_nsaddr >= KR_COUNT_NO_NSADDR_LIMIT)) {
-               VERBOSE_MSG(qry, "=> too many unresolvable NSs, bail out "
-                               "(mitigation for NXNSAttack CVE-2020-12667)\n");
-               return KR_STATE_FAIL;
-       }
-       /* If the query has already selected a NS and is waiting for IPv4/IPv6 record,
-        * elect best address only, otherwise elect a completely new NS.
-        */
-       if(++ns_election_iter >= KR_ITER_LIMIT) {
-               VERBOSE_MSG(qry, "=> couldn't converge NS selection, bail out\n");
-               return KR_STATE_FAIL;
-       }
 
        const struct kr_qflags qflg = qry->flags;
        const bool retry = qflg.TCP || qflg.BADCOOKIE_AGAIN;
-       if (qflg.AWAIT_IPV4 || qflg.AWAIT_IPV6) {
-               kr_nsrep_elect_addr(qry, request->ctx);
-       } else if (qflg.FORWARD || qflg.STUB) {
-               kr_nsrep_sort(&qry->ns, request->ctx);
-               if (qry->ns.score > KR_NS_MAX_SCORE) {
-                       /* At the moment all NS have bad reputation.
-                        * But there can be existing connections*/
-                       VERBOSE_MSG(qry, "=> no valid NS left\n");
-                       return KR_STATE_FAIL;
-               }
-       } else if (!qry->ns.name || !retry) { /* Keep NS when requerying/stub/badcookie. */
+       if (!qflg.FORWARD && !qflg.STUB && !retry) { /* Keep NS when requerying/stub/badcookie. */
                /* Root DNSKEY must be fetched from the hints to avoid chicken and egg problem. */
                if (qry->sname[0] == '\0' && qry->stype == KNOT_RRTYPE_DNSKEY) {
                        kr_zonecut_set_sbelt(request->ctx, &qry->zone_cut);
                        qry->flags.NO_THROTTLE = true; /* Pick even bad SBELT servers */
                }
-               kr_nsrep_elect(qry, request->ctx);
-               if (qry->ns.score > KR_NS_MAX_SCORE) {
-                       if (kr_zonecut_is_empty(&qry->zone_cut)) {
-                               VERBOSE_MSG(qry, "=> no NS with an address\n");
-                       } else {
-                               VERBOSE_MSG(qry, "=> no valid NS left\n");
-                       }
-                       if (!qry->flags.NO_NS_FOUND) {
-                               qry->flags.NO_NS_FOUND = true;
-                       } else {
-                               ITERATE_LAYERS(request, qry, reset);
-                               kr_rplan_pop(rplan, qry);
-                       }
-                       return KR_STATE_PRODUCE;
-               }
        }
 
-       /* Resolve address records */
-       if (qry->ns.addr[0].ip.sa_family == AF_UNSPEC) {
-               int ret = ns_resolve_addr(qry, request);
-               if (ret != 0) {
-                       qry->flags.AWAIT_IPV6 = false;
+       qry->server_selection.choose_transport(qry, transport);
+
+       if (*transport == NULL) {
+               // There is no point in continuing.
+               return KR_STATE_FAIL;
+       }
+
+       if ((*transport)->protocol == KR_TRANSPORT_NOADDR) {
+               int ret = ns_resolve_addr(qry, qry->request, *transport);
+               if (ret) {
                        qry->flags.AWAIT_IPV4 = false;
-                       qry->flags.TCP = false;
-                       qry->ns.name = NULL;
-                       goto ns_election; /* Must try different NS */
+                       qry->flags.AWAIT_IPV6 = false;
                }
                ITERATE_LAYERS(request, qry, reset);
                return KR_STATE_PRODUCE;
@@ -1503,8 +1348,6 @@ ns_election:
         * kr_resolve_checkout().
         */
        qry->timestamp_mono = kr_now();
-       *dst = &qry->ns.addr[0].ip;
-       *type = (qry->flags.TCP) ? SOCK_STREAM : SOCK_DGRAM;
        return request->state;
 }
 
@@ -1541,7 +1384,7 @@ static bool outbound_request_update_cookies(struct kr_request *req,
 #endif /* defined(ENABLE_COOKIES) */
 
 int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
-                        struct sockaddr *dst, int type, knot_pkt_t *packet)
+                        struct kr_transport *transport, knot_pkt_t *packet)
 {
        /* @todo: Update documentation if this function becomes approved. */
 
@@ -1565,7 +1408,7 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
                 * actual cookie. If we don't know the server address then we
                 * also don't know the actual cookie size.
                 */
-               if (!outbound_request_update_cookies(request, src, dst)) {
+               if (!outbound_request_update_cookies(request, src, &transport->address.ip)) {
                        return kr_error(EINVAL);
                }
        }
@@ -1582,8 +1425,20 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
        /* Run the checkout layers and cancel on failure.
         * The checkout layer doesn't persist the state, so canceled subrequests
         * don't affect the resolution or rest of the processing. */
+       int type = -1; /* socket type handed to the checkout layers */
+       switch(transport->protocol) {
+       case KR_TRANSPORT_UDP:
+               type = SOCK_DGRAM;
+               break;
+       case KR_TRANSPORT_TCP:
+       case KR_TRANSPORT_TLS:
+               type = SOCK_STREAM; /* stream transports; was SOCK_PACKET, an obsolete packet-socket type */
+               break;
+       default:
+               assert(0); /* no other protocol is expected at checkout */
+       }
        int state = request->state;
-       ITERATE_LAYERS(request, qry, checkout, packet, dst, type);
+       ITERATE_LAYERS(request, qry, checkout, packet, &transport->address.ip, type);
        if (request->state & KR_STATE_FAIL) {
                request->state = state; /* Restore */
                return kr_error(ECANCELED);
@@ -1606,26 +1461,17 @@ int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
        WITH_VERBOSE(qry) {
 
        KR_DNAME_GET_STR(qname_str, knot_pkt_qname(packet));
+       KR_DNAME_GET_STR(ns_name, transport->name);
        KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
        KR_RRTYPE_GET_STR(type_str, knot_pkt_qtype(packet));
+       const char *ns_str = kr_straddr(&transport->address.ip);
 
-       for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
-               struct sockaddr *addr = &qry->ns.addr[i].ip;
-               if (addr->sa_family == AF_UNSPEC) {
-                       break;
-               }
-               if (!kr_inaddr_equal(dst, addr)) {
-                       continue;
-               }
-               const char *ns_str = kr_straddr(addr);
-               VERBOSE_MSG(qry,
-                       "=> id: '%05u' querying: '%s' score: %u zone cut: '%s' "
+       VERBOSE_MSG(qry,
+                       "=> id: '%05u' querying: '%s'@'%s' zone cut: '%s' "
                        "qname: '%s' qtype: '%s' proto: '%s'\n",
-                       qry->id, ns_str ? ns_str : "", qry->ns.score, zonecut_str,
+                       qry->id, ns_name, ns_str ? ns_str : "", zonecut_str,
                        qname_str, type_str, (qry->flags.TCP) ? "tcp" : "udp");
-
-               break;
-       }}
+       }
 
        return kr_ok();
 }
index db596a387624b9c38e5ba702f4a1df4698aef9b3..b9bbebfde47a4bb48a78a3ea85293ac62922fca2 100644 (file)
@@ -13,7 +13,7 @@
 #include "lib/layer.h"
 #include "lib/generic/map.h"
 #include "lib/generic/array.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/rplan.h"
 #include "lib/module.h"
 #include "lib/cache/api.h"
@@ -147,9 +147,7 @@ struct kr_context
        map_t negative_anchors;
        struct kr_zonecut root_hints;
        struct kr_cache cache;
-       kr_nsrep_rtt_lru_t *cache_rtt;
        unsigned cache_rtt_tout_retry_interval;
-       kr_nsrep_lru_t *cache_rep;
        module_array_t *modules;
        /* The cookie context structure should not be held within the cookies
         * module because of better access. */
@@ -167,6 +165,9 @@ struct kr_request_qsource_flags {
        bool http:1; /**< true if the request is on HTTP; only meaningful if (dst_addr). */
 };
 
+typedef bool (*addr_info_f)(struct sockaddr*);
+typedef void (*async_resolution_f)(knot_dname_t*, enum knot_rr_type);
+
 /**
  * Name resolution request.
  *
@@ -195,7 +196,7 @@ struct kr_request {
        } qsource;
        struct {
                unsigned rtt;                  /**< Current upstream RTT */
-               const struct sockaddr *addr;   /**< Current upstream address */
+               const struct kr_transport *transport;   /**< Current upstream transport */
        } upstream;                        /**< Upstream information, valid only in consume() phase */
        struct kr_qflags options;
        int state;
@@ -219,6 +220,14 @@ struct kr_request {
        trace_callback_f trace_finish; /**< Request finish tracepoint */
        int vars_ref; /**< Reference to per-request variable table. LUA_NOREF if not set. */
        knot_mm_t pool;
+       struct {
+               addr_info_f is_tls_capable;
+               addr_info_f is_tcp_connected;
+               addr_info_f is_tcp_waiting;
+               async_resolution_f async_ns_resolution;
+               union inaddr *forwarding_targets; /**< When forwarding, possible targets are put here */
+               size_t forward_targets_num;
+       } selection_context;
        unsigned int uid; /** for logging purposes only */
        unsigned int count_no_nsaddr;
        unsigned int count_fail_row;
@@ -256,7 +265,7 @@ int kr_resolve_begin(struct kr_request *request, struct kr_context *ctx, knot_pk
  * @return         any state
  */
 KR_EXPORT
-int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, knot_pkt_t *packet);
+int kr_resolve_consume(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet);
 
 /**
  * Produce either next additional query or finish.
@@ -272,7 +281,7 @@ int kr_resolve_consume(struct kr_request *request, const struct sockaddr *src, k
  * @return         any state
  */
 KR_EXPORT
-int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *type, knot_pkt_t *packet);
+int kr_resolve_produce(struct kr_request *request, struct kr_transport **transport, knot_pkt_t *packet);
 
 /**
  * Finalises the outbound query packet with the knowledge of the IP addresses.
@@ -288,7 +297,7 @@ int kr_resolve_produce(struct kr_request *request, struct sockaddr **dst, int *t
  */
 KR_EXPORT
 int kr_resolve_checkout(struct kr_request *request, const struct sockaddr *src,
-                        struct sockaddr *dst, int type, knot_pkt_t *packet);
+                        struct kr_transport *transport, knot_pkt_t *packet);
 
 /**
  * Finish resolution and commit results if the state is DONE.
@@ -318,4 +327,3 @@ struct kr_rplan *kr_resolve_plan(struct kr_request *request);
  */
 KR_EXPORT KR_PURE
 knot_mm_t *kr_resolve_pool(struct kr_request *request);
-
index 18dc6b8276c0b3186b3ebd130d881b8a643a4a72..02e32d3e469b5e54b1c09d189c8178785ad40806 100644 (file)
@@ -159,22 +159,13 @@ static struct kr_query *kr_rplan_push_query(struct kr_rplan *rplan,
        qry->flags = rplan->request->options;
        qry->parent = parent;
        qry->request = rplan->request;
-       qry->ns.ctx = rplan->request->ctx;
-       qry->ns.addr[0].ip.sa_family = AF_UNSPEC;
+
        gettimeofday(&qry->timestamp, NULL);
        qry->timestamp_mono = kr_now();
        qry->creation_time_mono = parent ? parent->creation_time_mono : qry->timestamp_mono;
        kr_zonecut_init(&qry->zone_cut, (const uint8_t *)"", rplan->pool);
        qry->reorder = qry->flags.REORDER_RR ? kr_rand_bytes(sizeof(qry->reorder)) : 0;
 
-       /* When forwarding, keep the nameserver addresses. */
-       if (parent && parent->flags.FORWARD && qry->flags.FORWARD) {
-               ret = kr_nsrep_copy_set(&qry->ns, &parent->ns);
-               if (ret) {
-                       query_free(rplan->pool, qry);
-                       return NULL;
-               }
-       }
 
        assert((rplan->pending.len == 0 && rplan->resolved.len == 0)
                == (rplan->initial == NULL));
index 7575ecd5b11c9e6f810599529ec888537647d524..4690daf054958f44c65de9f5dfa5328b7b464d42 100644 (file)
@@ -8,9 +8,9 @@
 #include <libknot/dname.h>
 #include <libknot/codes.h>
 
+#include "lib/selection.h"
 #include "lib/cache/api.h"
 #include "lib/zonecut.h"
-#include "lib/nsrep.h"
 
 /** Query flags */
 struct kr_qflags {
@@ -101,8 +101,7 @@ struct kr_query {
        struct kr_query *cname_parent;
        struct kr_request *request; /**< Parent resolution request. */
        kr_stale_cb stale_cb; /**< See the type */
-       /* Beware: this must remain the last, because of lua bindings. */
-       struct kr_nsrep ns;
+       struct kr_server_selection server_selection;
 };
 
 /** @cond internal Array of queries. */
diff --git a/lib/selection.c b/lib/selection.c
new file mode 100644 (file)
index 0000000..48a0739
--- /dev/null
@@ -0,0 +1,333 @@
+#include <libknot/dname.h>
+
+#include "lib/selection.h"
+#include "lib/selection_forward.h"
+#include "lib/selection_iter.h"
+#include "lib/generic/pack.h"
+#include "lib/generic/trie.h"
+#include "lib/rplan.h"
+#include "lib/cache/api.h"
+#include "lib/resolve.h"
+
+#include "daemon/worker.h"
+#include "daemon/tls.h"
+
+#include "lib/utils.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "nsrep",  __VA_ARGS__)
+
+/** @internal Macro to set address structure.
+ *
+ * `sa` is a member-name prefix (e.g. `dst->ip4.sin`); the macro token-pastes
+ * `_addr`, `_family` and `_port` onto it. It copies `len` bytes from `addr`
+ * into the `_addr` member, stores `family`, and stores `port` converted with
+ * htons(). No other member of the structure is written. */
+#define ADDR_SET(sa, family, addr, len, port) do {\
+       memcpy(&sa ## _addr, (addr), (len)); \
+       sa ## _family = (family); \
+       sa ## _port = htons(port); \
+} while (0)
+
+/* Simple cache interface follows */
+
+#define KEY_PREFIX 'S'
+
+/**
+ * Build a cache key: one KEY_PREFIX byte followed by `len` raw address bytes.
+ *
+ * @param ip  address bytes to copy into the key
+ * @param len number of bytes in `ip`
+ * @return malloc()ed buffer of `len + 1` bytes that the caller must free(),
+ *         or NULL when the allocation fails
+ */
+void *prefix_key(const uint8_t *ip, size_t len) {
+    // Use a byte pointer: arithmetic on void * is not standard C.
+    uint8_t *key = malloc(len + 1);
+    if (!key) {
+        return NULL;
+    }
+    key[0] = KEY_PREFIX;
+    memcpy(key + 1, ip, len);
+    return key;
+}
+
+#undef KEY_PREFIX
+
+/**
+ * Look up the stored RTT state of an address in the cache database.
+ *
+ * The lookup key is the prefixed address built by prefix_key().
+ *
+ * @param ip    raw address bytes
+ * @param len   number of bytes in `ip`
+ * @param cache cache whose backend is queried
+ * @return the stored state, or {-1, -1} ("no value") when the key cannot be
+ *         built, the read fails, or the stored record has an unexpected size
+ */
+struct rtt_state get_rtt_state(const uint8_t *ip, size_t len, struct kr_cache *cache) {
+    struct rtt_state state = {-1, -1}; // No value
+    knot_db_val_t value = {.len = 0, .data = NULL};
+    knot_db_t *db = cache->db;
+    struct kr_cdb_stats *stats = &cache->stats;
+    uint8_t *prefixed_ip = prefix_key(ip, len);
+    if (!prefixed_ip) {
+        return state;
+    }
+
+    knot_db_val_t key = {.len = len + 1, .data = prefixed_ip};
+
+    // A zero return from read() means the record was found.
+    if (cache->api->read(db, stats, &key, &value, 1) == 0
+        && value.data != NULL && value.len == sizeof(state)) {
+        // Check the size at run time (assert() may be compiled out) and copy
+        // bytewise, as nothing here guarantees the alignment of value.data.
+        memcpy(&state, value.data, sizeof(state));
+    }
+
+    free(prefixed_ip);
+    return state;
+}
+
+/**
+ * Store the RTT state of an address in the cache database and commit it.
+ *
+ * @param ip    raw address bytes
+ * @param len   number of bytes in `ip`
+ * @param state value to store
+ * @param cache cache whose backend is written to
+ * @return the result of the backend write(); the result of commit() is not reported
+ */
+int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state, struct kr_cache *cache) {
+    uint8_t *cache_key = prefix_key(ip, len);
+
+    knot_db_val_t key = {.len = len + 1, .data = cache_key};
+    knot_db_val_t value = {.len = sizeof(struct rtt_state), .data = &state};
+
+    const int write_ret = cache->api->write(cache->db, &cache->stats, &key, &value, 1);
+    cache->api->commit(cache->db, &cache->stats);
+
+    free(cache_key);
+    return write_ret;
+}
+
+/* IP helper functions */
+
+/**
+ * Fill `dst` with a socket address built from raw address bytes.
+ *
+ * The address family is derived from `len` (size of struct in_addr or
+ * struct in6_addr); the port is set to 0. `dst` is zeroed first because
+ * ADDR_SET writes only the address, family and port members, and callers
+ * pass uninitialised unions.
+ *
+ * @param bytes raw address bytes
+ * @param len   number of bytes in `bytes`
+ * @param dst   destination, fully overwritten
+ */
+void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst) {
+    memset(dst, 0, sizeof(*dst));
+    switch(len) {
+    case sizeof(struct in_addr):
+        ADDR_SET(dst->ip4.sin, AF_INET, bytes, len, 0);
+        break;
+    case sizeof(struct in6_addr):
+        ADDR_SET(dst->ip6.sin6, AF_INET6, bytes, len, 0);
+        break;
+    default:
+        assert(0); // Unsupported length; dst stays all-zero when asserts are off
+    }
+}
+
+/**
+ * Return a pointer to the raw address bytes stored inside `src`.
+ *
+ * @param src socket address union
+ * @param len size of struct in_addr or of struct in6_addr; selects which
+ *            member of `src` is used
+ * @return pointer into `src` (not a copy), or NULL for an unsupported `len`
+ */
+uint8_t* ip_to_bytes(const union inaddr *src, size_t len) {
+    switch(len) {
+    case sizeof(struct in_addr):
+        return (uint8_t *)&src->ip4.sin_addr;
+    case sizeof(struct in6_addr):
+        return (uint8_t *)&src->ip6.sin6_addr;
+    default:
+        assert(0);
+        // Do not fall off the end of the function when asserts are compiled out.
+        return NULL;
+    }
+}
+
+#define DEFAULT_TIMEOUT 200
+#define MINIMAL_TIMEOUT_ADDITION 20
+
+// Timeout derived from the RTT state in the manner of RFC2988, sec. 2:
+// srtt plus four times the variance, where the added part is never smaller
+// than MINIMAL_TIMEOUT_ADDITION. An address without any measurement
+// (srtt and variance both -1) gets DEFAULT_TIMEOUT.
+int32_t calc_timeout(struct rtt_state state) {
+    const bool no_measurement = (state.srtt == -1 && state.variance == -1);
+    int32_t timeout;
+
+    if (no_measurement) {
+        timeout = DEFAULT_TIMEOUT;
+    } else {
+        timeout = state.srtt + MAX(4 * state.variance, MINIMAL_TIMEOUT_ADDITION);
+    }
+    return timeout;
+}
+
+// Exponentially weighted update of srtt and variance modelled on RFC2988, sec. 2.
+// The first measurement R yields srtt = R and variance = R/2.
+// NOTE(review): RFC2988 weights the SRTT update with alpha = 1/8 and the
+// RTTVAR update with beta = 1/4; the weights below are the other way round.
+// Confirm that this is intentional before calling it "verbatim".
+struct rtt_state calc_rtt_state(struct rtt_state old, unsigned new_rtt) {
+    // Do the arithmetic in a signed type: subtracting an unsigned value from
+    // int32_t would be carried out in unsigned arithmetic before abs().
+    const int32_t sample = (int32_t)new_rtt;
+
+    if (old.srtt == -1 && old.variance == -1) {
+        return (struct rtt_state){sample, sample / 2};
+    }
+
+    struct rtt_state ret;
+
+    ret.srtt = (int32_t)(0.75 * old.srtt + 0.25 * sample);
+    ret.variance = (int32_t)(0.875 * old.variance + 0.125 * abs(old.srtt - sample));
+
+    return ret;
+}
+
+// Store in address_state->tls_capable the answer of the request's
+// is_tls_capable callback for `address`; false when no callback is set.
+void check_tls_capable(struct address_state *address_state, struct kr_request *req, struct sockaddr *address) {
+    if (req->selection_context.is_tls_capable) {
+        address_state->tls_capable = req->selection_context.is_tls_capable(address);
+    } else {
+        address_state->tls_capable = false;
+    }
+}
+
+// Store in address_state the answers of the request's is_tcp_connected and
+// is_tcp_waiting callbacks for `address`; a missing callback counts as false.
+void check_tcp_connections(struct address_state *address_state, struct kr_request *req, struct sockaddr *address) {
+    if (req->selection_context.is_tcp_connected) {
+        address_state->tcp_connected = req->selection_context.is_tcp_connected(address);
+    } else {
+        address_state->tcp_connected = false;
+    }
+
+    if (req->selection_context.is_tcp_waiting) {
+        address_state->tcp_waiting = req->selection_context.is_tcp_waiting(address);
+    } else {
+        address_state->tcp_waiting = false;
+    }
+}
+
+// Invalidate the address (generation = -1) when its family, recognised by
+// the address length, is disabled by the no_ipv4 / no_ipv6 flags.
+void check_network_settings(struct address_state *address_state, size_t address_len, bool no_ipv4, bool no_ipv6) {
+    const bool ipv4_disabled = no_ipv4 && address_len == sizeof(struct in_addr);
+    const bool ipv6_disabled = no_ipv6 && address_len == sizeof(struct in6_addr);
+
+    if (ipv4_disabled || ipv6_disabled) {
+        address_state->generation = -1;
+    }
+}
+
+/**
+ * qsort() comparator for struct choice.
+ *
+ * Orders by ascending error_count, then by ascending calc_timeout() of the
+ * RTT state. Values are compared rather than subtracted, so the result cannot
+ * overflow.
+ *
+ * @return negative, zero or positive when `a` sorts before, equal to or after `b`
+ */
+int cmp_choices(const void *a, const void *b) {
+    const struct choice *a_ = a;
+    const struct choice *b_ = b;
+
+    const int a_errors = a_->address_state->error_count;
+    const int b_errors = b_->address_state->error_count;
+    if (a_errors != b_errors) {
+        return (a_errors > b_errors) - (a_errors < b_errors);
+    }
+
+    const int32_t a_timeout = calc_timeout(a_->address_state->rtt_state);
+    const int32_t b_timeout = calc_timeout(b_->address_state->rtt_state);
+    return (a_timeout > b_timeout) - (a_timeout < b_timeout);
+}
+
+#define ERROR_LIMIT 2
+
+/**
+ * Performs the actual selection (currently epsilon-greedy with epsilon = 0.05).
+ *
+ * With probability 1/20, or whenever there is no address to pick, a random
+ * entry out of `unresolved` and `choices` is taken (explore). Otherwise the
+ * choices are sorted with cmp_choices() and the first one is taken (exploit).
+ *
+ * @param choices           candidate addresses; reordered in place when exploiting
+ * @param choices_len       number of entries in `choices`
+ * @param unresolved        names without a known address (unused when unresolved_len is 0)
+ * @param unresolved_len    number of entries in `unresolved`
+ * @param mempool           pool the returned transport is allocated from
+ * @param tcp               select KR_TRANSPORT_TCP instead of KR_TRANSPORT_UDP
+ * @param out_forward_index if not NULL, receives forward_index of the chosen address
+ * @return newly allocated transport (protocol KR_TRANSPORT_NOADDR when a name
+ *         is to be resolved first), or NULL when there is nothing to choose
+ *         from, the best choice has more than ERROR_LIMIT errors, the chosen
+ *         address has an unsupported length, or the allocation fails
+ */
+struct kr_transport *choose_transport(struct choice choices[],
+                                             int choices_len,
+                                             knot_dname_t **unresolved,
+                                             int unresolved_len,
+                                             struct knot_mm *mempool,
+                                             bool tcp,
+                                             size_t *out_forward_index) {
+
+    if (choices_len < 0 || unresolved_len < 0 || choices_len + unresolved_len == 0) {
+        // Nothing to choose from; this also keeps the modulo below away from zero.
+        return NULL;
+    }
+
+    int choice = 0;
+    knot_dname_t *name_to_resolve = NULL;
+    bool resolve_name = false;
+
+    if (kr_rand_coin(1, 20) || choices_len == 0) {
+        // EXPLORE
+        int index = kr_rand_bytes(1) % (choices_len + unresolved_len);
+        if (index < unresolved_len) {
+            // We will resolve a new NS name
+            resolve_name = true;
+            name_to_resolve = unresolved[index];
+        } else {
+            choice = index - unresolved_len;
+        }
+    } else {
+        // EXPLOIT
+        qsort(choices, choices_len, sizeof(struct choice), cmp_choices);
+        if (choices[0].address_state->error_count > ERROR_LIMIT) {
+            return NULL;
+        }
+        choice = 0;
+    }
+
+    // Allocate only once we know that a transport will be returned.
+    struct kr_transport *transport = mm_alloc(mempool, sizeof(*transport));
+    if (!transport) {
+        return NULL;
+    }
+    memset(transport, 0, sizeof(*transport));
+
+    if (resolve_name) {
+        *transport = (struct kr_transport) {
+            .protocol = KR_TRANSPORT_NOADDR,
+            .name = name_to_resolve
+        };
+        return transport;
+    }
+
+    *transport = (struct kr_transport) {
+        .name = choices[choice].address_state->name,
+        .protocol = tcp ? KR_TRANSPORT_TCP : KR_TRANSPORT_UDP,
+        .timeout = calc_timeout(choices[choice].address_state->rtt_state),
+    };
+
+    int port = KR_DNS_PORT;
+    switch (transport->protocol)
+    {
+    case KR_TRANSPORT_TLS:
+        port = KR_DNS_TLS_PORT;
+        break;
+    case KR_TRANSPORT_UDP:
+    case KR_TRANSPORT_TCP:
+        port = KR_DNS_PORT;
+        break;
+    default:
+        assert(0);
+        break;
+    }
+
+    switch (choices[choice].address_len)
+    {
+    case sizeof(struct in_addr):
+        ADDR_SET(transport->address.ip4.sin, AF_INET, choices[choice].address, choices[choice].address_len, port);
+        transport->address_len = choices[choice].address_len;
+        break;
+    case sizeof(struct in6_addr):
+        ADDR_SET(transport->address.ip6.sin6, AF_INET6, choices[choice].address, choices[choice].address_len, port);
+        transport->address_len = choices[choice].address_len;
+        break;
+    default:
+        assert(0);
+        // Do not hand out a transport with an empty address.
+        return NULL;
+    }
+
+    if (out_forward_index) {
+        *out_forward_index = choices[choice].address_state->forward_index;
+    }
+
+    return transport;
+
+}
+
+// Fold a newly measured `rtt` into the RTT state of the address used by
+// `transport` and store the result in the cache under that address.
+// Does nothing when `transport` is NULL. Only the cache is written here;
+// addr_state->rtt_state itself is read but not modified. The return value of
+// put_rtt_state() is not checked.
+void update_rtt(struct kr_query *qry, struct address_state *addr_state, const struct kr_transport *transport, unsigned rtt) {
+    if (!transport) {
+        return;
+    }
+
+    struct kr_cache *cache = &qry->request->ctx->cache;
+    struct rtt_state new_rtt_state = calc_rtt_state(addr_state->rtt_state, rtt);
+    uint8_t *address = ip_to_bytes(&transport->address, transport->address_len);
+       put_rtt_state(address, transport->address_len, new_rtt_state, cache);
+
+    // Verbose log of the update
+    WITH_VERBOSE(qry) {
+
+       KR_DNAME_GET_STR(ns_name, transport->name);
+       KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+       const char *ns_str = kr_straddr(&transport->address.ip);
+
+       VERBOSE_MSG(qry,
+                       "=> id: '%05u' updating: '%s'@'%s' zone cut: '%s' with rtt %u to srtt: %d and variance: %d \n",
+                       qry->id, ns_name, ns_str ? ns_str : "", zonecut_str, rtt, new_rtt_state.srtt, new_rtt_state.variance);
+       }
+}
+
+
+/**
+ * Note a selection error for the address that `transport` used.
+ *
+ * Increments the per-kind counter errors[sel_error] and the total
+ * error_count in `addr_state`, then logs the event when verbose.
+ * Does nothing when `transport` or `addr_state` is NULL, or when `sel_error`
+ * is not a valid index into errors[].
+ */
+void error(struct kr_query *qry, struct address_state *addr_state, const struct kr_transport *transport, enum kr_selection_error sel_error) {
+    if (!transport || !addr_state) {
+        return;
+    }
+
+    if ((unsigned)sel_error >= KR_SELECTION_NUMBER_OF_ERRORS) {
+        assert(0);
+        // Never index errors[] out of bounds when asserts are compiled out.
+        return;
+    }
+
+    addr_state->errors[sel_error]++;
+    addr_state->error_count++;
+
+    WITH_VERBOSE(qry) {
+        KR_DNAME_GET_STR(ns_name, transport->name);
+        KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+        const char *ns_str = kr_straddr(&transport->address.ip);
+
+        VERBOSE_MSG(qry,
+                    "=> id: '%05u' noting selection error: '%s'@'%s' zone cut: '%s' error no.:%d\n",
+                    qry->id, ns_name, ns_str ? ns_str : "", zonecut_str, sel_error);
+    }
+}
+
+
+
+// Install the server selection callbacks in qry->server_selection and create
+// their local state in the request pool. Queries with the FORWARD or STUB flag
+// get the forward_* implementation, all others the iter_* one.
+void kr_server_selection_init(struct kr_query *qry) {
+    struct kr_request *req = qry->request;
+    struct knot_mm *mempool = &req->pool;
+    const bool forwarding = qry->flags.FORWARD || qry->flags.STUB;
+
+    if (!forwarding) {
+        qry->server_selection = (struct kr_server_selection){
+            .initialized = true,
+            .choose_transport = iter_choose_transport,
+            .success = iter_success,
+            .update_rtt = iter_update_rtt,
+            .error = iter_error,
+            .local_state = NULL,
+        };
+        iter_local_state_init(mempool, &qry->server_selection.local_state);
+        return;
+    }
+
+    qry->server_selection = (struct kr_server_selection){
+        .initialized = true,
+        .choose_transport = forward_choose_transport,
+        .success = forward_success,
+        .update_rtt = forward_update_rtt,
+        .error = forward_error,
+        .local_state = NULL,
+    };
+    forward_local_state_init(mempool, &qry->server_selection.local_state, req);
+}
+
+/**
+ * Store `sock` as forwarding target number `index` of the request.
+ *
+ * The target array is allocated from the request pool on first use, sized by
+ * selection_context.forward_targets_num, and zeroed so that slots that were
+ * never filled do not hold indeterminate data.
+ *
+ * @param req   request; forward_targets_num must already be set
+ * @param index slot to fill, must be lower than forward_targets_num
+ * @param sock  AF_INET or AF_INET6 socket address to copy
+ * @return kr_ok() on success, kr_error(EINVAL) for a NULL argument, an
+ *         out-of-range index or an unsupported address family,
+ *         kr_error(ENOMEM) when the array cannot be allocated
+ */
+int kr_forward_add_target(struct kr_request *req, size_t index, const struct sockaddr *sock) {
+    if (!req || !sock || index >= req->selection_context.forward_targets_num) {
+        return kr_error(EINVAL);
+    }
+
+    if (!req->selection_context.forwarding_targets) {
+        const size_t size = req->selection_context.forward_targets_num * sizeof(union inaddr);
+        union inaddr *targets = mm_alloc(&req->pool, size);
+        if (!targets) {
+            return kr_error(ENOMEM);
+        }
+        memset(targets, 0, size);
+        req->selection_context.forwarding_targets = targets;
+    }
+
+    switch (sock->sa_family) {
+        case AF_INET:
+            req->selection_context.forwarding_targets[index].ip4 = *(const struct sockaddr_in *)sock;
+            break;
+        case AF_INET6:
+            req->selection_context.forwarding_targets[index].ip6 = *(const struct sockaddr_in6 *)sock;
+            break;
+        default:
+            return kr_error(EINVAL);
+    }
+
+    return kr_ok();
+}
+
diff --git a/lib/selection.h b/lib/selection.h
new file mode 100644 (file)
index 0000000..f8c4913
--- /dev/null
@@ -0,0 +1,111 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "lib/cache/api.h"
+
+// Kinds of failure that can be reported for a transport through the `error`
+// callback of struct kr_server_selection. The values are also used as indices
+// into address_state::errors.
+enum kr_selection_error {
+    // Network errors
+    KR_SELECTION_TIMEOUT,
+    KR_SELECTION_TLS_HANDSHAKE_FAILED,
+    KR_SELECTION_TCP_CONNECT_FAILED,
+    KR_SELECTION_TCP_CONNECT_TIMEOUT,
+
+    // RCODEs
+    KR_SELECTION_REFUSED,
+    KR_SELECTION_SERVFAIL,
+    KR_SELECTION_FORMERROR,
+    KR_SELECTION_NOTIMPL,
+    KR_SELECTION_OTHER_RCODE,
+    KR_SELECTION_TRUNCATED,
+
+    // DNS errors
+    KR_SELECTION_DNSSEC_ERROR,
+    KR_SELECTION_LAME_DELEGATION,
+
+    KR_SELECTION_NUMBER_OF_ERRORS // Leave this last as it is used as array size.
+};
+
+// Protocol of a struct kr_transport.
+enum kr_transport_protocol {
+    KR_TRANSPORT_NOADDR = 0, // No address yet: choose_transport() returns this with only `name` set, for a name to be resolved
+    KR_TRANSPORT_UDP,
+    KR_TRANSPORT_TCP,
+    KR_TRANSPORT_TLS,
+};
+
+// Result of server selection: which server to contact and how.
+struct kr_transport {
+    knot_dname_t *name; // Name associated with the address (or the name to resolve for KR_TRANSPORT_NOADDR)
+    union inaddr address; // Socket address including port; left zeroed for KR_TRANSPORT_NOADDR
+    size_t address_len; // Length of the raw address: sizeof(struct in_addr) or sizeof(struct in6_addr)
+    enum kr_transport_protocol protocol;
+    unsigned timeout; // Value of calc_timeout() for the address; presumably milliseconds -- TODO confirm
+};
+
+// Per-query server selection interface, filled in by kr_server_selection_init()
+// with either the iter_* or the forward_* implementation.
+struct kr_server_selection
+{
+    bool initialized; // Set to true by kr_server_selection_init()
+    // Store the chosen transport in *transport; implementations may store NULL.
+    void (*choose_transport)(struct kr_query *qry, struct kr_transport **transport);
+    void (*success)(struct kr_query *qry, const struct kr_transport *transport);
+    // Report a measured round-trip time for the transport.
+    void (*update_rtt)(struct kr_query *qry, const struct kr_transport *transport, unsigned rtt);
+    // Report a failure of the given kind for the transport.
+    void (*error)(struct kr_query *qry, const struct kr_transport *transport, enum kr_selection_error error);
+
+    void *local_state; // Implementation-private state created by the *_local_state_init() functions
+};
+
+// Initialize server selection structure inside qry.
+// Queries with the FORWARD or STUB flag get the forwarding implementation,
+// other queries the iterative one.
+KR_EXPORT
+void kr_server_selection_init(struct kr_query *qry);
+
+// Copy `sock` (AF_INET or AF_INET6) into slot `index` of the request's
+// forwarding targets; the array is allocated on first use with
+// selection_context.forward_targets_num entries.
+// Returns kr_ok(), or kr_error(EINVAL) for an unsupported address family.
+KR_EXPORT
+int kr_forward_add_target(struct kr_request *req, size_t index, const struct sockaddr *sock);
+
+// To be held per IP address in the global LMDB cache
+// srtt == -1 together with variance == -1 means "no value" (see get_rtt_state()).
+struct rtt_state {
+    int32_t srtt; // Smoothed round-trip time
+    int32_t variance; // Smoothed deviation of the round-trip time
+};
+
+// To be held per IP address and locally
+struct address_state {
+    unsigned int generation; // Compared with the local state's generation; -1 marks the address as invalidated
+    struct rtt_state rtt_state; // Read from the cache by get_rtt_state()
+    knot_dname_t *name; // Name this address belongs to
+    bool tls_capable : 1; // Set by check_tls_capable()
+    bool tcp_waiting : 1; // Set by check_tcp_connections()
+    bool tcp_connected : 1; // Set by check_tcp_connections()
+
+    int forward_index; // Index among the forwarding targets (set by forward_choose_transport())
+    int error_count; // Total number of errors noted by error()
+       int errors[KR_SELECTION_NUMBER_OF_ERRORS]; // Per-kind error counters, indexed by enum kr_selection_error
+};
+
+// An array of these is one of the inputs of the actual selection algorithm (`choose_transport`)
+struct choice {
+    uint8_t *address; // Raw address bytes (no port)
+    size_t address_len; // sizeof(struct in_addr) or sizeof(struct in6_addr)
+    struct address_state *address_state; // State kept for this address
+};
+
+// Pick one of `choices` or one of the `unresolved` names; the result is
+// allocated from `mempool` and may be NULL.
+struct kr_transport *choose_transport(struct choice choices[],
+                                             int choices_len,
+                                             knot_dname_t **unresolved,
+                                             int unresolved_len,
+                                             struct knot_mm *mempool,
+                                             bool tcp,
+                                             size_t *out_forward_index);
+// Shared implementations behind the update_rtt and error callbacks;
+// both do nothing when `transport` is NULL.
+void update_rtt(struct kr_query *qry, struct address_state *addr_state, const struct kr_transport *transport, unsigned rtt);
+void error(struct kr_query *qry, struct address_state *addr_state, const struct kr_transport *transport, enum kr_selection_error sel_error);
+
+// Read / write the RTT state stored in the cache for the raw address `ip` of `len` bytes.
+struct rtt_state get_rtt_state(const uint8_t *ip, size_t len, struct kr_cache *cache);
+int put_rtt_state(const uint8_t *ip, size_t len, struct rtt_state state, struct kr_cache *cache);
+
+// Convert between raw address bytes and union inaddr; `len` selects IPv4 or IPv6.
+void bytes_to_ip(uint8_t *bytes, size_t len, union inaddr *dst);
+uint8_t* ip_to_bytes(const union inaddr *src, size_t len);
+
+// Refresh the per-address flags in `address_state`.
+void check_tls_capable(struct address_state *address_state, struct kr_request *req, struct sockaddr *address);
+void check_tcp_connections(struct address_state *address_state, struct kr_request *req, struct sockaddr *address);
+void check_network_settings(struct address_state *address_state, size_t address_len, bool no_ipv4, bool no_ipv6);
+
+
diff --git a/lib/selection_forward.c b/lib/selection_forward.c
new file mode 100644 (file)
index 0000000..b804893
--- /dev/null
@@ -0,0 +1,95 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#include "lib/selection_forward.h"
+#include "lib/resolve.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "nsrep",  __VA_ARGS__)
+
+// Per-query state of the forwarding server selection.
+struct forward_local_state {
+    union inaddr *targets; // Forwarding targets, taken from the request's selection_context
+    size_t target_num; // Number of entries in `targets` and `addr_states`
+    struct address_state *addr_states; // One state per target, same indexing as `targets`
+    size_t last_choice_index; // forward_index of the address last returned by choose_transport()
+};
+
+// Allocate a zeroed forward_local_state from `mm`, point it at the request's
+// forwarding targets and give it one zeroed address_state per target.
+// The request must already have its forwarding targets set.
+void forward_local_state_init(struct knot_mm *mm, void **local_state, struct kr_request *req) {
+    assert(req->selection_context.forwarding_targets);
+
+    struct forward_local_state *fwd = mm_alloc(mm, sizeof(struct forward_local_state));
+    *local_state = fwd;
+    memset(fwd, 0, sizeof(struct forward_local_state));
+
+    fwd->targets = req->selection_context.forwarding_targets;
+    fwd->target_num = req->selection_context.forward_targets_num;
+
+    const size_t states_size = sizeof(struct address_state) * fwd->target_num;
+    fwd->addr_states = mm_alloc(mm, states_size);
+    memset(fwd->addr_states, 0, states_size);
+}
+
+/**
+ * Choose one of the forwarding targets of the query.
+ *
+ * Refreshes the state of every target (TLS/TCP flags, network settings, RTT
+ * from the cache), collects the targets that were not invalidated and lets
+ * choose_transport() pick one. The forward_index of the pick is remembered
+ * in local_state->last_choice_index.
+ *
+ * @param qry       query with forwarding local state
+ * @param transport receives the chosen transport, or NULL when there is no
+ *                  usable target or choose_transport() returns NULL
+ */
+void forward_choose_transport(struct kr_query *qry, struct kr_transport **transport) {
+    struct forward_local_state *local_state = qry->server_selection.local_state;
+
+    *transport = NULL;
+    if (local_state->target_num == 0) {
+        // A variable-length array must not have zero elements.
+        return;
+    }
+
+    struct choice choices[local_state->target_num];
+    int valid = 0;
+
+    for (size_t i = 0; i < local_state->target_num; i++) {
+        union inaddr *address = &local_state->targets[i];
+        size_t addr_len;
+        switch (address->ip.sa_family) {
+            case AF_INET:
+                addr_len = sizeof(struct in_addr);
+                break;
+            case AF_INET6:
+                addr_len = sizeof(struct in6_addr);
+                break;
+            default:
+                assert(0);
+                // Skip the target instead of using an uninitialised length.
+                continue;
+        }
+
+        struct address_state *addr_state = &local_state->addr_states[i];
+        addr_state->name = (knot_dname_t *)"";
+        check_tls_capable(addr_state, qry->request, &address->ip);
+        check_tcp_connections(addr_state, qry->request, &address->ip);
+        check_network_settings(addr_state, addr_len, qry->flags.NO_IPV4, qry->flags.NO_IPV6);
+
+        if(addr_state->generation == -1) {
+            continue;
+        }
+        addr_state->forward_index = (int)i;
+
+        uint8_t *address_bytes = ip_to_bytes(address, addr_len);
+        addr_state->rtt_state = get_rtt_state(address_bytes, addr_len, &qry->request->ctx->cache);
+        if (VERBOSE_STATUS) {
+            const char *ns_str = kr_straddr(&address->ip);
+            printf("[nsrep] rtt of %s is %d, variance is %d\n", ns_str ? ns_str : "", addr_state->rtt_state.srtt, addr_state->rtt_state.variance);
+        }
+
+        choices[valid++] = (struct choice){
+            .address = address_bytes,
+            .address_len = addr_len,
+            .address_state = addr_state,
+        };
+    }
+
+    if (valid == 0) {
+        // Every target was skipped or invalidated; there is nothing to choose from.
+        return;
+    }
+
+    *transport = choose_transport(choices, valid, NULL, 0, &qry->request->pool, qry->flags.TCP, &local_state->last_choice_index);
+}
+
+// Success callback of the forwarding selection; nothing is recorded.
+void forward_success(struct kr_query *qry, const struct kr_transport *transport) {
+    (void)qry;
+    (void)transport;
+}
+
+void forward_error(struct kr_query *qry, const struct kr_transport *transport, enum kr_selection_error sel_error) {
+    struct forward_local_state *local_state = qry->server_selection.local_state;
+       struct address_state *addr_state = &local_state->addr_states[local_state->last_choice_index];
+    error(qry, addr_state, transport, sel_error);
+}
+
+// RTT callback of the forwarding selection: the measurement is applied to the
+// address state at last_choice_index. A NULL transport is ignored.
+void forward_update_rtt(struct kr_query *qry, const struct kr_transport *transport, unsigned rtt) {
+    if (transport) {
+        struct forward_local_state *fwd = qry->server_selection.local_state;
+        update_rtt(qry, &fwd->addr_states[fwd->last_choice_index], transport, rtt);
+    }
+}
\ No newline at end of file
diff --git a/lib/selection_forward.h b/lib/selection_forward.h
new file mode 100644 (file)
index 0000000..5a996e0
--- /dev/null
@@ -0,0 +1,14 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "lib/selection.h"
+#include "lib/resolve.h"
+
+// Create the forwarding local state in `mm`; the request must already hold its forwarding targets.
+void forward_local_state_init(struct knot_mm *mm, void **local_state, struct kr_request *req);
+// Implementations of the struct kr_server_selection callbacks for forwarding.
+void forward_choose_transport(struct kr_query *qry, struct kr_transport **transport);
+void forward_success(struct kr_query *qry, const struct kr_transport *transport);
+void forward_error(struct kr_query *qry, const struct kr_transport *transport, enum kr_selection_error sel_error);
+void forward_update_rtt(struct kr_query *qry, const struct kr_transport *transport, unsigned rtt);
\ No newline at end of file
diff --git a/lib/selection_iter.c b/lib/selection_iter.c
new file mode 100644 (file)
index 0000000..07ab524
--- /dev/null
@@ -0,0 +1,209 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#include "lib/selection_iter.h"
+#include "lib/selection.h"
+
+#include "lib/generic/trie.h"
+#include "lib/generic/pack.h"
+#include "lib/zonecut.h"
+#include "lib/resolve.h"
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "nsrep",  __VA_ARGS__)
+
+// To be held per query and locally
+struct iter_local_state {
+    trie_t *unresolved_names; // NS name -> struct iter_name_state, for names with no address
+    trie_t *addresses; // Raw address bytes -> struct address_state
+    unsigned int generation; // Used to distinguish old and valid records in tries
+    knot_dname_t *zonecut_name; // Copy of the zone cut name the tries were built for
+};
+
+// To be held per NS name and locally
+struct iter_name_state {
+    unsigned int generation; // Generation of the local state in which the name was last seen without addresses
+};
+
+// Allocate a zeroed iter_local_state from `mm` and store it in *local_state.
+void iter_local_state_init(struct knot_mm *mm, void **local_state) {
+    struct iter_local_state *state = mm_alloc(mm, sizeof(struct iter_local_state));
+    *local_state = state;
+    memset(state, 0, sizeof(struct iter_local_state));
+}
+
+/**
+ * Find the address_state of the address used by `transport`.
+ *
+ * @param local_state iterative selection state holding the `addresses` trie
+ * @param transport   transport whose address is looked up
+ * @return the stored state, or NULL when an argument is NULL, the trie does
+ *         not exist yet, or the address is not in the trie
+ */
+struct address_state *get_address_state(struct iter_local_state *local_state, const struct kr_transport *transport) {
+    if (!local_state || !local_state->addresses || !transport) {
+        return NULL;
+    }
+
+    uint8_t *address = ip_to_bytes(&transport->address, transport->address_len);
+    if (!address) {
+        return NULL;
+    }
+
+    trie_val_t *address_state = trie_get_try(local_state->addresses, (char *)address, transport->address_len);
+    if (!address_state) {
+        assert(0);
+        // Do not dereference NULL when asserts are compiled out.
+        return NULL;
+    }
+    return (struct address_state *)*address_state;
+}
+
+// True when knot_dname_cmp() reports that the two zone cut names differ.
+bool zonecut_changed(knot_dname_t *new, knot_dname_t *old) {
+    const int cmp = knot_dname_cmp(old, new);
+    return cmp != 0;
+}
+
+/**
+ * Refresh the rtt_state of every address in the local state from the cache.
+ *
+ * Prints the loaded values to stdout when VERBOSE_STATUS is set.
+ */
+void iter_update_state_from_rtt_cache(struct iter_local_state *local_state, struct kr_cache *cache) {
+    trie_it_t *it;
+    for(it = trie_it_begin(local_state->addresses); !trie_it_finished(it); trie_it_next(it)) {
+        size_t address_len;
+        uint8_t *address = (uint8_t *)trie_it_key(it, &address_len);
+        struct address_state *address_state = (struct address_state *)*trie_it_val(it);
+        address_state->rtt_state = get_rtt_state(address, address_len, cache);
+
+        union inaddr addr;
+        memset(&addr, 0, sizeof(addr)); // bytes_to_ip() is not relied on to write every member
+        bytes_to_ip(address, address_len, &addr);
+        const char *ns_str = kr_straddr(&addr.ip);
+        if (VERBOSE_STATUS) {
+            // Guard against a NULL string, as the other log sites in this file do.
+            printf("[nsrep] rtt of %s is %d, variance is %d\n", ns_str ? ns_str : "", address_state->rtt_state.srtt, address_state->rtt_state.variance);
+        }
+    }
+    trie_it_free(it);
+}
+
+
+// Synchronise the local state with the NS set of `zonecut`.
+//
+// When the zone cut name differs from the remembered one (or the tries do not
+// exist yet), the local state is reset and fresh tries are created in `mm`.
+// Then the generation counter is bumped and every NS name without addresses
+// and every known address found in zonecut->nsset is stamped with the new
+// generation; entries left with an older generation are thereby outdated.
+void iter_update_state_from_zonecut(struct iter_local_state *local_state, struct kr_zonecut *zonecut, struct knot_mm *mm) {
+       if (zonecut_changed(zonecut->name, local_state->zonecut_name) ||
+        local_state->unresolved_names == NULL || local_state->addresses == NULL) {
+        // Local state initialization
+        // (this also resets `generation` to 0; the previous tries are not freed here)
+        memset(local_state, 0, sizeof(struct iter_local_state));
+        local_state->unresolved_names = trie_create(mm);
+        local_state->addresses = trie_create(mm);
+        local_state->zonecut_name = knot_dname_copy(zonecut->name, mm);
+    }
+
+    local_state->generation++;
+
+    trie_it_t *it;
+    unsigned int current_generation = local_state->generation;
+
+    for(it = trie_it_begin(zonecut->nsset); !trie_it_finished(it); trie_it_next(it)) {
+        // Key is the NS name, value is the pack of its addresses
+        knot_dname_t *dname = (knot_dname_t *)trie_it_key(it, NULL);
+        pack_t *addresses = (pack_t *)*trie_it_val(it);
+
+        if (addresses->len == 0) {
+            // Name with no address
+            trie_val_t *val = trie_get_ins(local_state->unresolved_names, (char *)dname, knot_dname_size(dname));
+            if (!*val) {
+                // that we encountered for the first time
+                *val = mm_alloc(mm, sizeof(struct iter_name_state));
+                memset(*val, 0, sizeof(struct iter_name_state));
+            }
+            (*(struct iter_name_state **)val)->generation = current_generation;
+        } else {
+            // We have some addresses to work with, let's iterate over them
+            for(uint8_t *obj = pack_head(*addresses); obj != pack_tail(*addresses); obj = pack_obj_next(obj)) {
+                uint8_t *address = (uint8_t *)pack_obj_val(obj);
+                size_t address_len = pack_obj_len(obj);
+                trie_val_t *val = trie_get_ins(local_state->addresses, (char *)address, address_len);
+                if (!*val) {
+                    // We have not seen this address before.
+                    *val = mm_alloc(mm, sizeof(struct address_state));
+                    memset(*val, 0, sizeof(struct address_state));
+                }
+                struct address_state *address_state = (*(struct address_state **)val);
+                address_state->generation = current_generation;
+                // Points at the key obtained from zonecut->nsset, not at a copy
+                address_state->name = dname;
+            }
+        }
+    }
+
+    trie_it_free(it);
+}
+
+/**
+ * Choose a transport for an iterative query.
+ *
+ * Brings the local state up to date with the query's zone cut and with the
+ * RTT cache, refreshes the per-address flags, then hands all addresses and
+ * unresolved names of the current generation to choose_transport().
+ *
+ * @param qry       query with iterative local state
+ * @param transport receives the chosen transport, or NULL when there is
+ *                  nothing suitable
+ */
+void iter_choose_transport(struct kr_query *qry, struct kr_transport **transport) {
+    struct knot_mm *mempool = qry->request->rplan.pool;
+    struct iter_local_state *local_state = (struct iter_local_state *)qry->server_selection.local_state;
+
+    iter_update_state_from_zonecut(local_state, &qry->zone_cut, mempool);
+    iter_update_state_from_rtt_cache(local_state, &qry->request->ctx->cache);
+
+    trie_it_t *it;
+    for(it = trie_it_begin(local_state->addresses); !trie_it_finished(it); trie_it_next(it)) {
+        size_t address_len;
+        uint8_t* address = (uint8_t *)trie_it_key(it, &address_len);
+
+        union inaddr tmp_address;
+        memset(&tmp_address, 0, sizeof(tmp_address)); // No indeterminate members reach the callbacks
+        bytes_to_ip(address, address_len, &tmp_address);
+
+        struct address_state *address_state = (struct address_state *)*trie_it_val(it);
+        check_tls_capable(address_state, qry->request, &tmp_address.ip);
+        check_tcp_connections(address_state, qry->request, &tmp_address.ip);
+        // May set generation to -1, which excludes the address below
+        check_network_settings(address_state, address_len, qry->flags.NO_IPV4, qry->flags.NO_IPV6);
+    }
+    trie_it_free(it);
+
+    // also take qry->flags.TCP into consideration (do that in the actual choosing function)
+
+    int num_addresses = trie_weight(local_state->addresses);
+    int num_unresolved_names = trie_weight(local_state->unresolved_names);
+
+    // Variable-length arrays must have at least one element, so an empty trie
+    // still gets a single (unused) slot. Some slots will get unused, oh well.
+    struct choice choices[num_addresses > 0 ? num_addresses : 1];
+    knot_dname_t *unresolved_names[num_unresolved_names > 0 ? num_unresolved_names : 1];
+
+    int valid_addresses = 0;
+    for(it = trie_it_begin(local_state->addresses); !trie_it_finished(it); trie_it_next(it)) {
+        size_t address_len;
+        uint8_t* address = (uint8_t *)trie_it_key(it, &address_len);
+        struct address_state *address_state = (struct address_state *)*trie_it_val(it);
+        if (address_state->generation == local_state->generation) {
+            choices[valid_addresses].address = address;
+            choices[valid_addresses].address_len = address_len;
+            choices[valid_addresses].address_state = address_state;
+            valid_addresses++;
+        }
+    }
+
+    trie_it_free(it);
+
+    int to_resolve = 0;
+    for(it = trie_it_begin(local_state->unresolved_names); !trie_it_finished(it); trie_it_next(it)) {
+        struct iter_name_state *name_state = *(struct iter_name_state **)trie_it_val(it);
+        if (name_state->generation == local_state->generation) {
+            knot_dname_t *name = (knot_dname_t *)trie_it_key(it, NULL);
+            unresolved_names[to_resolve++] = name;
+        }
+    }
+
+    trie_it_free(it);
+
+    if (valid_addresses || to_resolve) {
+        *transport = choose_transport(choices, valid_addresses, unresolved_names, to_resolve, mempool, qry->flags.TCP, NULL);
+    } else {
+        *transport = NULL;
+    }
+
+    WITH_VERBOSE(qry) {
+        KR_DNAME_GET_STR(zonecut_str, qry->zone_cut.name);
+        if (*transport) {
+            KR_DNAME_GET_STR(ns_name, (*transport)->name);
+            const char *ns_str = kr_straddr(&(*transport)->address.ip);
+            VERBOSE_MSG(qry,
+                        "=> id: '%05u' choosing: '%s'@'%s' zone cut: '%s'\n",
+                        qry->id, ns_name, ns_str ? ns_str : "", zonecut_str);
+        } else {
+            VERBOSE_MSG(qry,
+                        "=> id: '%05u' no suitable transport, zone cut: '%s'\n",
+                        qry->id, zonecut_str);
+        }
+    }
+}
+
+// Success callback of the iterative selection; nothing is recorded.
+void iter_success(struct kr_query *qry, const struct kr_transport *transport) {
+    (void)qry;
+    (void)transport;
+}
+
+void iter_error(struct kr_query *qry, const struct kr_transport *transport, enum kr_selection_error sel_error) {
+       struct iter_local_state *local_state = qry->server_selection.local_state;
+       struct address_state *addr_state = get_address_state(local_state, transport);
+       error(qry, addr_state, transport, sel_error);
+}
+
+// RTT callback of the iterative selection: applies the measured `rtt` to the
+// state of the address used by `transport`. A NULL transport is ignored, as
+// in forward_update_rtt(); so is an address without a known state.
+void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport, unsigned rtt) {
+    if (!transport) {
+        return;
+    }
+
+    struct iter_local_state *local_state = qry->server_selection.local_state;
+    struct address_state *addr_state = get_address_state(local_state, transport);
+    if (!addr_state) {
+        return;
+    }
+    update_rtt(qry, addr_state, transport, rtt);
+}
diff --git a/lib/selection_iter.h b/lib/selection_iter.h
new file mode 100644 (file)
index 0000000..4208b03
--- /dev/null
@@ -0,0 +1,13 @@
+/*  Copyright (C) 2014-2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+ *  SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "lib/selection.h"
+
+void iter_local_state_init(struct knot_mm *mm, void **local_state);
+void iter_choose_transport(struct kr_query *qry, struct kr_transport **transport);
+void iter_success(struct kr_query *qry, const struct kr_transport *transport);
+void iter_error(struct kr_query *qry, const struct kr_transport *transport, enum kr_selection_error sel_error);
+void iter_update_rtt(struct kr_query *qry, const struct kr_transport *transport, unsigned rtt);
\ No newline at end of file
index 97179409931658bbf93bf198884387c1d6e1c7db..56575576291e3b99d56e7898bae94338c918d196 100644 (file)
@@ -11,7 +11,7 @@
 #include "lib/defines.h"
 #include "lib/generic/array.h"
 #include "lib/module.h"
-#include "lib/nsrep.h"
+#include "lib/selection.h"
 #include "lib/resolve.h"
 
 #include <gnutls/gnutls.h>
index 3b017951294208a5c87719d68031cea5decb89ce..7bf6bbaeb7c64eb3ae0ffa5ae5b72e3a1cbaee2b 100644 (file)
@@ -117,6 +117,7 @@ static inline void free_const(const void *what)
        free((void *)what);
 }
 
+// Use this for allocations with mm.
 static inline void *mm_alloc(knot_mm_t *mm, size_t size)
 {
        if (mm) return mm->alloc(mm->ctx, size);
@@ -136,6 +137,7 @@ KR_EXPORT
 void *mm_realloc(knot_mm_t *mm, void *what, size_t size, size_t prev_size);
 
 /** Trivial malloc() wrapper. */
+// Use mm_alloc for allocations into mempool
 void *mm_malloc(void *ctx, size_t n);
 /** posix_memalign() wrapper. */
 void *mm_malloc_aligned(void *ctx, size_t n);
index 5839b9743d496cde98af372075b44a6e870fb845..7227bf9c0982f613e3e788e1a16a1bb4f9ef8ab0 100644 (file)
@@ -337,17 +337,8 @@ static addrset_info_t fetch_addr(pack_t *addrs, const knot_dname_t *ns, uint16_t
                                        (int)rd->len, (int)rrtype);
                        continue;
                }
-               /* Check RTT cache - whether the IP is usable or not. */
-               kr_nsrep_rtt_lru_entry_t *rtt_e = ctx->cache_rtt
-                       ? lru_get_try(ctx->cache_rtt, (const char *)rd->data, rd->len)
-                       : NULL;
-               const bool unusable = rtt_e && rtt_e->score >= KR_NS_TIMEOUT
-                       && qry->creation_time_mono
-                          < rtt_e->tout_timestamp + ctx->cache_rtt_tout_retry_interval;
-               if (!unusable) {
-                       result = AI_OK;
-                       ++usable_cnt;
-               }
+               result = AI_OK;
+               ++usable_cnt;
 
                ret = pack_obj_push(addrs, rd->data, rd->len);
                assert(!ret); /* didn't fit because of incorrectly reserved memory */
@@ -413,16 +404,10 @@ static int fetch_ns(struct kr_context *ctx, struct kr_zonecut *cut,
                pack_init(**pack);
 
                addrset_info_t infos[2];
+
                /* Fetch NS reputation and decide whether to prefetch A/AAAA records. */
-               unsigned *cached = lru_get_try(ctx->cache_rep,
-                                       (const char *)ns_name, ns_size);
-               unsigned reputation = (cached) ? *cached : 0;
-               infos[0] = (reputation & KR_NS_NOIP4) || qry->flags.NO_IPV4
-                       ? AI_REPUT
-                       : fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry);
-               infos[1] = (reputation & KR_NS_NOIP6) || qry->flags.NO_IPV6
-                       ? AI_REPUT
-                       : fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry);
+               infos[0] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_A, cut->pool, qry);
+               infos[1] = fetch_addr(*pack, ns_name, KNOT_RRTYPE_AAAA, cut->pool, qry);
 
                #if 0 /* rather unlikely to be useful unless changing some zcut code */
                WITH_VERBOSE(qry) {
index 070b2c08b6ba08702ca4bbf8c421addad849646a..2179bba54629312294b3cad4ef149da61430679d 100644 (file)
@@ -77,18 +77,14 @@ function policy.MIRROR(target)
 end
 
 -- Override the list of nameservers (forwarders)
-local function set_nslist(qry, list)
+local function set_nslist(req, list)
+       req.selection_context.forward_targets_num = #list
        local ns_i = 0
        for _, ns in ipairs(list) do
-               -- kr_nsrep_set() can return kr_error(ENOENT), it's OK
-               if ffi.C.kr_nsrep_set(qry, ns_i, ns) == 0 then
+               if ffi.C.kr_forward_add_target(req, ns_i, ns) == 0 then
                        ns_i = ns_i + 1
                end
        end
-       -- If less than maximum NSs, insert guard to terminate the list
-       if ns_i < 3 then
-               assert(ffi.C.kr_nsrep_set(qry, ns_i, nil) == 0);
-       end
        if ns_i == 0 then
                -- would use assert() but don't want to compose the message if not triggered
                error('no usable address in NS set (check net.ipv4 and '
@@ -102,7 +98,6 @@ function policy.STUB(target)
        if type(target) == 'table' then
                for _, v in pairs(target) do
                        table.insert(list, addr2sock(v, 53))
-                       assert(#list <= 4, 'at most 4 STUB targets are supported')
                end
        else
                table.insert(list, addr2sock(target, 53))
@@ -112,7 +107,7 @@ function policy.STUB(target)
                -- Switch mode to stub resolver, do not track origin zone cut since it's not real authority NS
                qry.flags.STUB = true
                qry.flags.ALWAYS_CUT = false
-               set_nslist(qry, list)
+               set_nslist(req, list)
                return state
        end
 end
@@ -123,7 +118,6 @@ function policy.FORWARD(target)
        if type(target) == 'table' then
                for _, v in pairs(target) do
                        table.insert(list, addr2sock(v, 53))
-                       assert(#list <= 4, 'at most 4 FORWARD targets are supported')
                end
        else
                table.insert(list, addr2sock(target, 53))
@@ -136,7 +130,7 @@ function policy.FORWARD(target)
                qry.flags.ALWAYS_CUT = false
                qry.flags.NO_MINIMIZE = true
                qry.flags.AWAIT_CUT = true
-               set_nslist(qry, list)
+               set_nslist(req, list)
                return state
        end
 end
index 6997cbb0d00898b1976c1d40e72dcca05f72779e..b4fd5d7f489185866518dfecb6e04c6758184f48 100644 (file)
@@ -147,7 +147,7 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt)
 {
        struct kr_request *req = ctx->req;
        struct kr_query *qry = req->current_query;
-       if (qry->flags.CACHED || !req->upstream.addr) {
+       if (qry->flags.CACHED || !req->upstream.transport) {
                return ctx->state;
        }
 
@@ -158,7 +158,7 @@ static int collect_rtt(kr_layer_t *ctx, knot_pkt_t *pkt)
        /* Socket address is encoded into sockaddr_in6 struct that
         * unions with sockaddr_in and differ in sa_family */
        struct sockaddr_in6 *e = &data->upstreams.q.at[data->upstreams.head];
-       const struct sockaddr *src = req->upstream.addr;
+       const struct sockaddr *src = &req->upstream.transport->address.ip;
        switch (src->sa_family) {
        case AF_INET:  memcpy(e, src, sizeof(struct sockaddr_in)); break;
        case AF_INET6: memcpy(e, src, sizeof(struct sockaddr_in6)); break;