]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
daemon/worker: fixes for handling of some non-fatal TLS errors, metrics
authorMarek Vavruša <mvavrusa@cloudflare.com>
Fri, 3 Aug 2018 22:04:45 +0000 (15:04 -0700)
committerMarek Vavruša <mvavrusa@cloudflare.com>
Fri, 7 Sep 2018 17:45:21 +0000 (10:45 -0700)
The handshake now properly deals with GNUTLS_E_INTERRUPTED to retry,
and GNUTLS_E_WARNING_ALERT_RECEIVED and GNUTLS_E_GOT_APPLICATION_DATA
during session resumption.

Added a metric for monitoring TLS handshake errors.

Added `net.tls_handshake_timeout([milliseconds])` for configurable
TLS handshake timeout (default is 6000ms), and documentation for
`net.tcp_in_idle([milliseconds])`.

daemon/README.rst
daemon/bindings.c
daemon/network.c
daemon/network.h
daemon/tls.c
daemon/worker.c
daemon/worker.h

index 87c2df0d05e80fb8e48e2ac97932c091857dc2f9..82b4bfb85d099d2587f0e7f805326c10564983df 100644 (file)
@@ -491,6 +491,27 @@ For when listening on ``localhost`` just doesn't cut it.
    Get/set the IPv4 address used to perform queries.  There is also ``net.outgoing_v6`` for IPv6.
    The default is ``nil``, which lets the OS choose any address.
 
+.. function:: net.tcp_in_idle([milliseconds])
+
+   Set TCP idle connection timeout in milliseconds. This is the limit for how long the connections to upstreams are kept open when idle. Default is 10000.
+
+   .. code-block:: lua
+
+      > net.tcp_in_idle()
+      10000
+      > net.tcp_in_idle(5000)
+      true
+
+.. function:: net.tls_handshake_timeout([milliseconds])
+
+   Set TLS handshake timeout in milliseconds. It should be set to accomodate 2-3 roundtrips to upstreams for TLS1.2. Default is 6000.
+
+   .. code-block:: lua
+
+      > net.tls_handshake_timeout()
+      6000
+      > net.tls_handshake_timeout(2000)
+      true
 
 .. _tls-server-config:
 
index 07969933155b4e3574b55884c8874a0ddc70bee9..338157ec54545fca6ac18c0c04a48705223d104f 100644 (file)
@@ -833,37 +833,53 @@ static int net_outgoing(lua_State *L, int family)
 static int net_outgoing_v4(lua_State *L) { return net_outgoing(L, AF_INET); }
 static int net_outgoing_v6(lua_State *L) { return net_outgoing(L, AF_INET6); }
 
-static int net_tcp_in_idle(lua_State *L)
+static int net_update_timeout(lua_State *L, uint64_t *timeout, const char *name)
 {
-       struct engine *engine = engine_luaget(L);
-       struct network *net = &engine->net;
-
        /* Only return current idle timeout. */
        if (lua_gettop(L) == 0) {
-               lua_pushnumber(L, net->tcp.in_idle_timeout);
+               lua_pushnumber(L, *timeout);
                return 1;
        }
 
        if ((lua_gettop(L) != 1)) {
-               lua_pushstring(L, "net.tcp_in_idle takes one parameter: (\"idle timeout\")");
+               lua_pushstring(L, name);
+               lua_pushstring(L, " takes one parameter: (\"idle timeout\")");
                lua_error(L);
        }
 
        if (lua_isnumber(L, 1)) {
                int idle_timeout = lua_tointeger(L, 1);
                if (idle_timeout <= 0) {
-                       lua_pushstring(L, "net.tcp_in_idle parameter has to be positive number");
+                       lua_pushstring(L, name);
+                       lua_pushstring(L, " parameter has to be positive number");
                        lua_error(L);
                }
-               net->tcp.in_idle_timeout = idle_timeout;
+               *timeout = idle_timeout;
        } else {
-               lua_pushstring(L, "net.tcp_in_idle parameter has to be positive number");
+               lua_pushstring(L, name);
+               lua_pushstring(L, " parameter has to be positive number");
                lua_error(L);
        }
        lua_pushboolean(L, true);
        return 1;
 }
 
+static int net_tcp_in_idle(lua_State *L)
+{
+       struct engine *engine = engine_luaget(L);
+       struct network *net = &engine->net;
+
+       return net_update_timeout(L, &net->tcp.in_idle_timeout, "net.tcp_in_idle");
+}
+
+static int net_tls_handshake_timeout(lua_State *L)
+{
+       struct engine *engine = engine_luaget(L);
+       struct network *net = &engine->net;
+
+       return net_update_timeout(L, &net->tcp.tls_handshake_timeout, "net.tls_handshake_timeout");
+}
+
 int lib_net(lua_State *L)
 {
        static const luaL_Reg lib[] = {
@@ -883,6 +899,7 @@ int lib_net(lua_State *L)
                { "outgoing_v4",  net_outgoing_v4 },
                { "outgoing_v6",  net_outgoing_v6 },
                { "tcp_in_idle",  net_tcp_in_idle },
+               { "tls_handshake_timeout",  net_tls_handshake_timeout },
                { NULL, NULL }
        };
        register_lib(L, "net", lib);
@@ -1733,6 +1750,8 @@ static int wrk_stats(lua_State *L)
        lua_setfield(L, -2, "dropped");
        lua_pushnumber(L, worker->stats.timeout);
        lua_setfield(L, -2, "timeout");
+       lua_pushnumber(L, worker->stats.handshake_errors);
+       lua_setfield(L, -2, "handshake_errors");
        /* Add subset of rusage that represents counters. */
        uv_rusage_t rusage;
        if (uv_getrusage(&rusage) == 0) {
index 456c71701a0748da894fef8c642abecf1dc012b4..9a131cdbdc9861b8729a7ed21c0f237102f6b385 100644 (file)
@@ -57,6 +57,7 @@ void network_init(struct network *net, uv_loop_t *loop, int tcp_backlog)
                net->tcp.in_idle_timeout = 10000;
                net->tcp_backlog = tcp_backlog;
                net->tcp.in_idle_timeout = 10000;
+               net->tcp.tls_handshake_timeout = TLS_MAX_HANDSHAKE_TIME;
        }
 }
 
index 548d61a23ea13e24c157615036366c1f9571e75e..dbaf826afe73fac242689d10e12cefe832f72390 100644 (file)
@@ -44,6 +44,7 @@ typedef array_t(struct endpoint*) endpoint_array_t;
 
 struct net_tcp_param {
        uint64_t in_idle_timeout;
+    uint64_t tls_handshake_timeout;
 };
 
 struct tls_session_ticket_ctx;
index 6fd5b1dd5deeebb552692c943790875361975791..8e6c52f1348f81c5333e40f1de43bd55dd01c322 100644 (file)
@@ -90,6 +90,46 @@ static ssize_t kres_gnutls_pull(gnutls_transport_ptr_t h, void *buf, size_t len)
        return transfer;
 }
 
+/** Perform TLS handshake and handle error codes according to the documentation.
+  * See See https://gnutls.org/manual/html_node/TLS-handshake.html#TLS-handshake
+  * The function returns kr_ok() or success or non fatal error, kr_error(EAGAIN) on blocking, or kr_error(EIO) on fatal error.
+  */
+static int tls_handshake(struct tls_common_ctx *ctx, tls_handshake_cb handshake_cb) {
+       struct session *session = ctx->session;
+       const char *logstring = ctx->client_side ? client_logstring : server_logstring;
+
+       int err = gnutls_handshake(ctx->tls_session);
+       if (err == GNUTLS_E_SUCCESS) {
+               /* Handshake finished, return success */
+               ctx->handshake_state = TLS_HS_DONE;
+               kr_log_verbose("[%s] TLS handshake with %s has completed\n",
+                              logstring,  kr_straddr(&session->peer.ip));
+               if (handshake_cb) {
+                       handshake_cb(session, 0);
+               }
+       } else if (err == GNUTLS_E_AGAIN) {
+               return kr_error(EAGAIN);
+       } else if (gnutls_error_is_fatal(err)) {
+               /* Fatal errors, return error as it's not recoverable */
+               kr_log_verbose("[%s] gnutls_handshake failed: %s (%d)\n",
+                            logstring,
+                            gnutls_strerror_name(err), err);
+               if (handshake_cb) {
+                       handshake_cb(session, -1);
+               }
+               return kr_error(EIO);
+       } else if (err == GNUTLS_E_WARNING_ALERT_RECEIVED) {
+               /* Handle warning when in verbose mode */
+               const char *alert_name = gnutls_alert_get_name(gnutls_alert_get(ctx->tls_session));
+               if (alert_name != NULL) {
+                       kr_log_verbose("[%s] TLS alert from %s received: %s\n",
+                                      logstring, kr_straddr(&session->peer.ip), alert_name);
+               }
+       }
+       return kr_ok();
+}
+
+
 struct tls_ctx_t *tls_new(struct worker_ctx *worker)
 {
        assert(worker != NULL);
@@ -281,58 +321,48 @@ int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *b
        tls_p->nread = nread >= 0 ? nread : 0;
        tls_p->consumed = 0;
 
-       /* Ensure TLS handshake is performed before receiving data. */
-       while (tls_p->handshake_state == TLS_HS_IN_PROGRESS) {
-               int err = gnutls_handshake(tls_p->tls_session);
-               if (err == GNUTLS_E_SUCCESS) {
-                       tls_p->handshake_state = TLS_HS_DONE;
-                       kr_log_verbose("[%s] TLS handshake with %s has completed\n",
-                                      logstring,  kr_straddr(&session->peer.ip));
-                       if (tls_p->handshake_cb) {
-                               tls_p->handshake_cb(tls_p->session, 0);
-                       }
-               } else if (err == GNUTLS_E_AGAIN) {
-                       return 0;
-               } else if (gnutls_error_is_fatal(err)) {
-                       kr_log_verbose("[%s] gnutls_handshake failed: %s (%d)\n",
-                                    logstring,
-                                    gnutls_strerror_name(err), err);
-                       if (tls_p->handshake_cb) {
-                               tls_p->handshake_cb(tls_p->session, -1);
-                       }
-                       return kr_error(err);
+       /* Ensure TLS handshake is performed before receiving data.
+        * See https://www.gnutls.org/manual/html_node/TLS-handshake.html */
+       while (tls_p->handshake_state <= TLS_HS_IN_PROGRESS) {
+               int err = tls_handshake(tls_p, tls_p->handshake_cb);
+               if (err == kr_error(EAGAIN)) {
+                       return 0; /* Wait for more data */
+               } else if (err != kr_ok()) {
+                       return err;
                }
        }
 
+       /* See https://gnutls.org/manual/html_node/Data-transfer-and-termination.html#Data-transfer-and-termination */
        int submitted = 0;
        bool is_retrying = false;
        uint64_t retrying_start = 0;
        while (true) {
                ssize_t count = gnutls_record_recv(tls_p->tls_session, tls_p->recv_buf, sizeof(tls_p->recv_buf));
                if (count == GNUTLS_E_AGAIN) {
-                       break;    /* No data available */
-               } else if (count == GNUTLS_E_INTERRUPTED ||
-                          count == GNUTLS_E_REHANDSHAKE) {
-                       if (!is_retrying) {
-                               is_retrying = true;
-                               retrying_start = kr_now();
-                       }
-                       uint64_t elapsed = kr_now() - retrying_start;
-                       if (elapsed > TLS_MAX_HANDSHAKE_TIME) {
-                               return kr_error(EIO);
-                       }
-                       continue; /* Try reading again */
+                       break; /* No data available */
                } else if (count < 0) {
+                       /* Retry on non-fatal errors (alerts, rehandshake) */
+                       if (!gnutls_error_is_fatal(count)) {
+                               if (!is_retrying) {
+                                       is_retrying = true;
+                                       retrying_start = kr_now();
+                               }
+                               uint64_t elapsed = kr_now() - retrying_start;
+                               if (elapsed < TLS_MAX_HANDSHAKE_TIME) {
+                                       continue; /* Try reading again */
+                               }
+                       }
                        kr_log_verbose("[%s] gnutls_record_recv failed: %s (%zd)\n",
                                     logstring, gnutls_strerror_name(count), count);
-                       return kr_error(EIO);
+                       /* Propagate errors to lower layer */
+                       count = kr_error(EIO);
                }
                DEBUG_MSG("[%s] submitting %zd data to worker\n", logstring, count);
                int ret = worker_process_tcp(worker, handle, tls_p->recv_buf, count);
                if (ret < 0) {
                        return ret;
                }
-               if (count == 0) {
+               if (count <= 0) {
                        break;
                }
                submitted += ret;
@@ -983,7 +1013,7 @@ int tls_client_connect_start(struct tls_client_ctx_t *client_ctx,
        struct tls_common_ctx *ctx = &client_ctx->c;
 
        gnutls_session_set_ptr(ctx->tls_session, client_ctx);
-       gnutls_handshake_set_timeout(ctx->tls_session, KR_CONN_RTT_MAX * 3);
+       gnutls_handshake_set_timeout(ctx->tls_session, ctx->worker->engine->net.tcp.tls_handshake_timeout);
        session->tls_client_ctx = client_ctx;
        ctx->handshake_cb = handshake_cb;
        ctx->handshake_state = TLS_HS_IN_PROGRESS;
@@ -995,14 +1025,15 @@ int tls_client_connect_start(struct tls_client_ctx_t *client_ctx,
                                        tls_params->session_data.size);
        }
 
-       int ret = gnutls_handshake(ctx->tls_session);
-       if (ret == GNUTLS_E_SUCCESS) {
-               return kr_ok();
-       } else if (gnutls_error_is_fatal(ret) != 0) {
-               kr_log_verbose("[tls_client] handshake failed (%s)\n", gnutls_strerror(ret));
-               return kr_error(ECONNABORTED);
+       /* See https://www.gnutls.org/manual/html_node/Asynchronous-operation.html */
+       while (ctx->handshake_state <= TLS_HS_IN_PROGRESS) {
+               /* Don't pass the handshake callback as the connection isn't registered yet. */
+               int ret = tls_handshake(ctx, NULL);
+               if (ret != kr_ok()) {
+                       return ret;
+               }
        }
-       return kr_error(EAGAIN);
+       return kr_ok();
 }
 
 tls_hs_state_t tls_get_hs_state(const struct tls_common_ctx *ctx)
index f0f89a122cb65b3fa77fe350a84a32c80e0b2ceb..cdd2ae24bdd59e2fa07950f0e359a9e26d10a5a1 100644 (file)
@@ -1184,7 +1184,7 @@ static void on_connect(uv_connect_t *req, int status)
                        array_del(session->waiting, 0);
                        assert(task->refs > 1);
                        qr_task_unref(task);
-                       qr_task_step(task, NULL, NULL);
+                       qr_task_step(task, &peer->ip, NULL);
                }
                assert(session->tasks.len == 0);
                iorequest_release(worker, req);
@@ -1247,8 +1247,12 @@ static void on_connect(uv_connect_t *req, int status)
                }
        }
 
+       /* Either handshake or sending data has failed */
        while (session->waiting.len > 0) {
                struct qr_task *task = session->waiting.at[0];
+               worker->stats.handshake_errors += 1;
+               /* Notify resolver of the outgoing query timeout, as there's no further step */
+               kr_resolve_consume(&task->ctx->req, &session->peer.ip, NULL);
                session_del_tasks(session, task);
                array_del(session->waiting, 0);
                ioreq_kill_pending(task);
@@ -1295,7 +1299,7 @@ static void on_tcp_connect_timeout(uv_timer_t *timer)
                array_del(session->waiting, 0);
                assert(task->refs > 1);
                qr_task_unref(task);
-               qr_task_step(task, NULL, NULL);
+               qr_task_step(task, &peer->ip, NULL);
        }
 
        assert (session->tasks.len == 0);
@@ -1319,6 +1323,8 @@ static void on_tcp_watchdog_timeout(uv_timer_t *timer)
                        struct qr_task *task = session->waiting.at[0];
                        task->timeouts += 1;
                        worker->stats.timeout += 1;
+                       /* Notify resolver of the outgoing query timeout, as there's no further step */
+                       kr_resolve_consume(&task->ctx->req, &session->peer.ip, NULL);
                        array_del(session->waiting, 0);
                        session_del_tasks(session, task);
                        ioreq_kill_pending(task);
@@ -1362,7 +1368,7 @@ static void on_udp_timeout(uv_timer_t *timer)
                for (uint16_t i = 0; i < MIN(task->pending_count, KR_NSREP_MAXADDR); ++i) {
                        struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]);
                        if (choice->sa_family == AF_UNSPEC) {
-                               break;
+                               continue;
                        }
                        WITH_VERBOSE(qry) {
                                char addr_str[INET6_ADDRSTRLEN];
@@ -1376,7 +1382,7 @@ static void on_udp_timeout(uv_timer_t *timer)
        }
        task->timeouts += 1;
        worker->stats.timeout += 1;
-       qr_task_step(task, NULL, NULL);
+       qr_task_step(task, &session->peer.ip, NULL);
 }
 
 static void on_session_idle_timeout(uv_timer_t *timer)
@@ -1876,7 +1882,7 @@ static int qr_task_step(struct qr_task *task,
                                worker_del_tcp_waiting(ctx->worker, addr);
                                iorequest_release(ctx->worker, conn);
                                subreq_finalize(task, packet_source, packet);
-                               return qr_task_step(task, NULL, NULL);
+                               return qr_task_step(task, &session->peer.ip, NULL);
                        }
                }
        }
index 0d1bcc20825ff82ef65f2ab06c803f5e5170208e..1bd90706f4bee027370947bc339210bf83af3755 100644 (file)
@@ -145,6 +145,7 @@ struct worker_ctx {
                size_t queries;
                size_t dropped;
                size_t timeout;
+               size_t handshake_errors;
        } stats;
 
        struct zone_import_ctx* z_import;