From 64e7575dcc2399f49dd6b681dc55632468e8914c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Vavrus=CC=8Ca?= Date: Fri, 3 Aug 2018 15:04:45 -0700 Subject: [PATCH] daemon/worker: fixes for handling of some non-fatal TLS errors, metrics The handshake now properly deals with GNUTLS_E_INTERRUPTED to retry, and GNUTLS_E_WARNING_ALERT_RECEIVED and GNUTLS_E_GOT_APPLICATION_DATA during session resumption. Added a metric for monitoring TLS handshake errors. Added `net.tls_handshake_timeout([milliseconds])` for configurable TLS handshake timeout (default is 6000ms), and documentation for `net.tcp_in_idle([milliseconds])`. --- daemon/README.rst | 21 +++++++++ daemon/bindings.c | 37 +++++++++++---- daemon/network.c | 1 + daemon/network.h | 1 + daemon/tls.c | 115 +++++++++++++++++++++++++++++----------------- daemon/worker.c | 16 +++++-- daemon/worker.h | 1 + 7 files changed, 136 insertions(+), 56 deletions(-) diff --git a/daemon/README.rst b/daemon/README.rst index 87c2df0d0..82b4bfb85 100644 --- a/daemon/README.rst +++ b/daemon/README.rst @@ -491,6 +491,27 @@ For when listening on ``localhost`` just doesn't cut it. Get/set the IPv4 address used to perform queries. There is also ``net.outgoing_v6`` for IPv6. The default is ``nil``, which lets the OS choose any address. +.. function:: net.tcp_in_idle([milliseconds]) + + Set TCP idle connection timeout in milliseconds. This is the limit for how long the connections to upstreams are kept open when idle. Default is 10000. + + .. code-block:: lua + + > net.tcp_in_idle() + 10000 + > net.tcp_in_idle(5000) + true + +.. function:: net.tls_handshake_timeout([milliseconds]) + + Set TLS handshake timeout in milliseconds. It should be set to accomodate 2-3 roundtrips to upstreams for TLS1.2. Default is 6000. + + .. code-block:: lua + + > net.tls_handshake_timeout() + 6000 + > net.tls_handshake_timeout(2000) + true .. _tls-server-config: diff --git a/daemon/bindings.c b/daemon/bindings.c index 079699331..338157ec5 100644 --- a/daemon/bindings.c +++ b/daemon/bindings.c @@ -833,37 +833,53 @@ static int net_outgoing(lua_State *L, int family) static int net_outgoing_v4(lua_State *L) { return net_outgoing(L, AF_INET); } static int net_outgoing_v6(lua_State *L) { return net_outgoing(L, AF_INET6); } -static int net_tcp_in_idle(lua_State *L) +static int net_update_timeout(lua_State *L, uint64_t *timeout, const char *name) { - struct engine *engine = engine_luaget(L); - struct network *net = &engine->net; - /* Only return current idle timeout. */ if (lua_gettop(L) == 0) { - lua_pushnumber(L, net->tcp.in_idle_timeout); + lua_pushnumber(L, *timeout); return 1; } if ((lua_gettop(L) != 1)) { - lua_pushstring(L, "net.tcp_in_idle takes one parameter: (\"idle timeout\")"); + lua_pushstring(L, name); + lua_pushstring(L, " takes one parameter: (\"idle timeout\")"); lua_error(L); } if (lua_isnumber(L, 1)) { int idle_timeout = lua_tointeger(L, 1); if (idle_timeout <= 0) { - lua_pushstring(L, "net.tcp_in_idle parameter has to be positive number"); + lua_pushstring(L, name); + lua_pushstring(L, " parameter has to be positive number"); lua_error(L); } - net->tcp.in_idle_timeout = idle_timeout; + *timeout = idle_timeout; } else { - lua_pushstring(L, "net.tcp_in_idle parameter has to be positive number"); + lua_pushstring(L, name); + lua_pushstring(L, " parameter has to be positive number"); lua_error(L); } lua_pushboolean(L, true); return 1; } +static int net_tcp_in_idle(lua_State *L) +{ + struct engine *engine = engine_luaget(L); + struct network *net = &engine->net; + + return net_update_timeout(L, &net->tcp.in_idle_timeout, "net.tcp_in_idle"); +} + +static int net_tls_handshake_timeout(lua_State *L) +{ + struct engine *engine = engine_luaget(L); + struct network *net = &engine->net; + + return net_update_timeout(L, &net->tcp.tls_handshake_timeout, "net.tls_handshake_timeout"); +} + int lib_net(lua_State *L) { static const luaL_Reg lib[] = { @@ -883,6 +899,7 @@ int lib_net(lua_State *L) { "outgoing_v4", net_outgoing_v4 }, { "outgoing_v6", net_outgoing_v6 }, { "tcp_in_idle", net_tcp_in_idle }, + { "tls_handshake_timeout", net_tls_handshake_timeout }, { NULL, NULL } }; register_lib(L, "net", lib); @@ -1733,6 +1750,8 @@ static int wrk_stats(lua_State *L) lua_setfield(L, -2, "dropped"); lua_pushnumber(L, worker->stats.timeout); lua_setfield(L, -2, "timeout"); + lua_pushnumber(L, worker->stats.handshake_errors); + lua_setfield(L, -2, "handshake_errors"); /* Add subset of rusage that represents counters. */ uv_rusage_t rusage; if (uv_getrusage(&rusage) == 0) { diff --git a/daemon/network.c b/daemon/network.c index 456c71701..9a131cdbd 100644 --- a/daemon/network.c +++ b/daemon/network.c @@ -57,6 +57,7 @@ void network_init(struct network *net, uv_loop_t *loop, int tcp_backlog) net->tcp.in_idle_timeout = 10000; net->tcp_backlog = tcp_backlog; net->tcp.in_idle_timeout = 10000; + net->tcp.tls_handshake_timeout = TLS_MAX_HANDSHAKE_TIME; } } diff --git a/daemon/network.h b/daemon/network.h index 548d61a23..dbaf826af 100644 --- a/daemon/network.h +++ b/daemon/network.h @@ -44,6 +44,7 @@ typedef array_t(struct endpoint*) endpoint_array_t; struct net_tcp_param { uint64_t in_idle_timeout; + uint64_t tls_handshake_timeout; }; struct tls_session_ticket_ctx; diff --git a/daemon/tls.c b/daemon/tls.c index 6fd5b1dd5..8e6c52f13 100644 --- a/daemon/tls.c +++ b/daemon/tls.c @@ -90,6 +90,46 @@ static ssize_t kres_gnutls_pull(gnutls_transport_ptr_t h, void *buf, size_t len) return transfer; } +/** Perform TLS handshake and handle error codes according to the documentation. + * See See https://gnutls.org/manual/html_node/TLS-handshake.html#TLS-handshake + * The function returns kr_ok() or success or non fatal error, kr_error(EAGAIN) on blocking, or kr_error(EIO) on fatal error. + */ +static int tls_handshake(struct tls_common_ctx *ctx, tls_handshake_cb handshake_cb) { + struct session *session = ctx->session; + const char *logstring = ctx->client_side ? client_logstring : server_logstring; + + int err = gnutls_handshake(ctx->tls_session); + if (err == GNUTLS_E_SUCCESS) { + /* Handshake finished, return success */ + ctx->handshake_state = TLS_HS_DONE; + kr_log_verbose("[%s] TLS handshake with %s has completed\n", + logstring, kr_straddr(&session->peer.ip)); + if (handshake_cb) { + handshake_cb(session, 0); + } + } else if (err == GNUTLS_E_AGAIN) { + return kr_error(EAGAIN); + } else if (gnutls_error_is_fatal(err)) { + /* Fatal errors, return error as it's not recoverable */ + kr_log_verbose("[%s] gnutls_handshake failed: %s (%d)\n", + logstring, + gnutls_strerror_name(err), err); + if (handshake_cb) { + handshake_cb(session, -1); + } + return kr_error(EIO); + } else if (err == GNUTLS_E_WARNING_ALERT_RECEIVED) { + /* Handle warning when in verbose mode */ + const char *alert_name = gnutls_alert_get_name(gnutls_alert_get(ctx->tls_session)); + if (alert_name != NULL) { + kr_log_verbose("[%s] TLS alert from %s received: %s\n", + logstring, kr_straddr(&session->peer.ip), alert_name); + } + } + return kr_ok(); +} + + struct tls_ctx_t *tls_new(struct worker_ctx *worker) { assert(worker != NULL); @@ -281,58 +321,48 @@ int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *b tls_p->nread = nread >= 0 ? nread : 0; tls_p->consumed = 0; - /* Ensure TLS handshake is performed before receiving data. */ - while (tls_p->handshake_state == TLS_HS_IN_PROGRESS) { - int err = gnutls_handshake(tls_p->tls_session); - if (err == GNUTLS_E_SUCCESS) { - tls_p->handshake_state = TLS_HS_DONE; - kr_log_verbose("[%s] TLS handshake with %s has completed\n", - logstring, kr_straddr(&session->peer.ip)); - if (tls_p->handshake_cb) { - tls_p->handshake_cb(tls_p->session, 0); - } - } else if (err == GNUTLS_E_AGAIN) { - return 0; - } else if (gnutls_error_is_fatal(err)) { - kr_log_verbose("[%s] gnutls_handshake failed: %s (%d)\n", - logstring, - gnutls_strerror_name(err), err); - if (tls_p->handshake_cb) { - tls_p->handshake_cb(tls_p->session, -1); - } - return kr_error(err); + /* Ensure TLS handshake is performed before receiving data. + * See https://www.gnutls.org/manual/html_node/TLS-handshake.html */ + while (tls_p->handshake_state <= TLS_HS_IN_PROGRESS) { + int err = tls_handshake(tls_p, tls_p->handshake_cb); + if (err == kr_error(EAGAIN)) { + return 0; /* Wait for more data */ + } else if (err != kr_ok()) { + return err; } } + /* See https://gnutls.org/manual/html_node/Data-transfer-and-termination.html#Data-transfer-and-termination */ int submitted = 0; bool is_retrying = false; uint64_t retrying_start = 0; while (true) { ssize_t count = gnutls_record_recv(tls_p->tls_session, tls_p->recv_buf, sizeof(tls_p->recv_buf)); if (count == GNUTLS_E_AGAIN) { - break; /* No data available */ - } else if (count == GNUTLS_E_INTERRUPTED || - count == GNUTLS_E_REHANDSHAKE) { - if (!is_retrying) { - is_retrying = true; - retrying_start = kr_now(); - } - uint64_t elapsed = kr_now() - retrying_start; - if (elapsed > TLS_MAX_HANDSHAKE_TIME) { - return kr_error(EIO); - } - continue; /* Try reading again */ + break; /* No data available */ } else if (count < 0) { + /* Retry on non-fatal errors (alerts, rehandshake) */ + if (!gnutls_error_is_fatal(count)) { + if (!is_retrying) { + is_retrying = true; + retrying_start = kr_now(); + } + uint64_t elapsed = kr_now() - retrying_start; + if (elapsed < TLS_MAX_HANDSHAKE_TIME) { + continue; /* Try reading again */ + } + } kr_log_verbose("[%s] gnutls_record_recv failed: %s (%zd)\n", logstring, gnutls_strerror_name(count), count); - return kr_error(EIO); + /* Propagate errors to lower layer */ + count = kr_error(EIO); } DEBUG_MSG("[%s] submitting %zd data to worker\n", logstring, count); int ret = worker_process_tcp(worker, handle, tls_p->recv_buf, count); if (ret < 0) { return ret; } - if (count == 0) { + if (count <= 0) { break; } submitted += ret; @@ -983,7 +1013,7 @@ int tls_client_connect_start(struct tls_client_ctx_t *client_ctx, struct tls_common_ctx *ctx = &client_ctx->c; gnutls_session_set_ptr(ctx->tls_session, client_ctx); - gnutls_handshake_set_timeout(ctx->tls_session, KR_CONN_RTT_MAX * 3); + gnutls_handshake_set_timeout(ctx->tls_session, ctx->worker->engine->net.tcp.tls_handshake_timeout); session->tls_client_ctx = client_ctx; ctx->handshake_cb = handshake_cb; ctx->handshake_state = TLS_HS_IN_PROGRESS; @@ -995,14 +1025,15 @@ int tls_client_connect_start(struct tls_client_ctx_t *client_ctx, tls_params->session_data.size); } - int ret = gnutls_handshake(ctx->tls_session); - if (ret == GNUTLS_E_SUCCESS) { - return kr_ok(); - } else if (gnutls_error_is_fatal(ret) != 0) { - kr_log_verbose("[tls_client] handshake failed (%s)\n", gnutls_strerror(ret)); - return kr_error(ECONNABORTED); + /* See https://www.gnutls.org/manual/html_node/Asynchronous-operation.html */ + while (ctx->handshake_state <= TLS_HS_IN_PROGRESS) { + /* Don't pass the handshake callback as the connection isn't registered yet. */ + int ret = tls_handshake(ctx, NULL); + if (ret != kr_ok()) { + return ret; + } } - return kr_error(EAGAIN); + return kr_ok(); } tls_hs_state_t tls_get_hs_state(const struct tls_common_ctx *ctx) diff --git a/daemon/worker.c b/daemon/worker.c index f0f89a122..cdd2ae24b 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -1184,7 +1184,7 @@ static void on_connect(uv_connect_t *req, int status) array_del(session->waiting, 0); assert(task->refs > 1); qr_task_unref(task); - qr_task_step(task, NULL, NULL); + qr_task_step(task, &peer->ip, NULL); } assert(session->tasks.len == 0); iorequest_release(worker, req); @@ -1247,8 +1247,12 @@ static void on_connect(uv_connect_t *req, int status) } } + /* Either handshake or sending data has failed */ while (session->waiting.len > 0) { struct qr_task *task = session->waiting.at[0]; + worker->stats.handshake_errors += 1; + /* Notify resolver of the outgoing query timeout, as there's no further step */ + kr_resolve_consume(&task->ctx->req, &session->peer.ip, NULL); session_del_tasks(session, task); array_del(session->waiting, 0); ioreq_kill_pending(task); @@ -1295,7 +1299,7 @@ static void on_tcp_connect_timeout(uv_timer_t *timer) array_del(session->waiting, 0); assert(task->refs > 1); qr_task_unref(task); - qr_task_step(task, NULL, NULL); + qr_task_step(task, &peer->ip, NULL); } assert (session->tasks.len == 0); @@ -1319,6 +1323,8 @@ static void on_tcp_watchdog_timeout(uv_timer_t *timer) struct qr_task *task = session->waiting.at[0]; task->timeouts += 1; worker->stats.timeout += 1; + /* Notify resolver of the outgoing query timeout, as there's no further step */ + kr_resolve_consume(&task->ctx->req, &session->peer.ip, NULL); array_del(session->waiting, 0); session_del_tasks(session, task); ioreq_kill_pending(task); @@ -1362,7 +1368,7 @@ static void on_udp_timeout(uv_timer_t *timer) for (uint16_t i = 0; i < MIN(task->pending_count, KR_NSREP_MAXADDR); ++i) { struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]); if (choice->sa_family == AF_UNSPEC) { - break; + continue; } WITH_VERBOSE(qry) { char addr_str[INET6_ADDRSTRLEN]; @@ -1376,7 +1382,7 @@ static void on_udp_timeout(uv_timer_t *timer) } task->timeouts += 1; worker->stats.timeout += 1; - qr_task_step(task, NULL, NULL); + qr_task_step(task, &session->peer.ip, NULL); } static void on_session_idle_timeout(uv_timer_t *timer) @@ -1876,7 +1882,7 @@ static int qr_task_step(struct qr_task *task, worker_del_tcp_waiting(ctx->worker, addr); iorequest_release(ctx->worker, conn); subreq_finalize(task, packet_source, packet); - return qr_task_step(task, NULL, NULL); + return qr_task_step(task, &session->peer.ip, NULL); } } } diff --git a/daemon/worker.h b/daemon/worker.h index 0d1bcc208..1bd90706f 100644 --- a/daemon/worker.h +++ b/daemon/worker.h @@ -145,6 +145,7 @@ struct worker_ctx { size_t queries; size_t dropped; size_t timeout; + size_t handshake_errors; } stats; struct zone_import_ctx* z_import; -- 2.47.2