From: Alan T. DeKok Date: Mon, 23 Aug 2021 19:49:59 +0000 (-0400) Subject: rework to respect response_window, etc. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0ca19dcb23c82a7265c48e774fd67449d3b0c713;p=thirdparty%2Ffreeradius-server.git rework to respect response_window, etc. --- diff --git a/src/modules/rlm_radius/rlm_radius_udp.c b/src/modules/rlm_radius/rlm_radius_udp.c index b8bee724ef6..dfbb2a30dc1 100644 --- a/src/modules/rlm_radius/rlm_radius_udp.c +++ b/src/modules/rlm_radius/rlm_radius_udp.c @@ -460,7 +460,7 @@ static void conn_error_status_check(UNUSED fr_event_list_t *el, UNUSED int fd, U fr_connection_signal_reconnect(conn, FR_CONNECTION_FAILED); } -/** Status check timedout +/** Status check timer when opening the connection for the first time. * * Setup retries, or fail the connection. */ @@ -1532,15 +1532,52 @@ static int encode(rlm_radius_udp_t const *inst, request_t *request, udp_request_ /** Revive a connection after "revive_interval" * */ -static void revive_timer(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void *uctx) +static void revive_timeout(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void *uctx) { fr_trunk_connection_t *tconn = talloc_get_type_abort(uctx, fr_trunk_connection_t); udp_handle_t *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t); - INFO("%s - Shutting down and reviving connection %s", h->module_name, h->name); + INFO("%s - Reviving connection %s", h->module_name, h->name); fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); } +/** Mark a connection dead after "zombie_interval" + * + */ +static void zombie_timeout(fr_event_list_t *el, fr_time_t now, void *uctx) +{ + fr_trunk_connection_t *tconn = talloc_get_type_abort(uctx, fr_trunk_connection_t); + udp_handle_t *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t); + + INFO("%s - No replies during 'zombie_period', marking connection %s as dead", h->module_name, h->name); + + /* + * Don't use this connection, and re-queue all of its + * requests onto other connections. + */ + fr_trunk_connection_signal_inactive(tconn); + (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false); + + /* + * We do have status checks. Try to reconnect the + * connection immediately. If the status checks pass, + * then the connection will be marked "alive" + */ + if (h->inst->parent->status_check) { + fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + return; + } + + /* + * Revive the connection after a time. + */ + if (fr_event_timer_at(h, el, &h->zombie_ev, now + h->inst->parent->revive_interval, revive_timeout, h) < 0) { + ERROR("Failed inserting revive timeout for connection"); + fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + } +} + + /** See if the connection is zombied. * * We check for zombie when major events happen: @@ -1561,12 +1598,10 @@ static void revive_timer(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void * for zombie at BOTH the timeout and the mux / write function. * * @return - * - true if a connection state change was triggered. - * The connection is likely now a zombie or was reconnected. - * - false if the connection did not change state. It may - * still be a zombie, but it was a zombie when this + * - true if the connection is zombie. + * - false if the connection is not zombie. */ -static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn, fr_time_t now) +static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn, fr_time_t now, fr_time_t last_sent) { udp_handle_t *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t); @@ -1577,104 +1612,84 @@ static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn, fr_assert(!h->inst->replicate); /* - * If there's already a zombie check started, don't do - * another one. - * - * Or if we never sent a packet, we don't know (or care) - * if the home server is up. - * - * Or if we had sent packets, and then went idle. + * If we're status checking OR already zombie, don't go to zombie * - * Or we had replies, and then went idle. - * - * We do checks for both sent && replied, because we - * could have sent packets without getting replies (and - * then mark it zombie), or we could have gotten some - * replies which then stopped coming back (and then mark - * it zombie). */ - if (h->status_checking || h->zombie_ev || !h->last_sent || (h->last_sent <= h->last_idle) || - (h->last_reply && (h->last_reply <= h->last_idle))) { - return false; - } + if (h->status_checking || h->zombie_ev) return true; if (now == 0) now = fr_time(); /* - * We've sent a packet since we last went idle, and/or - * we've received replies since we last went idle. - * - * If we have a reply, then set the zombie timeout from - * when we received the last reply. - * - * If we haven't seen a reply, then set the zombie - * timeout from when we first started sending packets. + * We received a reply since this packet was sent, the connection isn't zombie. */ - if (h->last_reply) { - if ((h->last_reply + h->inst->parent->zombie_period) >= now) return false; - DEBUG2("%s - We have passed 'zombie_period' time since the last reply on connection %s", - h->module_name, h->name); - } else { - if ((h->first_sent + h->inst->parent->zombie_period) >= now) return false; - DEBUG2("%s - We have passed 'zombie_period' time since we first sent a packet, and " - "there have been no replies on connection %s", h->module_name, h->name); - } + if (h->last_reply >= last_sent) return false; /* - * No status checks: this connection is dead. - * - * We will requeue this packet on another - * connection. + * If we've seen ANY response in the allowed window, then the connection is still alive. */ - if (!h->inst->parent->status_check) { - fr_time_t when; - - WARN("%s - Connection failed. Reviving it in %pVs", h->module_name, - fr_box_time_delta(h->inst->parent->revive_interval)); - fr_trunk_connection_signal_inactive(tconn); - (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false); - - when = now + h->inst->parent->revive_interval; - if (fr_event_timer_at(h, el, &h->zombie_ev, when, revive_timer, tconn) < 0) { - fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); - return true; - } - - return true; - } + if (last_sent && ((last_sent + h->inst->parent->response_window) < now)) return false; /* * Mark the connection as inactive, but keep sending * packets on it. */ WARN("%s - Entering Zombie state - connection %s", h->module_name, h->name); - h->status_checking = true; - - /* - * Move ALL requests to other connections! - */ fr_trunk_connection_signal_inactive(tconn); - (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false); - /* - * Queue up the status check packet. It will be sent - * when the connection is writable. - */ - h->status_u->retry.start = 0; - h->status_r->treq = NULL; + if (h->inst->parent->status_check) { + h->status_checking = true; - if (fr_trunk_request_enqueue_on_conn(&h->status_r->treq, tconn, h->status_request, - h->status_u, h->status_r, true) != FR_TRUNK_ENQUEUE_OK) { - fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + /* + * Queue up the status check packet. It will be sent + * when the connection is writable. + */ + h->status_u->retry.start = 0; + h->status_r->treq = NULL; + + if (fr_trunk_request_enqueue_on_conn(&h->status_r->treq, tconn, h->status_request, + h->status_u, h->status_r, true) != FR_TRUNK_ENQUEUE_OK) { + fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + } + } else { + if (fr_event_timer_at(h, el, &h->zombie_ev, now + h->inst->parent->zombie_period, zombie_timeout, h) < 0) { + ERROR("Failed inserting zombie timeout for connection"); + fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + } } return true; } -/** Handle retries for a request_t +/** Handle timeouts when a request is being sent synchronously * */ static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx) +{ + fr_trunk_request_t *treq = talloc_get_type_abort(uctx, fr_trunk_request_t); + udp_handle_t *h; + udp_request_t *u = talloc_get_type_abort(treq->preq, udp_request_t); + udp_result_t *r = talloc_get_type_abort(treq->rctx, udp_result_t); + fr_trunk_connection_t *tconn = treq->tconn; + + fr_assert(treq->state == FR_TRUNK_REQUEST_STATE_SENT); /* No other states should be timing out */ + fr_assert(treq->preq); /* Must still have a protocol request */ + fr_assert(u->rr); + fr_assert(tconn); + + h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t); + + r->rcode = RLM_MODULE_FAIL; + fr_trunk_request_signal_complete(treq); + + fr_assert(!u->status_check); + + check_for_zombie(el, tconn, now, u->retry.start); +} + +/** Handle retries when a request is being sent asynchronously + * + */ +static void request_retry(UNUSED fr_event_list_t *el, fr_time_t now, void *uctx) { fr_trunk_request_t *treq = talloc_get_type_abort(uctx, fr_trunk_request_t); udp_handle_t *h; @@ -1690,28 +1705,56 @@ static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx) h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t); - if (!u->status_check) { - /* - * If the connection just became a zombie - * the request that just timedout will - * have moved back into the trunk backlog, - * been assigned to another connection - * or freed. - * - * In any case we must not continue to - * work with it, because we have no idea - * what state its in. - */ - if (check_for_zombie(el, tconn, now)) return; + fr_assert(!u->status_check); - } else { - /* - * Reset replies to 0 as we only count - * contiguous, good, replies. - */ - u->num_replies = 0; + switch (fr_retry_next(&u->retry, now)) { + /* + * Queue the request for retransmission. + * + * @todo - set up "next" timer here, instead of in + * request_mux() ? That way we can catch the case of + * packets sitting in the queue for extended periods of + * time, and still run the timers. + */ + case FR_RETRY_CONTINUE: + fr_trunk_request_requeue(treq); + return; + + case FR_RETRY_MRD: + REDEBUG("Reached maximum_retransmit_duration (%pVs > %pVs), failing request", + fr_box_time_delta(now - u->retry.start), fr_box_time_delta(u->retry.config->mrd)); + break; + + case FR_RETRY_MRC: + REDEBUG("Reached maximum_retransmit_count (%u > %u), failing request", + u->retry.count, u->retry.config->mrc); + break; } + r->rcode = RLM_MODULE_FAIL; + fr_trunk_request_signal_complete(treq); + + check_for_zombie(el, tconn, now, u->retry.start); +} + +static void status_check_retry(UNUSED fr_event_list_t *el, fr_time_t now, void *uctx) +{ + fr_trunk_request_t *treq = talloc_get_type_abort(uctx, fr_trunk_request_t); + udp_handle_t *h; + udp_request_t *u = talloc_get_type_abort(treq->preq, udp_request_t); + udp_result_t *r = talloc_get_type_abort(treq->rctx, udp_result_t); + request_t *request = treq->request; + fr_trunk_connection_t *tconn = treq->tconn; + + fr_assert(treq->state == FR_TRUNK_REQUEST_STATE_SENT); /* No other states should be timing out */ + fr_assert(treq->preq); /* Must still have a protocol request */ + fr_assert(u->rr); + fr_assert(tconn); + + h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t); + + fr_assert(u->status_check); + switch (fr_retry_next(&u->retry, now)) { /* * Queue the request for retransmission. @@ -1739,12 +1782,14 @@ static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx) r->rcode = RLM_MODULE_FAIL; fr_trunk_request_signal_complete(treq); - if (!u->status_check) return; + WARN("%s - No response to status check, marking connection as dead - %s", h->module_name, h->name); - WARN("%s - No response to status check, marking connection as dead - %s", h->module_name, h->name); - - h->status_checking = false; - fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); + /* + * We're no longer status checking, reconnect the + * connection. + */ + h->status_checking = false; + fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED); } static void request_mux(fr_event_list_t *el, @@ -1757,10 +1802,10 @@ static void request_mux(fr_event_list_t *el, size_t total_len = 0; /* - * If the connection just became a zombie - * don't try and enqueue things on it! + * If the connection is zombie, then don't try to enqueue + * things on it! */ - if (check_for_zombie(el, tconn, 0)) return; + if (check_for_zombie(el, tconn, 0, h->last_sent)) return; /* * Encode multiple packets in preparation @@ -1963,25 +2008,36 @@ static void request_mux(fr_event_list_t *el, action = "Retransmitted"; } - if (!inst->parent->synchronous) { + if (u->status_check) { + RDEBUG("%s status check. Expecting response within %pVs", action, + fr_box_time_delta(u->retry.rt)); + + if (fr_event_timer_at(u, el, &u->ev, u->retry.next, status_check_retry, treq) < 0) { + RERROR("Failed inserting retransmit timeout for connection"); + fr_trunk_request_signal_fail(treq); + continue; + } + + } else if (!inst->parent->synchronous) { RDEBUG("%s request. Expecting response within %pVs", action, fr_box_time_delta(u->retry.rt)); - if (fr_event_timer_at(u, el, &u->ev, u->retry.next, request_timeout, treq) < 0) { + if (fr_event_timer_at(u, el, &u->ev, u->retry.next, request_retry, treq) < 0) { RERROR("Failed inserting retransmit timeout for connection"); fr_trunk_request_signal_fail(treq); continue; } - } else { + + } else if (u->retry.count == 1) { + if (fr_event_timer_at(u, el, &u->ev, u->retry.start + h->inst->parent->response_window, request_timeout, treq) < 0) { + RERROR("Failed inserting timeout for connection"); + fr_trunk_request_signal_fail(treq); + continue; + } + /* * If the packet doesn't get a response, * then udp_request_free() will notice, and run conn_zombie() - * - * @todo - set up a response_window which is LESS - * than max_request_time. That way we - * can return "fail", and process the - * request through a fail handler, - * instead of just freeing it. */ RDEBUG("%s request. Relying on NAS to perform more retransmissions", action); } @@ -2641,13 +2697,19 @@ static void mod_signal(module_ctx_t const *mctx, UNUSED request_t *request, */ case FR_SIGNAL_DUP: /* - * Connection just became a zombie - * we don't want to retransmit - * as the connection is now dead - * and we have no idea what state - * the request is in. + * If we're not synchronous, then rely on + * request_retry() to do the retransmissions. + */ + if (!t->inst->parent->synchronous) return; + + /* + * We are synchronous, retransmit the current + * request on the same connection. + * + * If it's zombie, we still resend it. If the + * connection is dead, then a callback will move + * this request to a new connection. */ - if (check_for_zombie(t->el, r->treq->tconn, 0)) return; fr_trunk_request_requeue(r->treq); return;