From: Marcin Siodelski Date: Wed, 27 Mar 2024 12:22:12 +0000 (+0100) Subject: [#3250] Delayed transition to terminated X-Git-Tag: Kea-2.5.8~79 X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=f9e7fdb7b3fe34d40f563fdf3c7009a93d991c6a;p=thirdparty%2Fkea.git [#3250] Delayed transition to terminated --- diff --git a/src/hooks/dhcp/high_availability/communication_state.cc b/src/hooks/dhcp/high_availability/communication_state.cc index 38b7566df3..39cc1ccbc4 100644 --- a/src/hooks/dhcp/high_availability/communication_state.cc +++ b/src/hooks/dhcp/high_availability/communication_state.cc @@ -58,8 +58,8 @@ CommunicationState::CommunicationState(const IOServicePtr& io_service, const HAConfigPtr& config) : io_service_(io_service), config_(config), timer_(), interval_(0), poke_time_(boost::posix_time::microsec_clock::universal_time()), - heartbeat_impl_(0), partner_state_(-1), partner_scopes_(), - clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(), + heartbeat_impl_(0), partner_state_(-1), partner_state_time_(), + partner_scopes_(), clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(), my_time_at_skew_(), partner_time_at_skew_(), analyzed_messages_count_(0), unsent_update_count_(0), partner_unsent_update_count_{0, 0}, mutex_(new mutex()) { @@ -114,13 +114,33 @@ CommunicationState::setPartnerUnavailable() { void CommunicationState::setPartnerStateInternal(const std::string& state) { try { - partner_state_ = stringToState(state); + auto new_partner_state = stringToState(state); + if (new_partner_state != partner_state_) { + setCurrentPartnerStateTimeInternal(); + } + partner_state_ = new_partner_state; } catch (...) { isc_throw(BadValue, "unsupported HA partner state returned " << state); } } +time_duration +CommunicationState::getDurationSincePartnerStateTime() const { + ptime now = boost::posix_time::microsec_clock::universal_time(); + if (MultiThreadingMgr::instance().getMode()) { + std::lock_guard lk(*mutex_); + return (now - partner_state_time_); + } else { + return (now - partner_state_time_); + } +} + +void +CommunicationState::setCurrentPartnerStateTimeInternal() { + partner_state_time_ = boost::posix_time::microsec_clock::universal_time(); +} + std::set CommunicationState::getPartnerScopes() const { if (MultiThreadingMgr::instance().getMode()) { diff --git a/src/hooks/dhcp/high_availability/communication_state.h b/src/hooks/dhcp/high_availability/communication_state.h index 2ca01db97d..9bdaa78656 100644 --- a/src/hooks/dhcp/high_availability/communication_state.h +++ b/src/hooks/dhcp/high_availability/communication_state.h @@ -123,6 +123,22 @@ private: void setPartnerStateInternal(const std::string& state); public: + + /// @brief Returns the duration since the partner was first seen in the + /// current state. + /// + /// @return A duration since the partner has been in the current state. + boost::posix_time::time_duration getDurationSincePartnerStateTime() const; + +private: + + /// @brief Sets partner's state time to current time. + /// + /// It marks the time when the partner was first seen in the current state. + void setCurrentPartnerStateTimeInternal(); + +public: + /// @brief Returns scopes served by the partner server. /// /// @return A set of scopes served by the partner. @@ -706,6 +722,9 @@ protected: /// Negative value means that the partner's state is unknown. int partner_state_; + /// Holds a time when partner was first seen in the current state. + boost::posix_time::ptime partner_state_time_; + /// @brief Last known set of scopes served by the partner server. std::set partner_scopes_; diff --git a/src/hooks/dhcp/high_availability/ha_messages.cc b/src/hooks/dhcp/high_availability/ha_messages.cc index e59517b97e..b3f6377dd7 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.cc +++ b/src/hooks/dhcp/high_availability/ha_messages.cc @@ -123,6 +123,7 @@ extern const isc::log::MessageID HA_SYNC_HANDLER_FAILED = "HA_SYNC_HANDLER_FAILE extern const isc::log::MessageID HA_SYNC_START = "HA_SYNC_START"; extern const isc::log::MessageID HA_SYNC_SUCCESSFUL = "HA_SYNC_SUCCESSFUL"; extern const isc::log::MessageID HA_TERMINATED = "HA_TERMINATED"; +extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART = "HA_TERMINATED_PARTNER_DID_NOT_RESTART"; extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER = "HA_TERMINATED_RESTART_PARTNER"; } // namespace ha @@ -247,6 +248,7 @@ const char* values[] = { "HA_SYNC_START", "%1: starting lease database synchronization with %2", "HA_SYNC_SUCCESSFUL", "%1: lease database synchronization with %2 completed successfully in %3", "HA_TERMINATED", "HA %1: service terminated due to an unrecoverable condition. Check previous error message(s), address the problem and restart!", + "HA_TERMINATED_PARTNER_DID_NOT_RESTART", "%1: service is terminating because the terminated partner was not restarted within %2 minutes", "HA_TERMINATED_RESTART_PARTNER", "%1: waiting for the partner in the TERMINATED state to be restarted", NULL }; diff --git a/src/hooks/dhcp/high_availability/ha_messages.h b/src/hooks/dhcp/high_availability/ha_messages.h index 8a0c405c20..c2597a8d4f 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.h +++ b/src/hooks/dhcp/high_availability/ha_messages.h @@ -124,6 +124,7 @@ extern const isc::log::MessageID HA_SYNC_HANDLER_FAILED; extern const isc::log::MessageID HA_SYNC_START; extern const isc::log::MessageID HA_SYNC_SUCCESSFUL; extern const isc::log::MessageID HA_TERMINATED; +extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART; extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER; } // namespace ha diff --git a/src/hooks/dhcp/high_availability/ha_messages.mes b/src/hooks/dhcp/high_availability/ha_messages.mes index b258188243..7611ec87ee 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.mes +++ b/src/hooks/dhcp/high_availability/ha_messages.mes @@ -749,3 +749,13 @@ This informational message is issued when the server has been restarted after correcting the clock skew. The partner server is still in the terminated state. The partner must be restarted before the server can synchronize the database and start normal operation. + +% HA_TERMINATED_PARTNER_DID_NOT_RESTART %1: service is terminating because the terminated partner was not restarted within %2 minutes +This warning message is issued to indicate that the HA service is terminating +because partner server is in the terminated state and was not restarted within +an expacted time frame. The terminated servers should be restarted after correcting +the problem that caused the termination. They can be restarted sequentially but the +duration between the restarts should not be too long. If it is long it may mean that +the restart of one of the servers was unintentional (e.g., power outage). If the +restarted server remains in the waiting state it cannot serve DHCP clients. Moving +to the terminated state at least allows for responding to the DHCP traffic. diff --git a/src/hooks/dhcp/high_availability/ha_service.cc b/src/hooks/dhcp/high_availability/ha_service.cc index 9b00b3fb99..db49b98f4a 100644 --- a/src/hooks/dhcp/high_availability/ha_service.cc +++ b/src/hooks/dhcp/high_availability/ha_service.cc @@ -70,6 +70,7 @@ const int HAService::HA_MAINTENANCE_START_EVT; const int HAService::HA_MAINTENANCE_CANCEL_EVT; const int HAService::HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED; const int HAService::HA_SYNCED_PARTNER_UNAVAILABLE_EVT; +const int HAService::HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES; HAService::HAService(const unsigned int id, const IOServicePtr& io_service, const NetworkStatePtr& network_state, const HAConfigPtr& config, @@ -884,7 +885,17 @@ HAService::waitingStateHandler() { postNextEvent(NOP_EVT); break; - case HA_TERMINATED_ST: + case HA_TERMINATED_ST: { + auto partner_in_terminated = communication_state_->getDurationSincePartnerStateTime(); + if (!partner_in_terminated.is_not_a_date_time() && + partner_in_terminated.minutes() >= HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES) { + LOG_WARN(ha_logger, HA_TERMINATED_PARTNER_DID_NOT_RESTART) + .arg(config_->getThisServerName()) + .arg(HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES); + verboseTransition(HA_TERMINATED_ST); + break; + } + // We have checked above whether the clock skew is exceeding the threshold // and we should terminate. If we're here, it means that the clock skew // is acceptable. The partner may be still in the terminated state because @@ -895,7 +906,7 @@ HAService::waitingStateHandler() { .arg(config_->getThisServerName()); postNextEvent(NOP_EVT); break; - + } case HA_WAITING_ST: // If both servers are waiting, the primary server 'wins' and is // transitioned to the next state first. diff --git a/src/hooks/dhcp/high_availability/ha_service.h b/src/hooks/dhcp/high_availability/ha_service.h index 681b1d23f8..8d677bdc63 100644 --- a/src/hooks/dhcp/high_availability/ha_service.h +++ b/src/hooks/dhcp/high_availability/ha_service.h @@ -80,6 +80,10 @@ public: /// Control result returned in response to ha-maintenance-notify. static const int HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED = 1001; + /// A delay in minutes to transition from the waiting to terminated state + /// when the partner remains in terminated state. + static const int HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES = 10; + protected: /// @brief Callback invoked when request was sent and a response received diff --git a/src/hooks/dhcp/high_availability/tests/communication_state_unittest.cc b/src/hooks/dhcp/high_availability/tests/communication_state_unittest.cc index 519d58d43f..19ccc58e08 100644 --- a/src/hooks/dhcp/high_availability/tests/communication_state_unittest.cc +++ b/src/hooks/dhcp/high_availability/tests/communication_state_unittest.cc @@ -65,6 +65,14 @@ public: /// tested in a different test case. void partnerStateUnavailableTest(); + /// @brief Verifies that the duration since the partner time is updated + /// correctly when the partner has certain state. + void partnerStateTimeExplicitStateTest(); + + /// @brief Verifies that the duration since the partner time is updated + /// correctly when the partner is unavailable. + void partnerStateTimeUnavailableTest(); + /// @brief Verifies that the partner's scopes are set and retrieved correctly. void partnerScopesTest(); @@ -208,6 +216,64 @@ CommunicationStateTest::partnerStateTest() { EXPECT_THROW(state_.setPartnerState("unsupported"), BadValue); } +// Verifies that the duration since the partner time is updated +// correctly when the partner has certain state. +void +CommunicationStateTest::partnerStateTimeExplicitStateTest() { + // Initially the partner state time is not set. + time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime(); + EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time()); + + // Set new partner state. The state time should be set close + // to the current time. + state_.setPartnerState("waiting"); + + // Sleep for a short period of time to ensure that the returned + // duration is non-zero. + usleep(10); + + // The duration must be set. + time_duration duration_since_waiting = state_.getDurationSincePartnerStateTime(); + ASSERT_FALSE(duration_since_waiting.is_not_a_date_time()); + + // Check that the partner time is within 10 seconds range from + // the current time. 10 seconds difference may sound too high but + // this test is not about the precision. It is more important that + // it is stable. In the HA state machine the duration of 10 seconds + // is relatively low anyway. + EXPECT_GT(duration_since_waiting.fractional_seconds(), 0); + EXPECT_LT(duration_since_waiting.seconds(), 10); +} + +// Verifies that the duration since the partner time is updated +// correctly when the partner is unavailable. +void +CommunicationStateTest::partnerStateTimeUnavailableTest() { + // Initially the partner state time is not set. + time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime(); + EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time()); + + // Set new partner state. The state time should be set close + // to the current time. + state_.setPartnerUnavailable(); + + // Sleep for a short period of time to ensure that the returned + // duration is non-zero. + usleep(10); + + // The duration must be set. + time_duration duration_since_unavail = state_.getDurationSincePartnerStateTime(); + ASSERT_FALSE(duration_since_unavail.is_not_a_date_time()); + + // Check that the partner time is within 10 seconds range from + // the current time. 10 seconds difference may sound too high but + // this test is not about the precision. It is more important that + // it is stable. In the HA state machine the duration of 10 seconds + // is relatively low anyway. + EXPECT_GT(duration_since_unavail.fractional_seconds(), 0); + EXPECT_LT(duration_since_unavail.seconds(), 10); +} + // Verifies that the partner state is set to unavailable. void CommunicationStateTest::partnerStateUnavailableTest() { @@ -987,6 +1053,24 @@ TEST_F(CommunicationStateTest, partnerStateUnavailableTestMultiThreading) { partnerStateUnavailableTest(); } +TEST_F(CommunicationStateTest, partnerStateTimeExplicitStateTest) { + partnerStateTimeExplicitStateTest(); +} + +TEST_F(CommunicationStateTest, partnerStateTimeExplcitStateTestMultiThreading) { + MultiThreadingMgr::instance().setMode(true); + partnerStateTimeExplicitStateTest(); +} + +TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTest) { + partnerStateTimeUnavailableTest(); +} + +TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTestMultiThreading) { + MultiThreadingMgr::instance().setMode(true); + partnerStateTimeUnavailableTest(); +} + TEST_F(CommunicationStateTest, partnerScopesTest) { partnerScopesTest(); } diff --git a/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc b/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc index 1198f82e6c..21c0c5de0d 100644 --- a/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc +++ b/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc @@ -6912,6 +6912,53 @@ TEST_F(HAServiceStateMachineTest, terminateNoTransitionOnRestart) { << "' state"; } +// This test checks that the server transitions back to the terminated state +// from the waiting state when the partner sits in the terminated state. +TEST_F(HAServiceStateMachineTest, terminateWhenPartnerRemainsTerminated) { + auto config_storage = createValidConfiguration(); + partner_->startup(); + startService(config_storage); + NakedCommunicationState4Ptr state(new NakedCommunicationState4(io_service_, + config_storage)); + service_->communication_state_ = state; + + // Set partner's time to the current time. This guarantees that the clock + // skew is below 60s and there is no reason for the server to transition + // to the terminated state. + partner_->setDateTime(HttpDateTime().rfc1123Format()); + // The partner is in the terminated state to simulate sequential restart + // of the two servers from the terminated state. + partner_->transition("terminated"); + // This server is in the waiting state which simulates the restart case. + service_->transition(HA_WAITING_ST, HAService::NOP_EVT); + // Run the heartbeat. + waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT); + // The server should remain in the waiting state because the clock skew + // is low. + EXPECT_EQ(HA_WAITING_ST, service_->getCurrState()) + << "expected that the server remains in 'waiting' state" + << "', but transitioned to the '" + << service_->getStateLabel(service_->getCurrState()) + << "' state"; + + // Move the partner state time to long ago. It should cause our server to + // assume that the administrator neglected to restart the terminated partner + // and transition to the terminated state to continue serving the DHCP + // clients. + boost::posix_time::ptime long_ago = HttpDateTime().fromRfc1123("Sun, 06 Nov 1994 08:49:37 GMT").getPtime(); + state->partner_state_time_ = long_ago; + + // Run the heartbeat. + waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT); + + // The server should have transitioned to the terminated state. + EXPECT_EQ(HA_TERMINATED_ST, service_->getCurrState()) + << "expected that the server transitions to the 'terminated' state" + << "', but transitioned to the '" + << service_->getStateLabel(service_->getCurrState()) + << "' state"; +} + // This test checks all combinations of server and partner states and the // resulting state to which the server transitions. This server is secondary. // There is another test which validates state transitions from the diff --git a/src/hooks/dhcp/high_availability/tests/ha_test.h b/src/hooks/dhcp/high_availability/tests/ha_test.h index 8f52190fc0..911ff9cb87 100644 --- a/src/hooks/dhcp/high_availability/tests/ha_test.h +++ b/src/hooks/dhcp/high_availability/tests/ha_test.h @@ -61,6 +61,7 @@ public: using StateType::partner_time_at_skew_; using StateType::unsent_update_count_; using StateType::getRejectedLeaseUpdatesCountFromContainer; + using StateType::partner_state_time_; }; /// @brief Type of the NakedCommunicationState for DHCPv4.