const HAConfigPtr& config)
: io_service_(io_service), config_(config), timer_(), interval_(0),
poke_time_(boost::posix_time::microsec_clock::universal_time()),
- heartbeat_impl_(0), partner_state_(-1), partner_scopes_(),
- clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(),
+ heartbeat_impl_(0), partner_state_(-1), partner_state_time_(),
+ partner_scopes_(), clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(),
my_time_at_skew_(), partner_time_at_skew_(),
analyzed_messages_count_(0), unsent_update_count_(0),
partner_unsent_update_count_{0, 0}, mutex_(new mutex()) {
void
CommunicationState::setPartnerStateInternal(const std::string& state) {
try {
- partner_state_ = stringToState(state);
+ auto new_partner_state = stringToState(state);
+ if (new_partner_state != partner_state_) {
+ setCurrentPartnerStateTimeInternal();
+ }
+ partner_state_ = new_partner_state;
} catch (...) {
isc_throw(BadValue, "unsupported HA partner state returned "
<< state);
}
}
+time_duration
+CommunicationState::getDurationSincePartnerStateTime() const {
+ ptime now = boost::posix_time::microsec_clock::universal_time();
+ if (MultiThreadingMgr::instance().getMode()) {
+ std::lock_guard<std::mutex> lk(*mutex_);
+ return (now - partner_state_time_);
+ } else {
+ return (now - partner_state_time_);
+ }
+}
+
+void
+CommunicationState::setCurrentPartnerStateTimeInternal() {
+ partner_state_time_ = boost::posix_time::microsec_clock::universal_time();
+}
+
std::set<std::string>
CommunicationState::getPartnerScopes() const {
if (MultiThreadingMgr::instance().getMode()) {
void setPartnerStateInternal(const std::string& state);
public:
+
+ /// @brief Returns the duration since the partner was first seen in the
+ /// current state.
+ ///
+ /// @return A duration since the partner has been in the current state.
+ boost::posix_time::time_duration getDurationSincePartnerStateTime() const;
+
+private:
+
+ /// @brief Sets partner's state time to current time.
+ ///
+ /// It marks the time when the partner was first seen in the current state.
+ void setCurrentPartnerStateTimeInternal();
+
+public:
+
/// @brief Returns scopes served by the partner server.
///
/// @return A set of scopes served by the partner.
/// Negative value means that the partner's state is unknown.
int partner_state_;
+ /// Holds a time when partner was first seen in the current state.
+ boost::posix_time::ptime partner_state_time_;
+
/// @brief Last known set of scopes served by the partner server.
std::set<std::string> partner_scopes_;
extern const isc::log::MessageID HA_SYNC_START = "HA_SYNC_START";
extern const isc::log::MessageID HA_SYNC_SUCCESSFUL = "HA_SYNC_SUCCESSFUL";
extern const isc::log::MessageID HA_TERMINATED = "HA_TERMINATED";
+extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART = "HA_TERMINATED_PARTNER_DID_NOT_RESTART";
extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER = "HA_TERMINATED_RESTART_PARTNER";
} // namespace ha
"HA_SYNC_START", "%1: starting lease database synchronization with %2",
"HA_SYNC_SUCCESSFUL", "%1: lease database synchronization with %2 completed successfully in %3",
"HA_TERMINATED", "HA %1: service terminated due to an unrecoverable condition. Check previous error message(s), address the problem and restart!",
+ "HA_TERMINATED_PARTNER_DID_NOT_RESTART", "%1: service is terminating because the terminated partner was not restarted within %2 minutes",
"HA_TERMINATED_RESTART_PARTNER", "%1: waiting for the partner in the TERMINATED state to be restarted",
NULL
};
extern const isc::log::MessageID HA_SYNC_START;
extern const isc::log::MessageID HA_SYNC_SUCCESSFUL;
extern const isc::log::MessageID HA_TERMINATED;
+extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART;
extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER;
} // namespace ha
correcting the clock skew. The partner server is still in the terminated
state. The partner must be restarted before the server can synchronize the
database and start normal operation.
+
+% HA_TERMINATED_PARTNER_DID_NOT_RESTART %1: service is terminating because the terminated partner was not restarted within %2 minutes
+This warning message is issued to indicate that the HA service is terminating
+because partner server is in the terminated state and was not restarted within
+an expacted time frame. The terminated servers should be restarted after correcting
+the problem that caused the termination. They can be restarted sequentially but the
+duration between the restarts should not be too long. If it is long it may mean that
+the restart of one of the servers was unintentional (e.g., power outage). If the
+restarted server remains in the waiting state it cannot serve DHCP clients. Moving
+to the terminated state at least allows for responding to the DHCP traffic.
const int HAService::HA_MAINTENANCE_CANCEL_EVT;
const int HAService::HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED;
const int HAService::HA_SYNCED_PARTNER_UNAVAILABLE_EVT;
+const int HAService::HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES;
HAService::HAService(const unsigned int id, const IOServicePtr& io_service,
const NetworkStatePtr& network_state, const HAConfigPtr& config,
postNextEvent(NOP_EVT);
break;
- case HA_TERMINATED_ST:
+ case HA_TERMINATED_ST: {
+ auto partner_in_terminated = communication_state_->getDurationSincePartnerStateTime();
+ if (!partner_in_terminated.is_not_a_date_time() &&
+ partner_in_terminated.minutes() >= HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES) {
+ LOG_WARN(ha_logger, HA_TERMINATED_PARTNER_DID_NOT_RESTART)
+ .arg(config_->getThisServerName())
+ .arg(HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES);
+ verboseTransition(HA_TERMINATED_ST);
+ break;
+ }
+
// We have checked above whether the clock skew is exceeding the threshold
// and we should terminate. If we're here, it means that the clock skew
// is acceptable. The partner may be still in the terminated state because
.arg(config_->getThisServerName());
postNextEvent(NOP_EVT);
break;
-
+ }
case HA_WAITING_ST:
// If both servers are waiting, the primary server 'wins' and is
// transitioned to the next state first.
/// Control result returned in response to ha-maintenance-notify.
static const int HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED = 1001;
+ /// A delay in minutes to transition from the waiting to terminated state
+ /// when the partner remains in terminated state.
+ static const int HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES = 10;
+
protected:
/// @brief Callback invoked when request was sent and a response received
/// tested in a different test case.
void partnerStateUnavailableTest();
+ /// @brief Verifies that the duration since the partner time is updated
+ /// correctly when the partner has certain state.
+ void partnerStateTimeExplicitStateTest();
+
+ /// @brief Verifies that the duration since the partner time is updated
+ /// correctly when the partner is unavailable.
+ void partnerStateTimeUnavailableTest();
+
/// @brief Verifies that the partner's scopes are set and retrieved correctly.
void partnerScopesTest();
EXPECT_THROW(state_.setPartnerState("unsupported"), BadValue);
}
+// Verifies that the duration since the partner time is updated
+// correctly when the partner has certain state.
+void
+CommunicationStateTest::partnerStateTimeExplicitStateTest() {
+ // Initially the partner state time is not set.
+ time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime();
+ EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time());
+
+ // Set new partner state. The state time should be set close
+ // to the current time.
+ state_.setPartnerState("waiting");
+
+ // Sleep for a short period of time to ensure that the returned
+ // duration is non-zero.
+ usleep(10);
+
+ // The duration must be set.
+ time_duration duration_since_waiting = state_.getDurationSincePartnerStateTime();
+ ASSERT_FALSE(duration_since_waiting.is_not_a_date_time());
+
+ // Check that the partner time is within 10 seconds range from
+ // the current time. 10 seconds difference may sound too high but
+ // this test is not about the precision. It is more important that
+ // it is stable. In the HA state machine the duration of 10 seconds
+ // is relatively low anyway.
+ EXPECT_GT(duration_since_waiting.fractional_seconds(), 0);
+ EXPECT_LT(duration_since_waiting.seconds(), 10);
+}
+
+// Verifies that the duration since the partner time is updated
+// correctly when the partner is unavailable.
+void
+CommunicationStateTest::partnerStateTimeUnavailableTest() {
+ // Initially the partner state time is not set.
+ time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime();
+ EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time());
+
+ // Set new partner state. The state time should be set close
+ // to the current time.
+ state_.setPartnerUnavailable();
+
+ // Sleep for a short period of time to ensure that the returned
+ // duration is non-zero.
+ usleep(10);
+
+ // The duration must be set.
+ time_duration duration_since_unavail = state_.getDurationSincePartnerStateTime();
+ ASSERT_FALSE(duration_since_unavail.is_not_a_date_time());
+
+ // Check that the partner time is within 10 seconds range from
+ // the current time. 10 seconds difference may sound too high but
+ // this test is not about the precision. It is more important that
+ // it is stable. In the HA state machine the duration of 10 seconds
+ // is relatively low anyway.
+ EXPECT_GT(duration_since_unavail.fractional_seconds(), 0);
+ EXPECT_LT(duration_since_unavail.seconds(), 10);
+}
+
// Verifies that the partner state is set to unavailable.
void
CommunicationStateTest::partnerStateUnavailableTest() {
partnerStateUnavailableTest();
}
+TEST_F(CommunicationStateTest, partnerStateTimeExplicitStateTest) {
+ partnerStateTimeExplicitStateTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeExplcitStateTestMultiThreading) {
+ MultiThreadingMgr::instance().setMode(true);
+ partnerStateTimeExplicitStateTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTest) {
+ partnerStateTimeUnavailableTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTestMultiThreading) {
+ MultiThreadingMgr::instance().setMode(true);
+ partnerStateTimeUnavailableTest();
+}
+
TEST_F(CommunicationStateTest, partnerScopesTest) {
partnerScopesTest();
}
<< "' state";
}
+// This test checks that the server transitions back to the terminated state
+// from the waiting state when the partner sits in the terminated state.
+TEST_F(HAServiceStateMachineTest, terminateWhenPartnerRemainsTerminated) {
+ auto config_storage = createValidConfiguration();
+ partner_->startup();
+ startService(config_storage);
+ NakedCommunicationState4Ptr state(new NakedCommunicationState4(io_service_,
+ config_storage));
+ service_->communication_state_ = state;
+
+ // Set partner's time to the current time. This guarantees that the clock
+ // skew is below 60s and there is no reason for the server to transition
+ // to the terminated state.
+ partner_->setDateTime(HttpDateTime().rfc1123Format());
+ // The partner is in the terminated state to simulate sequential restart
+ // of the two servers from the terminated state.
+ partner_->transition("terminated");
+ // This server is in the waiting state which simulates the restart case.
+ service_->transition(HA_WAITING_ST, HAService::NOP_EVT);
+ // Run the heartbeat.
+ waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT);
+ // The server should remain in the waiting state because the clock skew
+ // is low.
+ EXPECT_EQ(HA_WAITING_ST, service_->getCurrState())
+ << "expected that the server remains in 'waiting' state"
+ << "', but transitioned to the '"
+ << service_->getStateLabel(service_->getCurrState())
+ << "' state";
+
+ // Move the partner state time to long ago. It should cause our server to
+ // assume that the administrator neglected to restart the terminated partner
+ // and transition to the terminated state to continue serving the DHCP
+ // clients.
+ boost::posix_time::ptime long_ago = HttpDateTime().fromRfc1123("Sun, 06 Nov 1994 08:49:37 GMT").getPtime();
+ state->partner_state_time_ = long_ago;
+
+ // Run the heartbeat.
+ waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT);
+
+ // The server should have transitioned to the terminated state.
+ EXPECT_EQ(HA_TERMINATED_ST, service_->getCurrState())
+ << "expected that the server transitions to the 'terminated' state"
+ << "', but transitioned to the '"
+ << service_->getStateLabel(service_->getCurrState())
+ << "' state";
+}
+
// This test checks all combinations of server and partner states and the
// resulting state to which the server transitions. This server is secondary.
// There is another test which validates state transitions from the
using StateType::partner_time_at_skew_;
using StateType::unsent_update_count_;
using StateType::getRejectedLeaseUpdatesCountFromContainer;
+ using StateType::partner_state_time_;
};
/// @brief Type of the NakedCommunicationState for DHCPv4.