From: Marcin Siodelski Date: Fri, 16 Sep 2022 08:46:31 +0000 (+0200) Subject: [#2408] HA service tracks rejected leases X-Git-Tag: Kea-2.3.1~47 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=89d2727478e5b7990fae2b849f8490fa8bc489dd;p=thirdparty%2Fkea.git [#2408] HA service tracks rejected leases --- diff --git a/src/hooks/dhcp/high_availability/ha_messages.cc b/src/hooks/dhcp/high_availability/ha_messages.cc index 2e7872d8f2..b106917fb8 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.cc +++ b/src/hooks/dhcp/high_availability/ha_messages.cc @@ -67,6 +67,7 @@ extern const isc::log::MessageID HA_LEASE_SYNC_STALE_LEASE6_SKIP = "HA_LEASE_SYN extern const isc::log::MessageID HA_LEASE_UPDATES_DISABLED = "HA_LEASE_UPDATES_DISABLED"; extern const isc::log::MessageID HA_LEASE_UPDATES_ENABLED = "HA_LEASE_UPDATES_ENABLED"; extern const isc::log::MessageID HA_LEASE_UPDATE_COMMUNICATIONS_FAILED = "HA_LEASE_UPDATE_COMMUNICATIONS_FAILED"; +extern const isc::log::MessageID HA_LEASE_UPDATE_CONFLICT = "HA_LEASE_UPDATE_CONFLICT"; extern const isc::log::MessageID HA_LEASE_UPDATE_CREATE_UPDATE_FAILED_ON_PEER = "HA_LEASE_UPDATE_CREATE_UPDATE_FAILED_ON_PEER"; extern const isc::log::MessageID HA_LEASE_UPDATE_DELETE_FAILED_ON_PEER = "HA_LEASE_UPDATE_DELETE_FAILED_ON_PEER"; extern const isc::log::MessageID HA_LEASE_UPDATE_FAILED = "HA_LEASE_UPDATE_FAILED"; @@ -173,6 +174,7 @@ const char* values[] = { "HA_LEASE_UPDATES_DISABLED", "lease updates will not be sent to the partner while in %1 state", "HA_LEASE_UPDATES_ENABLED", "lease updates will be sent to the partner while in %1 state", "HA_LEASE_UPDATE_COMMUNICATIONS_FAILED", "%1: failed to communicate with %2: %3", + "HA_LEASE_UPDATE_CONFLICT", "%1: lease update to %2 returned conflict status code: %3", "HA_LEASE_UPDATE_CREATE_UPDATE_FAILED_ON_PEER", "%1: failed to create or update the lease having type %2 for address %3, reason: %4", "HA_LEASE_UPDATE_DELETE_FAILED_ON_PEER", "%1: failed to delete the lease having type %2 for address %3, reason: %4", "HA_LEASE_UPDATE_FAILED", "%1: lease update to %2 failed: %3", diff --git a/src/hooks/dhcp/high_availability/ha_messages.h b/src/hooks/dhcp/high_availability/ha_messages.h index 4798d31bb1..310fe28227 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.h +++ b/src/hooks/dhcp/high_availability/ha_messages.h @@ -68,6 +68,7 @@ extern const isc::log::MessageID HA_LEASE_SYNC_STALE_LEASE6_SKIP; extern const isc::log::MessageID HA_LEASE_UPDATES_DISABLED; extern const isc::log::MessageID HA_LEASE_UPDATES_ENABLED; extern const isc::log::MessageID HA_LEASE_UPDATE_COMMUNICATIONS_FAILED; +extern const isc::log::MessageID HA_LEASE_UPDATE_CONFLICT; extern const isc::log::MessageID HA_LEASE_UPDATE_CREATE_UPDATE_FAILED_ON_PEER; extern const isc::log::MessageID HA_LEASE_UPDATE_DELETE_FAILED_ON_PEER; extern const isc::log::MessageID HA_LEASE_UPDATE_FAILED; diff --git a/src/hooks/dhcp/high_availability/ha_messages.mes b/src/hooks/dhcp/high_availability/ha_messages.mes index 79ad89e75b..6247df6508 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.mes +++ b/src/hooks/dhcp/high_availability/ha_messages.mes @@ -374,6 +374,13 @@ This warning message indicates that there was a problem in communication with a HA peer while processing a DHCP client query and sending lease update. The client's DHCP message will be dropped. +% HA_LEASE_UPDATE_CONFLICT %1: lease update to %2 returned conflict status code: %3 +This warning message indicates that partner returned a conflict status code +in response to a lease update. The client's DHCP message will be dropped. +If the server is configured to track conflicting lease updates, it may +eventually transition to the terminated state when the configured threshold +is exceeded. + % HA_LEASE_UPDATE_CREATE_UPDATE_FAILED_ON_PEER %1: failed to create or update the lease having type %2 for address %3, reason: %4 This informational message is issued when one of the leases failed to be created or updated on the HA peer while processing the lease updates sent diff --git a/src/hooks/dhcp/high_availability/ha_service.cc b/src/hooks/dhcp/high_availability/ha_service.cc index f844b33b04..c12f04704d 100644 --- a/src/hooks/dhcp/high_availability/ha_service.cc +++ b/src/hooks/dhcp/high_availability/ha_service.cc @@ -49,6 +49,13 @@ public: CtrlChannelError(file, line, what) {} }; +/// @brief Exception thrown when conflict status code has been returned. +class ConflictError : public CtrlChannelError { +public: + ConflictError(const char* file, size_t line, const char* what) : + CtrlChannelError(file, line, what) {} +}; + } namespace isc { @@ -474,6 +481,7 @@ HAService::partnerDownStateHandler() { query_filter_.serveDefaultScopes(); } adjustNetworkState(); + communication_state_->clearRejectedLeases(); // Log if the state machine is paused. conditionalLogPausedState(); @@ -608,6 +616,7 @@ HAService::readyStateHandler() { if (doOnEntry()) { query_filter_.serveNoScopes(); adjustNetworkState(); + communication_state_->clearRejectedLeases(); // Log if the state machine is paused. conditionalLogPausedState(); @@ -687,6 +696,7 @@ HAService::syncingStateHandler() { if (doOnEntry()) { query_filter_.serveNoScopes(); adjustNetworkState(); + communication_state_->clearRejectedLeases(); // Log if the state machine is paused. conditionalLogPausedState(); @@ -776,6 +786,7 @@ HAService::terminatedStateHandler() { if (doOnEntry()) { query_filter_.serveDefaultScopes(); adjustNetworkState(); + communication_state_->clearRejectedLeases(); // In the terminated state we don't send heartbeat. communication_state_->stopHeartbeat(); @@ -797,6 +808,7 @@ HAService::waitingStateHandler() { if (doOnEntry()) { query_filter_.serveNoScopes(); adjustNetworkState(); + communication_state_->clearRejectedLeases(); // Log if the state machine is paused. conditionalLogPausedState(); @@ -1101,6 +1113,11 @@ HAService::shouldTerminate() const { // If not issue a warning if it's getting large. if (!should_terminate) { communication_state_->clockSkewShouldWarn(); + // Check if we should terminate because the number of rejected leases + // has been exceeded. + should_terminate = + config_->getMaxRejectedClients() && + (config_->getMaxRejectedClients() <= communication_state_->getRejectedLeasesCount()); } return (should_terminate); @@ -1363,13 +1380,16 @@ HAService::asyncSendLeaseUpdate(const QueryPtrType& query, " HA peer. This is programmatic error"); } - // There are three possible groups of errors during the lease update. + // There are four possible groups of errors during the lease update. // One is the IO error causing issues in communication with the peer. - // Another one is an HTTP parsing error. The last type of error is - // when non-success error code is returned in the response carried - // in the HTTP message or if the JSON response is otherwise broken. + // Another one is an HTTP parsing error. The third type occurs when + // the partner receives the command but it is invalid or there is + // an internal processing error. Finally, the forth type is when the + // conflict status code is returned in the response indicating that + // the lease update does not match the partner's configuration. bool lease_update_success = true; + bool lease_update_conflict = false; // Handle first two groups of errors. if (ec || !error_str.empty()) { @@ -1384,7 +1404,6 @@ HAService::asyncSendLeaseUpdate(const QueryPtrType& query, } else { - // Handle third group of errors. try { int rcode = 0; auto args = verifyAsyncResponse(response, rcode); @@ -1392,7 +1411,19 @@ HAService::asyncSendLeaseUpdate(const QueryPtrType& query, // updates and we should log them. logFailedLeaseUpdates(query, args); + } catch (const ConflictError& ex) { + // Handle forth group of errors. + lease_update_conflict = true; + lease_update_success = false; + communication_state_->reportRejectedLease(query); + + LOG_WARN(ha_logger, HA_LEASE_UPDATE_CONFLICT) + .arg(query->getLabel()) + .arg(config->getLogLabel()) + .arg(ex.what()); + } catch (const std::exception& ex) { + // Handle third group of errors. LOG_WARN(ha_logger, HA_LEASE_UPDATE_FAILED) .arg(query->getLabel()) .arg(config->getLogLabel()) @@ -1405,7 +1436,7 @@ HAService::asyncSendLeaseUpdate(const QueryPtrType& query, // We don't care about the result of the lease update to the backup server. // It is a best effort update. - if ((config->getRole() != HAConfig::PeerConfig::BACKUP) && !lease_update_success) { + if ((config->getRole() != HAConfig::PeerConfig::BACKUP) && !lease_update_conflict && !lease_update_success) { // If we were unable to communicate with the partner we set partner's // state as unavailable. communication_state_->setPartnerState("unavailable"); @@ -2956,9 +2987,12 @@ HAService::verifyAsyncResponse(const HttpResponsePtr& response, int& rcode) { // Include an error code. s << "error code " << rcode; - if (rcode == CONTROL_RESULT_COMMAND_UNSUPPORTED) { + switch (rcode) { + case CONTROL_RESULT_COMMAND_UNSUPPORTED: isc_throw(CommandUnsupportedError, s.str()); - } else { + case CONTROL_RESULT_CONFLICT: + isc_throw(ConflictError, s.str()); + default: isc_throw(CtrlChannelError, s.str()); } } diff --git a/src/hooks/dhcp/high_availability/ha_service.h b/src/hooks/dhcp/high_availability/ha_service.h index 288a03671f..dbce0bf133 100644 --- a/src/hooks/dhcp/high_availability/ha_service.h +++ b/src/hooks/dhcp/high_availability/ha_service.h @@ -470,13 +470,20 @@ protected: bool shouldPartnerDown() const; /// @brief Indicates if the server should transition to the terminated - /// state as a result of high clock skew. + /// state. + /// + /// There are two reasons for the server to transition to the terminated + /// state. First, when the clock skew being too high. Second, when the + /// server monitors rejected lease updates and the maximum configured + /// rejected updates have been exceeded. + /// + /// If the clock skew is is higher than 30 seconds but lower than 60 + /// seconds this method only logs a warning. In case, the clock skew + /// exceeds 60 seconds, this method logs a warning and returns true. /// - /// It indicates that the server should transition to the terminated - /// state because of the clock skew being too high. If the clock skew is - /// is higher than 30 seconds but lower than 60 seconds this method - /// only logs a warning. In case, the clock skew exceeds 60 seconds, this - /// method logs a warning and returns true. + /// If the clock skew is acceptable the function can cause the transition + /// to the terminated state when the number of recorded rejected lease + /// updates exceeded the configured threshold. /// /// @return true if the server should transition to the terminated state, /// false otherwise. diff --git a/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc b/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc index 5f38e899c5..70339db57b 100644 --- a/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc +++ b/src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc @@ -893,6 +893,8 @@ public: if (should_fail) { EXPECT_EQ(HA_UNAVAILABLE_ST, state->getPartnerState()); + } else { + EXPECT_NE(state->getPartnerState(), HA_UNAVAILABLE_ST); } } @@ -1320,7 +1322,7 @@ public: /// @brief Tests scenarios when one of the servers to which a /// lease update is sent returns an error. - void testSndUpdatesControlResultError() { + void testSendUpdatesControlResultError() { // Instruct the server 2 to return an error as a result of receiving a command. factory2_->getResponseCreator()->setControlResult(CONTROL_RESULT_ERROR); @@ -1370,7 +1372,7 @@ public: /// @brief Tests scenarios when one of the servers to which a /// lease update is sent does not authorize the local server. - void testSndUpdatesControlResultUnauthorized() { + void testSendUpdatesControlResultUnauthorized() { // Instruct the server 2 to require authentication. factory2_->getResponseCreator()->addBasicAuth("foo", "bar"); @@ -1449,6 +1451,30 @@ public: EXPECT_FALSE(delete_request3); } + /// @brief Tests scenarios when one of the servers to which a + /// lease update is sent returns an error. + void testSendUpdatesControlResultConflict() { + // Instruct the server 2 to return an error as a result of receiving a command. + factory2_->getResponseCreator()->setControlResult(CONTROL_RESULT_CONFLICT); + factory3_->getResponseCreator()->setControlResult(CONTROL_RESULT_CONFLICT); + + // Start HTTP servers. + ASSERT_NO_THROW({ + listener_->start(); + listener2_->start(); + listener3_->start(); + }); + + testSendLeaseUpdates([] { + ADD_FAILURE() << "unpark function called but expected that " + "the packet is dropped"; + }, false, 1); + + // Ensure that the server has recorded a lease update conflict. The conflict + // reported by the backup server should not count. + EXPECT_EQ(1, service_->communication_state_->getRejectedLeasesCount()); + } + /// @brief Tests scenarios when all lease updates are sent successfully. void testSendSuccessfulUpdates6() { // Start HTTP servers. @@ -2273,27 +2299,40 @@ TEST_F(HAServiceTest, sendUpdatesBackupServerOfflineMultiThreading) { // Test scenario when one of the servers to which a lease update is sent // returns an error. TEST_F(HAServiceTest, sendUpdatesControlResultError) { - testSndUpdatesControlResultError(); + testSendUpdatesControlResultError(); } // Test scenario when one of the servers to which a lease update is sent // returns an error. TEST_F(HAServiceTest, sendUpdatesControlResultErrorMultiThreading) { MultiThreadingMgr::instance().setMode(true); - testSndUpdatesControlResultError(); + testSendUpdatesControlResultError(); } // Test scenario when one of the servers to which a lease update is sent // requires not provided authentication. TEST_F(HAServiceTest, sendUpdatesControlResultUnauthorized) { - testSndUpdatesControlResultUnauthorized(); + testSendUpdatesControlResultUnauthorized(); } // Test scenario when one of the servers to which a lease update is sent // requires not provided authentication. TEST_F(HAServiceTest, sendUpdatesControlResultUnauthorizedMultiThreading) { MultiThreadingMgr::instance().setMode(true); - testSndUpdatesControlResultUnauthorized(); + testSendUpdatesControlResultUnauthorized(); +} + +// Test scenario when one of the servers to which a lease update is sent +// returns a conflict status code. +TEST_F(HAServiceTest, sendUpdatesControlResultConflict) { + testSendUpdatesControlResultConflict(); +} + +// Test scenario when one of the servers to which a lease update is sent +// returns a conflict status code. +TEST_F(HAServiceTest, sendUpdatesControlResultConflictMultiThreading) { + MultiThreadingMgr::instance().setMode(true); + testSendUpdatesControlResultConflict(); } // Test scenario when all lease updates are sent successfully. @@ -5108,6 +5147,25 @@ public: service_->runModel(HAService::NOP_EVT); } + /// @brief Simulates rejected lease updates by the partner. + /// + /// Too many rejected lease updates should transition the server to the + /// terminated state. + /// @param leases_num number of simulated rejected leases. + void simulateRejectedLeaseUpdates(const unsigned leases_num = 100) { + // If the state machine is about to transition to another state, + // let's make sure it performs this transition. + service_->runModel(HAService::NOP_EVT); + // Simulate the rejected lease updates. + for (auto i = 0; i < leases_num; ++i) { + // Create query with random HW address. + Pkt4Ptr query4 = createQuery4(randomKey(HWAddr::ETHERNET_HWADDR_LEN)); + static_cast(state_->reportRejectedLease(query4)); + } + // The state machine needs to react to the rejected leases. + service_->runModel(HAService::NOP_EVT); + } + /// @brief Checks transitions dependent on the partner's state. /// /// This method uses @c partner_ object to control the state of the partner. @@ -5360,6 +5418,18 @@ public: return (isDoingHeartbeat()); } + /// @brief Transitions the server to the specified state and checks that it + /// clears the rejected leases counter. + /// + /// @param my_state this server's state + /// @return true if the rejected leases counter has been cleared. + bool expectClearRejectedLeases(const MyState& my_state) { + simulateRejectedLeaseUpdates(1); + service_->verboseTransition(my_state.state_); + service_->runModel(TestHAService::NOP_EVT); + return (state_->getRejectedLeasesCount() == 0); + } + /// @brief Pointer to the communication state used in the tests. NakedCommunicationState4Ptr state_; /// @brief Pointer to the partner used in some tests. @@ -7696,4 +7766,58 @@ TEST_F(HAServiceStateMachineTest, shouldSendLeaseUpdatesPassiveBackup) { EXPECT_TRUE(expectLeaseUpdates(MyState(HA_WAITING_ST), peer_config)); } +// Verifies that the server transitions to the terminated state as a result +// of too many rejected leases. +TEST_F(HAServiceStateMachineTest, shouldTerminateDueToRejectedLeases) { + startService(createValidConfiguration()); + service_->verboseTransition(HA_LOAD_BALANCING_ST); + service_->runModel(HAService::NOP_EVT); + + // Simulate many rejected leases. It should cause the server to transition + // to the terminated state. + simulateRejectedLeaseUpdates(100); + EXPECT_EQ(HA_TERMINATED_ST, service_->getCurrState()); +} + +// Verifies that rejected leases are cleared upon entering certain states and +// are not cleared upon entering other states in the load balancing mode. +TEST_F(HAServiceStateMachineTest, clearRejectedLeaseUpdatesLoadBalancing) { + startService(createValidConfiguration()); + + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_COMMUNICATION_RECOVERY_ST))); + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_LOAD_BALANCING_ST))); + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_IN_MAINTENANCE_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_PARTNER_DOWN_ST))); + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_PARTNER_IN_MAINTENANCE_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_READY_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_TERMINATED_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_WAITING_ST))); +} + +// Verifies that rejected leases are cleared upon entering certain states and +// are not cleared upon entering other states in the hot standby mode. +TEST_F(HAServiceStateMachineTest, clearRejectedLeaseUpdatesHotStandby) { + HAConfigPtr config_storage = createValidConfiguration(HAConfig::HOT_STANDBY); + config_storage->getPeerConfig("server2")->setRole("standby"); + startService(config_storage); + + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_HOT_STANDBY_ST))); + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_IN_MAINTENANCE_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_PARTNER_DOWN_ST))); + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_PARTNER_IN_MAINTENANCE_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_READY_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_TERMINATED_ST))); + EXPECT_TRUE(expectClearRejectedLeases(MyState(HA_WAITING_ST))); +} + +// Verifies that rejected leases are cleared upon entering certain states and +// are not cleared upon entering other states in the passive backup mode. +TEST_F(HAServiceStateMachineTest, clearRejectedLeaseUpdatesPassiveBackup) { + HAConfigPtr config_storage = createValidPassiveBackupConfiguration(); + config_storage->getPeerConfig("server2")->setRole("backup"); + startService(config_storage); + + EXPECT_FALSE(expectClearRejectedLeases(MyState(HA_PASSIVE_BACKUP_ST))); +} + } // end of anonymous namespace diff --git a/src/hooks/dhcp/high_availability/tests/ha_test.cc b/src/hooks/dhcp/high_availability/tests/ha_test.cc index 7dd35bbe33..8869297820 100644 --- a/src/hooks/dhcp/high_availability/tests/ha_test.cc +++ b/src/hooks/dhcp/high_availability/tests/ha_test.cc @@ -155,6 +155,7 @@ HATest::createValidJsonConfiguration(const HAConfig::HAMode& ha_mode) const { " \"max-response-delay\": 1000," " \"max-ack-delay\": 10000," " \"max-unacked-clients\": 10," + " \"max-rejected-clients\": 10," " \"wait-backup-ack\": false," " \"peers\": [" " {"