- Failover pairs now implement 'MAC Affinity' on leases moving from the

author David Hankins <dhankins@isc.org>

Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)

committer David Hankins <dhankins@isc.org>

Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)
author David Hankins <dhankins@isc.org>
Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)
committer David Hankins <dhankins@isc.org>
Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)
diff --git a/RELNOTES b/RELNOTES

index 5bfc9406eb4807b35866d0e62bdcff45b86cb42a..0f358ca71a174f070df2f89510c628c2854f4aa8 100644 (file)
--- a/RELNOTES
+++ b/RELNOTES
@@ -106,6 +106,19 @@ and for prodding me into improving it.
  - Some patches to improve DHCP Server startup speed from Andrew Matheson
    have been incorporated.
  
+- Failover pairs now implement 'MAC Affinity' on leases moving from the
+  active to free states.  Leases that belonged to the failover secondary
+  are moved to BACKUP state rather than FREE upon exiting EXPIRED state.
+  If lease rebalancing must move leases, it tries first to move leases
+  that belong to the peer in need.
+
+- The server no longer sends POOLREQ messages unless the pool is severely
+  misbalanced in the peer's favor (see 'man dhcpd.conf' for more details).
+
+- Pool rebalance events no longer happen upon successfully allocating a
+  lease.  Instead, they happen on a schedule.  See 'man dhcpd.conf' for the
+  min-balance and max-balance statements for more information.
+
                         Changes since 3.0.4
  
  - A warning that host statements declared within subnet or shared-network
diff --git a/common/conflex.c b/common/conflex.c

index db3de04505e0573cbb41e86d9b680b7c205d77e8..33ffad542ae4e13fe89ae11602a98c4ca78ef0a5 100644 (file)
--- a/common/conflex.c
+++ b/common/conflex.c
@@ -34,7 +34,7 @@
  
  #ifndef lint
  static char copyright[] =
-"$Id: conflex.c,v 1.99 2006/06/06 16:35:18 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
+"$Id: conflex.c,v 1.100 2006/06/16 19:26:44 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
  #endif /* not lint */
  
  #include "dhcpd.h"
@@ -836,8 +836,16 @@ static enum dhcp_token intern (atom, dfv)
                 if (!strncasecmp (atom + 1, "ax", 2)) {
                         if (!atom [3])
                                 return TOKEN_MAX;
-                       if (!strcasecmp (atom + 3, "-lease-time"))
-                               return MAX_LEASE_TIME;
+                       if (!strcasecmp (atom + 3, "-balance"))
+                               return MAX_BALANCE;
+                       if (!strcasecmp (atom + 3, "-lease-")) {
+                               if (!strcasecmp(atom + 10, "misbalance"))
+                                       return MAX_LEASE_MISBALANCE;
+                               if (!strcasecmp(atom + 10, "ownership"))
+                                       return MAX_LEASE_OWNERSHIP;
+                               if (!strcasecmp(atom + 10, "time"))
+                                       return MAX_LEASE_TIME;
+                       }
                         if (!strcasecmp (atom + 3, "-transmit-idle"))
                                 return MAX_TRANSMIT_IDLE;
                         if (!strcasecmp (atom + 3, "-response-delay"))
@@ -846,6 +854,8 @@ static enum dhcp_token intern (atom, dfv)
                                 return MAX_UNACKED_UPDATES;
                 }
                 if (!strncasecmp (atom + 1, "in-", 3)) {
+                       if (!strcasecmp (atom + 4, "balance"))
+                               return MIN_BALANCE;
                         if (!strcasecmp (atom + 4, "lease-time"))
                                 return MIN_LEASE_TIME;
                         if (!strcasecmp (atom + 4, "secs"))
diff --git a/includes/dhcpd.h b/includes/dhcpd.h

index 952edd29b3c074f1640b606cc2a213088f7c04a4..4735be73e58de60bd389dbfe4f92c30ff76b6f29 100644 (file)
--- a/includes/dhcpd.h
+++ b/includes/dhcpd.h
@@ -2666,8 +2666,8 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
                                       enum failover_state);
  isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,
                                                failover_message_t *);
-int dhcp_failover_pool_rebalance (dhcp_failover_state_t *);
-int dhcp_failover_pool_check (struct pool *);
+void dhcp_failover_pool_rebalance (void *);
+void dhcp_failover_pool_check (struct pool *);
  int dhcp_failover_state_pool_check (dhcp_failover_state_t *);
  void dhcp_failover_timeout (void *);
  void dhcp_failover_send_contact (void *);
@@ -2751,6 +2751,7 @@ void dhcp_failover_recover_done (void *);
  void failover_print PROTO ((char *, unsigned *, unsigned, const char *));
  void update_partner PROTO ((struct lease *));
  int load_balance_mine (struct packet *, dhcp_failover_state_t *);
+int peer_wants_lease (struct lease *);
  binding_state_t normal_binding_state_transition_check (struct lease *,
                                                        dhcp_failover_state_t *,
                                                        binding_state_t,
diff --git a/includes/dhctoken.h b/includes/dhctoken.h

index ea332c4b65e77d64bcaf9e6e2a259e8cab585ab9..5de43130758ebe8bc7731dade7024ce600f51d1d 100644 (file)
--- a/includes/dhctoken.h
+++ b/includes/dhctoken.h
@@ -318,7 +318,11 @@ enum dhcp_token {
         SIZE = 622,
         EPOCH = 623,
         DB_TIME_FORMAT = 624,
-       LOCAL = 625
+       LOCAL = 625,
+       MAX_LEASE_MISBALANCE = 626,
+       MAX_LEASE_OWNERSHIP = 627,
+       MAX_BALANCE = 628,
+       MIN_BALANCE = 629
  };
  
  #define is_identifier(x)       ((x) >= FIRST_TOKEN &&  \
diff --git a/includes/failover.h b/includes/failover.h

index 576d6bb2db0a5998232a1cd9dc015c88c9b3a99e..c35f3249538ece429d14179822e290637bf98910 100644 (file)
--- a/includes/failover.h
+++ b/includes/failover.h
@@ -49,6 +49,31 @@ typedef struct {
         u_int8_t *data;
  } failover_option_t;
  
+/* Failover configuration defaults. */
+#ifndef  DEFAULT_MAX_BALANCE_TIME
+# define DEFAULT_MAX_BALANCE_TIME      3600
+#endif
+
+#ifndef  DEFAULT_MIN_BALANCE_TIME
+# define DEFAULT_MIN_BALANCE_TIME      60
+#endif
+
+#ifndef  DEFAULT_MAX_LEASE_MISBALANCE
+# define DEFAULT_MAX_LEASE_MISBALANCE   15
+#endif
+
+#ifndef  DEFAULT_MAX_LEASE_OWNERSHIP
+# define DEFAULT_MAX_LEASE_OWNERSHIP    10
+#endif
+
+#ifndef  DEFAULT_MAX_FLYING_UPDATES
+# define DEFAULT_MAX_FLYING_UPDATES    100
+#endif
+
+#ifndef  DEFAULT_MAX_RESPONSE_DELAY
+# define DEFAULT_MAX_RESPONSE_DELAY    20
+#endif
+
  #define FM_OFFSET(x) (long)(&(((failover_message_t *)0) -> x))
  
  /* All of the below definitions are mandated by draft-ietf-dhc-failover-12.
@@ -313,6 +338,10 @@ typedef struct _dhcp_failover_state {
         u_int8_t *hba;  /* Hash bucket array for load balancing. */
         int load_balance_max_secs;
  
+       unsigned int max_lease_misbalance, max_lease_ownership;
+       u_int32_t max_balance, min_balance;
+       TIME last_balance, sched_balance;
+
         enum service_state service_state;
         const char *nrr;        /* Printable reason why we're in the
                                    not_responding service state (empty
diff --git a/server/confpars.c b/server/confpars.c

index f4969f9da2c71a5018bf981cb454892351f13cb0..06c74cf557a11192df9115cedbab26b3d3bc582c 100644 (file)
--- a/server/confpars.c
+++ b/server/confpars.c
@@ -34,7 +34,7 @@
  
  #ifndef lint
  static char copyright[] =
-"$Id: confpars.c,v 1.156 2006/06/15 17:49:49 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
+"$Id: confpars.c,v 1.157 2006/06/16 19:26:44 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
  #endif /* not lint */
  
  #include "dhcpd.h"
@@ -876,6 +876,22 @@ void parse_failover_peer (cfile, group, type)
                         cp -> port = atoi (val);
                         break;
  
+                     case MAX_LEASE_MISBALANCE:
+                       tp = &peer->max_lease_misbalance;
+                       goto parse_idle;
+
+                     case MAX_LEASE_OWNERSHIP:
+                       tp = &peer->max_lease_ownership;
+                       goto parse_idle;
+
+                     case MAX_BALANCE:
+                       tp = &peer->max_balance;
+                       goto parse_idle;
+
+                     case MIN_BALANCE:
+                       tp = &peer->min_balance;
+                       goto parse_idle;
+
                       case MAX_RESPONSE_DELAY:
                         tp = &cp -> max_response_delay;
                       parse_idle:
@@ -1011,16 +1027,22 @@ void parse_failover_peer (cfile, group, type)
                             "primary failover server must have mclt.");
             }
         }
-       if (!peer -> me.max_flying_updates) {
-               peer -> me.max_flying_updates = 100;
-       }
-       if (!peer -> me.max_response_delay) {
-               peer -> me.max_response_delay = 60;
-       }
  
-       if (type == SHARED_NET_DECL) {
-               group -> shared_network -> failover_peer = peer;
-       }
+       if (!peer->max_lease_misbalance)
+               peer->max_lease_misbalance = DEFAULT_MAX_LEASE_MISBALANCE;
+       if (!peer->max_lease_ownership)
+               peer->max_lease_ownership = DEFAULT_MAX_LEASE_OWNERSHIP;
+       if (!peer->max_balance)
+               peer->max_balance = DEFAULT_MAX_BALANCE_TIME;
+       if (!peer->min_balance)
+               peer->min_balance = DEFAULT_MIN_BALANCE_TIME;
+       if (!peer->me.max_flying_updates)
+               peer->me.max_flying_updates = DEFAULT_MAX_FLYING_UPDATES;
+       if (!peer->me.max_response_delay)
+               peer->me.max_response_delay = DEFAULT_MAX_RESPONSE_DELAY;
+
+       if (type == SHARED_NET_DECL)
+               group->shared_network->failover_peer = peer;
  
         /* Set the initial state. */
         if (peer -> i_am == primary) {
diff --git a/server/dhcp.c b/server/dhcp.c

index 0b89dca32b67ae8f15d6ab5526ffd61e4c9a1a7c..8be04bf43b9d2df3c199353ecb97d4ec340b4bc7 100644 (file)
--- a/server/dhcp.c
+++ b/server/dhcp.c
@@ -34,7 +34,7 @@
  
  #ifndef lint
  static char copyright[] =
-"$Id: dhcp.c,v 1.205 2006/06/15 17:52:06 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
+"$Id: dhcp.c,v 1.206 2006/06/16 19:26:45 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
  #endif /* not lint */
  
  #include "dhcpd.h"
@@ -289,16 +289,13 @@ void dhcpdiscover (packet, ms_nulltp)
         if (lease && lease -> pool && lease -> pool -> failover_peer) {
                 peer = lease -> pool -> failover_peer;
  
-               /* If the lease is ours to allocate, then allocate it. */
-               if (lease_mine_to_reallocate(lease)) {
-                       if (lease->pool && lease->pool->failover_peer)
-                               dhcp_failover_pool_check(lease->pool);
-
-               /* If the lease is active, it belongs to the client.  This
+               /* If the lease is ours to allocate, then allocate it.
+                * If the lease is active, it belongs to the client.  This
                  * is the right lease, if we are to offer one.  We decide
                  * wether or not to offer later on.
                  */
-               } else if (lease->binding_state == FTS_ACTIVE) {
+               if (lease->binding_state == FTS_ACTIVE ||
+                   lease_mine_to_reallocate(lease)) {
                         ; /* This space intentionally left blank. */
  
                 /* Otherwise, we can't let the client have this lease. */
@@ -327,10 +324,6 @@ void dhcpdiscover (packet, ms_nulltp)
                                            packet -> shared_network -> name);
                         return;
                 }
-#if defined (FAILOVER_PROTOCOL)
-               if (lease -> pool && lease -> pool -> failover_peer)
-                       dhcp_failover_pool_check (lease -> pool);
-#endif
         }
  
  #if defined (FAILOVER_PROTOCOL)
diff --git a/server/dhcpd.conf.5 b/server/dhcpd.conf.5

index 4d766724ba1f50d9b12fc6b0ff6250ea157dcbbf..d886d9b42e8cafa1999cd8092420b961f1087a56 100644 (file)
--- a/server/dhcpd.conf.5
+++ b/server/dhcpd.conf.5
@@ -28,7 +28,7 @@
  .\" see ``http://www.vix.com''.   To learn more about Nominum, Inc., see
  .\" ``http://www.nominum.com''.
  .\"
-.\" $Id: dhcpd.conf.5,v 1.72 2006/06/15 17:49:49 dhankins Exp $
+.\" $Id: dhcpd.conf.5,v 1.73 2006/06/16 19:26:45 dhankins Exp $
  .\"
  .TH dhcpd.conf 5
  .SH NAME
@@ -613,6 +613,46 @@ the port number declared in the \fBport\fR statement.
  .RE
  .PP
  The 
+.I max-lease-misbalance
+statement
+.RS 0.25i
+.PP
+.B max-lease-misbalance \fIinteger\fR\fB;\fR
+.PP
+The \fBmax-lease-misbalance\fR statement tells the DHCP server what
+percentage of total free leases (defined as the total number of
+leases in either the FREE or BACKUP states) a peer is allowed to own
+before a rebalance check is made.  Configuring higher values causes
+the server to rebalance less frequently, but permits a larger misbalance
+between the FREE and BACKUP lease pools.  Configuring a lower value
+causes the server to rebalance more frequently, but keeps the pools more
+balanced.  ISC DHCP servers no longer send POOLREQ messages unless the
+misbalance is at least twice this percentage in the peer's favor.  Valid
+values are between 0 and 100.  The default is 15.
+.RE
+.PP
+The
+.I max-lease-ownership
+statement
+.RS 0.25i
+.PP
+.B max-lease-ownership \fIinteger\fR\fB;\fR
+.PP
+The \fBmax-lease-ownership\fR statement tells the DHCP server what
+percentage of total free leases either it or its peer are normally allowed to
+own in excess of balance for the purpose of MAC Address Affinity.  When a
+server undergoes a lease rebalancing operation, it first tries to move as
+many leases as it can to the peer whose previous client was Load-Balanced to
+that peer (as governed by the Load Balance Algorithm, see the \fBsplit\fR
+configuration value).  The \fBmax-lease-ownership\fR value determines the
+maximum percentage of leases either server will hold before giving its
+peer the oldest leases (regardless of the previous client's place in the
+Load Balance algorithm).  Valid values are between 0 and 100, and should
+probably be less than the \fBmax-lease-misbalance\fR value.  Larger values
+will allow servers to retain leases to reallocate to returning clients,
+smaller values promote pool balance.  The default is 10.
+.PP
+The
  .I max-response-delay
  statement
  .RS 0.25i
@@ -629,11 +669,6 @@ constantly making and breaking connections.   This parameter must be
  specified.
  .RE
  .PP
-The 
-.I max-unacked-updates
-statement
-.RS 0.25i
-.PP
  .B max-unacked-updates \fIcount\fR\fB;\fR
  .PP
  The \fBmax-unacked-updates\fR statement tells the DHCP server how
@@ -643,6 +678,33 @@ to say what a good value for this is, but 10 seems to work.   This
  parameter must be specified.
  .RE
  .PP
+The
+.I min-balance
+and
+.I max-balance
+statements
+.RS 0.25i
+.PP
+.B min-balance \fIseconds\fR\fB;\fR
+.B max-balance \fIseconds\fR\fB;\fR
+.PP
+The DHCP Server schedules pool rebalance events at a time between these
+two values, estimated to be when the \fBmax-lease-misbalance\fR percent
+of leases have been allocated by its peer.  This estimate is reached from
+however many seconds have elapsed since the oldest lease in the failover
+peer's pool has been expired.
+.PP
+The \fBmin-balance\fR value defaults to 60, one minute, and the
+\fBmax-balance\fR value defaults to 3600, one hour.
+.PP
+Lease rebalancing events can be CPU intensive, particularly on installations
+where failover peers may have large numbers of pools and addresses to
+examine, so these parameters should be used to keep the estimation of
+the need for pool rebalance sane...not so long that you are in danger of
+exhausting your pool, not so short that your server is constantly
+rebalancing.
+.RE
+.PP
  The 
  .I mclt
  statement
@@ -672,10 +734,14 @@ statement
  The split statement specifies the split between the primary and
  secondary for the purposes of load balancing.   Whenever a client
  makes a DHCP request, the DHCP server runs a hash on the client
-identification.   If the hash comes out to less than the split value,
-the primary answers.   If it comes out to equal to or more than the
-split, the secondary answers.   The only meaningful value is 128, and can
-only be configured on the primary.
+identification, resulting in a value from 0 to 255.  This is used as
+an index into a 256 bit field.  If the bit at that index is set,
+the primary is responsible.  If the bit at that index is not set,
+the secondary is responsible.  The \fBsplit\fR value determines
+how many of the leading bits are set to one.  So, in practice, higher
+split values will cause the primary to serve more clients than the
+secondary.  Lower split values, the converse.  Legal values are between
+0 and 255, of which the most reasonable is 128.
  .RE
  .PP
  The 
@@ -695,10 +761,27 @@ for such fine-grained control, however.   An example hba statement:
        00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00;
  .fi
  .PP
-This is equivalent to a \fBsplit 128;\fR statement.  You must only have
-\fBsplit\fR or \fBhba\fR defined, never both.  For most cases, the
-fine-grained control that \fBhba\fR offers isn't necessary, and \fBsplit\fR
-should be used.  As such, the use of \fBhba\fR is deprecated.
+This is equivalent to a \fBsplit 128;\fR statement, and identical.  The
+following two examples are also equivalent to a \fBsplit\fR of 128, but 
+are not identical:
+.PP
+.nf
+  hba aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:
+      aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa;
+
+  hba 55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:
+      55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:55;
+.fi
+.PP
+They are equivalent, because half the bits are set to 0, half are set to
+1 (0xa and 0x5 are 1010 and 0101 binary respectively) and consequently this
+would roughly divide the clients equally between the servers.  They are not
+identical, because the actual peers this would load balance to each server
+are different for each example.
+.PP
+You must only have \fBsplit\fR or \fBhba\fR defined, never both.  For most
+cases, the fine-grained control that \fBhba\fR offers isn't necessary, and
+\fBsplit\fR should be used.
  .RE
  .PP
  The 
diff --git a/server/failover.c b/server/failover.c

index 7726a844730cfa35d3c1b3544fa36f4299ebf4f4..5dbd6064baa0718b13a120aa70b164644868f88f 100644 (file)
--- a/server/failover.c
+++ b/server/failover.c
@@ -34,7 +34,7 @@
  
  #ifndef lint
  static char copyright[] =
-"$Id: failover.c,v 1.61 2006/05/04 21:14:21 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
+"$Id: failover.c,v 1.62 2006/06/16 19:26:45 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
  #endif /* not lint */
  
  #include "dhcpd.h"
@@ -53,6 +53,12 @@ static isc_result_t failover_message_reference (failover_message_t **,
  static isc_result_t failover_message_dereference (failover_message_t **,
                                                   const char *file, int line);
  
+static void dhcp_failover_pool_reqbalance(dhcp_failover_state_t *state);
+static int dhcp_failover_pool_dobalance(dhcp_failover_state_t *state);
+static INLINE int secondary_not_hoarding(dhcp_failover_state_t *state,
+                                        struct pool *p);
+
+
  void dhcp_failover_startup ()
  {
         dhcp_failover_state_t *state;
@@ -1383,7 +1389,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *o,
                         dhcp_failover_process_update_done (state,
                                                            link -> imsg);
                 } else if (link -> imsg -> type == FTM_POOLREQ) {
-                       dhcp_failover_pool_rebalance (state);
+                       dhcp_failover_pool_reqbalance(state);
                 } else if (link -> imsg -> type == FTM_POOLRESP) {
                         log_info ("pool response: %ld leases",
                                   (unsigned long)
@@ -2211,10 +2217,40 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
         return ISC_R_SUCCESS;
  }
  
-int dhcp_failover_pool_rebalance (dhcp_failover_state_t *state)
+/* Entry from timer. */
+void dhcp_failover_pool_rebalance(void *failover_state)
  {
-       int lts;
+       dhcp_failover_state_t *state;
+
+       state = (dhcp_failover_state_t *)failover_state;
+
+       if (dhcp_failover_pool_dobalance(state))
+               dhcp_failover_send_updates(state);
+}
+
+/* Entry from POOLREQ. */
+static void dhcp_failover_pool_reqbalance(dhcp_failover_state_t *state)
+{
+       int queued;
+
+       queued = dhcp_failover_pool_dobalance(state);
+
+       dhcp_failover_send_poolresp(state, queued);
+
+       if (queued)
+               dhcp_failover_send_updates(state);
+       else
+               log_info("peer %s: Got POOLREQ, answering negatively!  "
+                        "Peer may be out of leases or database inconsistent.",
+                        state->name);
+}
+
+/* Do the meat of the work common to all forms of pool rebalance. */
+static int dhcp_failover_pool_dobalance(dhcp_failover_state_t *state)
+{
+       int lts, total, thresh, hold, pass;
         int leases_queued = 0;
+       int reqsent = 0;
         struct lease *lp = (struct lease *)0;
         struct lease *next = (struct lease *)0;
         struct shared_network *s;
@@ -2223,14 +2259,17 @@ int dhcp_failover_pool_rebalance (dhcp_failover_state_t *state)
         binding_state_t peer_lease_state;
         binding_state_t my_lease_state;
         struct lease **lq;
-       int tenper;
  
-       if (state -> me.state != normal || state -> i_am == secondary)
+       if (state -> me.state != normal)
                 return 0;
  
-       for (s = shared_networks; s; s = s -> next) {
-           for (p = s -> pools; p; p = p -> next) {
-               if (p -> failover_peer != state)
+       state->last_balance = cur_time;
+       cancel_timeout(dhcp_failover_pool_rebalance, state);
+       state->sched_balance = 0;
+
+       for (s = shared_networks ; s ; s = s->next) {
+           for (p = s->pools ; p ; p = p->next) {
+               if (p->failover_peer != state)
                     continue;
  
                 /* Right now we're giving the peer half of the free leases.
@@ -2239,125 +2278,205 @@ int dhcp_failover_pool_rebalance (dhcp_failover_state_t *state)
                    of leases the peer has, will be how many more leases we
                    have than the peer has.   So if we send half that number
                    to the peer, we should be even. */
-               if (p -> failover_peer -> i_am == primary) {
-                       lts = (p -> free_leases - p -> backup_leases) / 2;
+               if (p->failover_peer->i_am == primary) {
+                       lts = (p->free_leases - p->backup_leases) / 2;
                         peer_lease_state = FTS_BACKUP;
                         my_lease_state = FTS_FREE;
-                       lq = &p -> free;
+                       lq = &p->free;
                 } else {
-                       lts = (p -> backup_leases - p -> free_leases) / 2;
+                       lts = (p->backup_leases - p->free_leases) / 2;
                         peer_lease_state = FTS_FREE;
                         my_lease_state = FTS_BACKUP;
-                       lq = &p -> backup;
+                       lq = &p->backup;
+               }
+
+               log_info ("pool %lx %s  total %d  free %d  backup %d  lts %d",
+                       (unsigned long)p,
+                       (p->shared_network ?
+                        p->shared_network->name : ""), p->lease_count,
+                       p->free_leases, p->backup_leases, lts);
+
+               total = p->backup_leases + p->free_leases;
+
+               thresh = ((total * state->max_lease_misbalance) + 50) / 100;
+               hold = ((total * state->max_lease_ownership) + 50) / 100;
+
+               /* If lts is in the negatives (we need leases) more than
+                * negative double the thresh%, panic and send poolreq to
+                * hopefully wake up the peer.
+                */
+               if (!reqsent && (lts < (thresh * -2))) {
+                       dhcp_failover_send_poolreq(state);
+                       reqsent = 1;
                 }
  
-               tenper = (p -> backup_leases + p -> free_leases) / 10;
-               if (tenper == 0)
-                       tenper = 1;
-               if (lts > tenper) {
-                   log_info ("pool %lx %s  total %d  free %d  %s %d  lts %d",
-                         (unsigned long)p,
-                         (p -> shared_network ?
-                          p -> shared_network -> name : ""), p -> lease_count,
-                         p -> free_leases, "backup", p -> backup_leases, lts);
+               /* Do not go through the process unless at least we have
+                * more than thresh% more leases than the peer.
+                */
+               if (lts <= thresh) {
+                       log_info("pool %lx %s: lts <= max-lease-misbalance "
+                                "(%d), pool rebalance event skipped.",
+                                (unsigned long)p,
+                                (p->shared_network ?
+                                 p->shared_network->name : ""), thresh);
+
+                       /* Recalculate next rebalance event timer. */
+                       dhcp_failover_pool_check(p);
+                       continue;
+               }
  
-                   lease_reference (&lp, *lq, MDL);
+               /* In the first pass, try to allocate leases to the
+                * peer which it would normally be responsible for (if
+                * the lease has a hardware address or client-identifier,
+                * and the load-balance-algorithm chooses the peer to
+                * answer that address), up to a hold% excess in the peer's
+                * favor.  In the second pass, just send the oldest (first
+                * on the list) leases up to a hold% excess in our favor.
+                *
+                * This could make for additional pool rebalance
+                * events, but preserving MAC possession should be
+                * worth it.
+                */
+               pass = 0;
+               lease_reference(&lp, *lq, MDL);
  
-                   while (lp && lts) {
-                       /* Remember the next lease in the list. */
+               /* hold may be zero (consider the case where there are 2
+                * leases, both on one server), therefore use >=.
+                */
+               while (lp && (lts >= (pass ? hold : -hold))) {
                         if (next)
-                           lease_dereference (&next, MDL);
-                       if (lp -> next)
-                           lease_reference (&next, lp -> next, MDL);
-
-                       --lts;
-                       ++leases_queued;
-                       lp -> next_binding_state = peer_lease_state;
-                       lp -> tstp = cur_time;
-                       lp -> starts = cur_time;
-
-                       if (!supersede_lease (lp, (struct lease *)0, 0, 1, 0)
-                           || !write_lease (lp))
-                       {
-                           log_info ("can't commit lease %s on giveaway",
-                                     piaddr (lp -> ip_addr));
+                           lease_dereference(&next, MDL);
+                       if (lp->next)
+                           lease_reference(&next, lp->next, MDL);
+
+                       if (pass || peer_wants_lease(lp)) {
+                           --lts;
+                           ++leases_queued;
+                           lp->next_binding_state = peer_lease_state;
+                           lp->tstp = cur_time;
+                           lp->starts = cur_time;
+
+                           if (!supersede_lease(lp, NULL, 0, 1, 0) ||
+                               !write_lease(lp))
+                                   log_error("can't commit lease %s on "
+                                             "giveaway", piaddr(lp->ip_addr));
                         }
  
-                       lease_dereference (&lp, MDL);
+                       lease_dereference(&lp, MDL);
                         if (next)
-                               lease_reference (&lp, next, MDL);
-                   }
-                   if (next)
-                       lease_dereference (&next, MDL);
-                   if (lp)
-                       lease_dereference (&lp, MDL);
-
-               }
-               if (lts > 1) {
-                       log_info ("lease imbalance - lts = %d", lts);
+                               lease_reference(&lp, next, MDL);
+                       else if (!pass) {
+                               pass = 1;
+                               lease_reference(&lp, *lq, MDL);
+                       }
                 }
+
+               if (next)
+                       lease_dereference(&next, MDL);
+               if (lp)
+                       lease_dereference(&lp, MDL);
+
+               if (lts > thresh)
+                       log_error("lease imbalance persists - lts = %d", lts);
+ 
+               /* Recalculate next rebalance event timer. */
+               dhcp_failover_pool_check(p);
             }
         }
-       commit_leases();
-       dhcp_failover_send_poolresp (state, leases_queued);
-       dhcp_failover_send_updates (state);
+
+       if (leases_queued)
+               commit_leases();
+
         return leases_queued;
  }
  
-int dhcp_failover_pool_check (struct pool *pool)
+/* dhcp_failover_pool_check: Called whenever FREE or BACKUP leases change
+ * states, on both servers.  Check the scheduled time to rebalance the pool
+ * and lower it if applicable.
+ */
+void
+dhcp_failover_pool_check(struct pool *pool)
  {
-       int lts;
-       struct lease *lp;
-       int tenper;
+       dhcp_failover_state_t *peer;
+       TIME est1, est2;
  
-       if (!pool -> failover_peer ||
-           pool -> failover_peer -> me.state != normal)
-               return 0;
+       peer = pool->failover_peer;
+
+       if(!peer || peer->me.state != normal)
+               return;
  
-       if (pool -> failover_peer -> i_am == primary)
-               lts = (pool -> backup_leases - pool -> free_leases) / 2;
+       /* Estimate the time left until lease exhaustion.
+        * The first lease on the backup or free lists is also the oldest
+        * lease.  It is reasonable to guess that it will take at least
+        * as much time for a pool to run out of leases, as the present
+        * age of the oldest lease (seconds since it expired).
+        *
+        * Note that this isn't so sane of an assumption if the oldest
+        * lease is a virgin (ends = 0), we wind up sending this against
+        * the max_balance bounds check.
+        */
+       if(pool->free && pool->free->ends < cur_time)
+               est1 = cur_time - pool->free->ends;
         else
-               lts = (pool -> free_leases - pool -> backup_leases) / 2;
-
-       log_info ("pool %lx %s total %d  free %d  backup %d  lts %d",
-                 (unsigned long)pool,
-                 pool -> shared_network ? pool -> shared_network -> name : "",
-                 pool -> lease_count,
-                 pool -> free_leases, pool -> backup_leases, lts);
-
-       tenper = (pool -> backup_leases + pool -> free_leases) / 10;
-       if (tenper == 0)
-               tenper = 1;
-       if (lts > tenper) {
-               /* XXX What about multiple pools? */
-               if (pool -> failover_peer -> i_am == secondary) {
-                       /* Ask the primary to send us leases. */
-                       dhcp_failover_send_poolreq (pool -> failover_peer);
-                       return 1;
-               } else {
-                       /* Figure out how many leases to skip on the backup
-                          list.   We skip the earliest leases on the list
-                          to reduce the chance of trying to steal a lease
-                          that the secondary is about to allocate. */
-                       int i = pool -> backup_leases - lts;
-                       log_info ("Taking %d leases from secondary.", lts);
-                       for (lp = pool -> backup; lp; lp = lp -> next) {
-                               /* Skip to the last leases on the free
-                                  list, because they are less likely
-                                  to already have been allocated. */
-                               if (i)
-                                       --i;
-                               else {
-                                       lp -> desired_binding_state = FTS_FREE;
-                                       dhcp_failover_queue_update (lp, 1);
-                                       --lts;
-                               }
-                       }
-                       if (lts)
-                               log_info ("failed to take %d leases.", lts);
-               }
+               est1 = 0;
+
+       if(pool->backup && pool->backup->ends < cur_time)
+               est2 = cur_time - pool->backup->ends;
+       else
+               est2 = 0;
+
+       /* We don't want to schedule rebalance for when we think we'll run
+        * out of leases, we want to schedule the rebalance for when we think
+        * the disparity will be 'large enough' to warrant action.
+        */
+       est1 = ((est1 * peer->max_lease_misbalance) + 50) / 100;
+       est2 = ((est2 * peer->max_lease_misbalance) + 50) / 100;
+
+       /* Guess when the local system will begin issuing POOLREQ panic
+        * attacks because "max_lease_misbalance*2" has been exceeded.
+        */
+       if(peer->i_am == primary)
+               est1 *= 2;
+       else
+               est2 *= 2;
+
+       /* Select the smallest time. */
+       if(est1 > est2)
+               est1 = est2;
+
+       /* Bounded by the maximum configured value. */
+       if(est1 > peer->max_balance)
+               est1 = peer->max_balance;
+
+       /* Project this time into the future. */
+       est1 += cur_time;
+
+       /* Do not move the time down under the minimum. */
+       est2 = peer->last_balance + peer->min_balance;
+       if(peer->last_balance && (est1 < est2))
+               est1 = est2;
+
+       /* Do not move the time forward, or reset to the same time. */
+       if(peer->sched_balance) {
+               if (est1 >= peer->sched_balance)
+                       return;
+
+               /* We are about to schedule the time down, cancel the
+                * current timeout.
+                */
+               cancel_timeout(dhcp_failover_pool_rebalance, peer);
         }
-       return 0;
+
+       /* The time is different, and lower, use it. */
+       peer->sched_balance = est1;
+
+#if defined(DEBUG_FAILOVER_TIMING)
+       log_info("add_timeout +%d dhcp_failover_pool_rebalance",
+                est1 - cur_time);
+#endif
+       add_timeout(est1, dhcp_failover_pool_rebalance, peer,
+                       (tvref_t)dhcp_failover_state_reference,
+                       (tvunref_t)dhcp_failover_state_dereference);
  }
  
  int dhcp_failover_state_pool_check (dhcp_failover_state_t *state)
@@ -2370,9 +2489,7 @@ int dhcp_failover_state_pool_check (dhcp_failover_state_t *state)
                 for (p = s -> pools; p; p = p -> next) {
                         if (p -> failover_peer != state)
                                 continue;
-                       /* Only need to request rebalance on one pool. */
-                       if (dhcp_failover_pool_check (p))
-                               return 1;
+                       dhcp_failover_pool_check (p);
                 }
         }
         return 0;
@@ -4600,6 +4717,7 @@ isc_result_t dhcp_failover_process_bind_update (dhcp_failover_state_t *state,
         int reason = FTR_MISC_REJECT;
         const char *message;
         int new_binding_state;
+       int send_to_backup = 0;
  
         ia.len = sizeof msg -> assigned_addr;
         memcpy (ia.iabuf, &msg -> assigned_addr, ia.len);
@@ -4784,8 +4902,17 @@ isc_result_t dhcp_failover_process_bind_update (dhcp_failover_state_t *state,
                     new_binding_state == FTS_RELEASED ||
                     new_binding_state == FTS_RESET) {
                         lt -> next_binding_state = FTS_FREE;
-               } else
+
+                       /* Mac address affinity.  Assign the lease to
+                        * BACKUP state if we are the primary and the
+                        * peer is more likely to reallocate this lease
+                        * to a returning client.
+                        */
+                       if (state->i_am == primary)
+                               send_to_backup = peer_wants_lease(lt);
+               } else {
                         lt -> next_binding_state = new_binding_state;
+               }
                 msg -> binding_status = lt -> next_binding_state;
         }
  
@@ -4795,10 +4922,27 @@ isc_result_t dhcp_failover_process_bind_update (dhcp_failover_state_t *state,
                 message = "database update failed";
               bad:
                 dhcp_failover_send_bind_ack (state, msg, reason, message);
+               goto out;
         } else {
                 dhcp_failover_queue_ack (state, msg);
         }
  
+       /* If it is probably wise, assign lease to backup state if the peer
+        * is not already hoarding leases.
+        */
+       if (send_to_backup && secondary_not_hoarding(state, lt->pool)) {
+               lt->next_binding_state = FTS_BACKUP;
+               lt->tstp = cur_time;
+               lt->starts = cur_time;
+
+               if (!supersede_lease(lt, NULL, 0, 1, 0) ||
+                   !write_lease(lt))
+                       log_error("can't commit lease %s for mac addr "
+                                 "affinity", piaddr(lt->ip_addr));
+
+               dhcp_failover_send_updates(state);
+       }
+
        out:
         if (lt)
                 lease_dereference (&lt, MDL);
@@ -4808,6 +4952,34 @@ isc_result_t dhcp_failover_process_bind_update (dhcp_failover_state_t *state,
         return ISC_R_SUCCESS;
  }
  
+/* Decide whether MAC address affinity may hand the secondary one more
+ * lease.  Kept as its own function because the arithmetic is too involved
+ * to read comfortably inside an if statement.
+ *
+ * Only the pool's free_leases and backup_leases counters and the failover
+ * state's max_lease_ownership percentage are consulted.
+ *
+ * Returns: True if the secondary is allowed to get more leases based upon
+ * MAC address affinity.  False otherwise.
+ */
+static INLINE int
+secondary_not_hoarding(dhcp_failover_state_t *state, struct pool *p) {
+       /* Every lease counted as free or backup in this pool. */
+       int idle = p->free_leases + p->backup_leases;
+
+       /* The number of leases one side or the other is allowed to
+        * "hold": max_lease_ownership percent of the idle total, rounded.
+        */
+       int allowance = ((idle * state->max_lease_ownership) + 50) / 100;
+
+       /* Half the difference between the free and backup counts: the
+        * number of leases we would send to the secondary, or, when
+        * negative, the number the secondary would send to us.
+        */
+       int shift = (p->free_leases - p->backup_leases) / 2;
+
+       /* The peer is hoarding once the shift has dropped to the negative
+        * allowance or below; anything above that is acceptable.
+        */
+       if (shift <= -allowance)
+               return 0;
+
+       return 1;
+}
+
  isc_result_t dhcp_failover_process_bind_ack (dhcp_failover_state_t *state,
                                              failover_message_t *msg)
  {
@@ -4864,6 +5036,26 @@ isc_result_t dhcp_failover_process_bind_ack (dhcp_failover_state_t *state,
                         lease->next_binding_state = FTS_FREE;
                 supersede_lease(lease, (struct lease *)0, 0, 0, 0);
                 write_lease(lease);
+
+               /* Lease has returned to FREE state from the
+                * transitional states.  If the lease 'belongs'
+                * to a client that would be served by the
+                * peer, process a binding update now to send
+                * the lease to backup state.
+                */
+               if (state->i_am == primary &&
+                   peer_wants_lease(lease)) {
+                       lease->next_binding_state = FTS_BACKUP;
+                       lease->tstp = cur_time;
+                       lease->starts = cur_time;
+
+                       if (!supersede_lease(lease, NULL, 0, 1, 0) ||
+                           !write_lease(lease))
+                               log_error("can't commit lease %s for "
+                                         "client affinity",
+                                         piaddr(lease->ip_addr));
+               }
+
                 if (state->me.state == normal)
                         commit_leases ();
         } else {
@@ -5242,6 +5434,40 @@ int load_balance_mine (struct packet *packet, dhcp_failover_state_t *state)
                 return !hm;
  }
  
+/* The inverse of load_balance_mine ("load balance theirs").  We can't
+ * use the regular load_balance_mine() and invert it because of the cases
+ * where the answer is unknowable: there might not be an HBA, or the lease
+ * might carry nothing to hash.  In those cases we want to indicate false
+ * here, so that the lease is never assigned to the peer by guesswork.
+ *
+ * The lease is hashed on its client identifier (uid) when it has one,
+ * otherwise on its hardware address.  The resulting bucket is looked up
+ * in the failover state's hash bucket assignment (hba) bitmap.
+ *
+ * Returns: nonzero if the peer is the server that the load balancing
+ * hash selects for the client recorded on this lease, zero otherwise
+ * (including when the lease has no pool, the pool has no failover peer,
+ * there is no HBA, or the lease has no usable client identification).
+ */
+int
+peer_wants_lease(struct lease *lp)
+{
+       dhcp_failover_state_t *state;
+       unsigned char hbaix;
+       int hm;
+
+       if (!lp->pool)
+               return 0;
+
+       state = lp->pool->failover_peer;
+
+       if (!state || !state->hba)
+               return 0;
+
+       if (lp->uid_len) {
+               hbaix = loadb_p_hash (lp->uid, lp->uid_len);
+       } else if (lp->hardware_addr.hlen > 1) {
+               /* Skip the first byte, which is the hardware type, and is
+                * thus not part of the hardware address.  Hashing it too
+                * would select a different bucket than a hash of the
+                * client's hardware address alone.
+                */
+               hbaix = loadb_p_hash (lp->hardware_addr.hbuf + 1,
+                                     lp->hardware_addr.hlen - 1);
+       } else {
+               /* Impossible to categorize into the HBA: there is no
+                * client identifier and no hardware address to hash.
+                */
+               return 0;
+       }
+
+       /* The HBA is a bitmap with one bit per hash bucket: the upper
+        * bits of the bucket number select the byte, the low three bits
+        * select the bit within it.
+        */
+       hm = state->hba[(hbaix >> 3) & 0x1F] & (1 << (hbaix & 0x07));
+
+       /* The sense of the bit is inverted when we are the primary, the
+        * opposite of what the tail of load_balance_mine() returns.
+        */
+       if (state->i_am == primary)
+               return !hm;
+       else
+               return hm;
+}
+
  /* This deals with what to do with bind updates when
     we're in the normal state 
  
diff --git a/server/mdb.c b/server/mdb.c

index 3d10960524178a301a34a369250acd9d8b880f68..86b060a1ec45dad4b651138a2041aaf7b388e089 100644 (file)
--- a/server/mdb.c
+++ b/server/mdb.c
@@ -34,7 +34,7 @@
  
  #ifndef lint
  static char copyright[] =
-"$Id: mdb.c,v 1.80 2006/06/09 15:51:02 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
+"$Id: mdb.c,v 1.81 2006/06/16 19:26:45 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium.  All rights reserved.\n";
  #endif /* not lint */
  
  #include "dhcpd.h"
@@ -875,8 +875,9 @@ int supersede_lease (comp, lease, commit, propogate, pimmediate)
         int enter_hwaddr = 0;
         struct lease *lp, **lq, *prev;
         TIME lp_next_state;
-
  #if defined (FAILOVER_PROTOCOL)
+       int do_pool_check = 0;
+
         /* We must commit leases before sending updates regarding them
            to failover peers.  It is, therefore, an error to set pimmediate
            and not commit. */
@@ -1070,6 +1071,10 @@ int supersede_lease (comp, lease, commit, propogate, pimmediate)
                 lq = &comp -> pool -> free;
                 if (!(comp->flags & RESERVED_LEASE))
                         comp->pool->free_leases--;
+
+#if defined(FAILOVER_PROTOCOL)
+               do_pool_check = 1;
+#endif
                 break;
  
               case FTS_ACTIVE:
@@ -1090,6 +1095,10 @@ int supersede_lease (comp, lease, commit, propogate, pimmediate)
                 lq = &comp -> pool -> backup;
                 if (!(comp->flags & RESERVED_LEASE))
                         comp->pool->backup_leases--;
+
+#if defined(FAILOVER_PROTOCOL)
+               do_pool_check = 1;
+#endif
                 break;
  
               default:
@@ -1180,6 +1189,8 @@ int supersede_lease (comp, lease, commit, propogate, pimmediate)
                 if (!dhcp_failover_queue_update (comp, pimmediate))
                         return 0;
         }
+       if (do_pool_check && comp->pool->failover_peer)
+               dhcp_failover_pool_check(comp->pool);
  #endif
  
         /* If the current binding state has already expired, do an
author	David Hankins <dhankins@isc.org>
	Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)
committer	David Hankins <dhankins@isc.org>
	Fri, 16 Jun 2006 19:26:45 +0000 (19:26 +0000)
RELNOTES		patch \| blob \| blame \| history
common/conflex.c		patch \| blob \| blame \| history
includes/dhcpd.h		patch \| blob \| blame \| history
includes/dhctoken.h		patch \| blob \| blame \| history
includes/failover.h		patch \| blob \| blame \| history
server/confpars.c		patch \| blob \| blame \| history
server/dhcp.c		patch \| blob \| blame \| history
server/dhcpd.conf.5		patch \| blob \| blame \| history
server/failover.c		patch \| blob \| blame \| history
server/mdb.c		patch \| blob \| blame \| history