.\" see ``http://www.vix.com''. To learn more about Nominum, Inc., see
.\" ``http://www.nominum.com''.
.\"
-.\" $Id: dhcpd.conf.5,v 1.72 2006/06/15 17:49:49 dhankins Exp $
+.\" $Id: dhcpd.conf.5,v 1.73 2006/06/16 19:26:45 dhankins Exp $
.\"
.TH dhcpd.conf 5
.SH NAME
.RE
.PP
The
+.I max-lease-misbalance
+statement
+.RS 0.25i
+.PP
+.B max-lease-misbalance \fIinteger\fR\fB;\fR
+.PP
+The \fBmax-lease-misbalance\fR statement tells the DHCP server what
+percentage of total free leases (defined as the total number of
+leases in either the FREE or BACKUP states) a peer is allowed to own
+before a rebalance check is made. Configuring higher values causes
+the server to rebalance less frequently, but permits a larger misbalance
+between the FREE and BACKUP lease pools. Configuring a lower value
+causes the server to rebalance more frequently, but keeps the pools more
+balanced. ISC DHCP servers no longer send POOLREQ messages unless the
+misbalance is at least twice this percentage in the peer's favor. Valid
+values are between 0 and 100. The default is 15.
+.RE
+.PP
+The
+.I max-lease-ownership
+statement
+.RS 0.25i
+.PP
+.B max-lease-ownership \fIinteger\fR\fB;\fR
+.PP
+The \fBmax-lease-ownership\fR statement tells the DHCP server what
+percentage of total free leases either it or its peer is normally allowed to
+own in excess of balance for the purpose of MAC Address Affinity. When a
+server undergoes a lease rebalancing operation, it first tries to move as
+many leases as it can to the peer whose previous client was Load-Balanced to
+that peer (as governed by the Load Balance Algorithm, see the \fBsplit\fR
+configuration value). The \fBmax-lease-ownership\fR value determines the
+maximum percentage of leases either server will hold before giving its
+peer the oldest leases (regardless of the previous client's place in the
+Load Balance algorithm). Valid values are between 0 and 100, and should
+probably be less than the \fBmax-lease-misbalance\fR value. Larger values
+will allow servers to retain leases to reallocate to returning clients;
+smaller values promote pool balance. The default is 10.
+.PP
+The
.I max-response-delay
statement
.RS 0.25i
specified.
.RE
.PP
-The
-.I max-unacked-updates
-statement
-.RS 0.25i
-.PP
.B max-unacked-updates \fIcount\fR\fB;\fR
.PP
The \fBmax-unacked-updates\fR statement tells the DHCP server how
parameter must be specified.
.RE
.PP
+The
+.I min-balance
+and
+.I max-balance
+statements
+.RS 0.25i
+.PP
+.B min-balance \fIseconds\fR\fB;\fR
+.B max-balance \fIseconds\fR\fB;\fR
+.PP
+The DHCP Server schedules pool rebalance events at a time between these
+two values, estimated to be when the \fBmax-lease-misbalance\fR percent
+of leases have been allocated by its peer. This estimate is reached from
+however many seconds have elapsed since the oldest lease in the failover
+peer's pool expired.
+.PP
+The \fBmin-balance\fR value defaults to 60, one minute, and the
+\fBmax-balance\fR value defaults to 3600, one hour.
+.PP
+Lease rebalancing events can be CPU intensive, particularly on installations
+where failover peers may have large numbers of pools and addresses to
+examine, so these parameters should be used to keep the estimation of
+the need for pool rebalance sane...not so long that you are in danger of
+exhausting your pool, not so short that your server is constantly
+rebalancing.
+.RE
+.PP
The
.I mclt
statement
The split statement specifies the split between the primary and
secondary for the purposes of load balancing. Whenever a client
makes a DHCP request, the DHCP server runs a hash on the client
-identification. If the hash comes out to less than the split value,
-the primary answers. If it comes out to equal to or more than the
-split, the secondary answers. The only meaningful value is 128, and can
-only be configured on the primary.
+identification, resulting in a value from 0 to 255. This is used as
+an index into a 256-bit field. If the bit at that index is set,
+the primary is responsible. If the bit at that index is not set,
+the secondary is responsible. The \fBsplit\fR value determines
+how many of the leading bits are set to one. So, in practice, higher
+split values will cause the primary to serve more clients than the
+secondary. Lower split values, the converse. Legal values are between
+0 and 255, of which the most reasonable is 128.
.RE
.PP
The
00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00;
.fi
.PP
-This is equivalent to a \fBsplit 128;\fR statement. You must only have
-\fBsplit\fR or \fBhba\fR defined, never both. For most cases, the
-fine-grained control that \fBhba\fR offers isn't necessary, and \fBsplit\fR
-should be used. As such, the use of \fBhba\fR is deprecated.
+This is both equivalent and identical to a \fBsplit 128;\fR statement. The
+following two examples are also equivalent to a \fBsplit\fR of 128, but
+are not identical:
+.PP
+.nf
+ hba aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:
+ aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa:aa;
+
+ hba 55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:
+ 55:55:55:55:55:55:55:55:55:55:55:55:55:55:55:55;
+.fi
+.PP
+They are equivalent, because half the bits are set to 0, half are set to
+1 (0xa and 0x5 are 1010 and 0101 binary respectively) and consequently this
+would roughly divide the clients equally between the servers. They are not
+identical, because the actual peers this would load balance to each server
+are different for each example.
+.PP
+You must only have \fBsplit\fR or \fBhba\fR defined, never both. For most
+cases, the fine-grained control that \fBhba\fR offers isn't necessary, and
+\fBsplit\fR should be used.
.RE
.PP
The
#ifndef lint
static char copyright[] =
-"$Id: failover.c,v 1.61 2006/05/04 21:14:21 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium. All rights reserved.\n";
+"$Id: failover.c,v 1.62 2006/06/16 19:26:45 dhankins Exp $ Copyright (c) 2004-2006 Internet Systems Consortium. All rights reserved.\n";
#endif /* not lint */
#include "dhcpd.h"
static isc_result_t failover_message_dereference (failover_message_t **,
const char *file, int line);
+static void dhcp_failover_pool_reqbalance(dhcp_failover_state_t *state);
+static int dhcp_failover_pool_dobalance(dhcp_failover_state_t *state);
+static INLINE int secondary_not_hoarding(dhcp_failover_state_t *state,
+ struct pool *p);
+
+
void dhcp_failover_startup ()
{
dhcp_failover_state_t *state;
dhcp_failover_process_update_done (state,
link -> imsg);
} else if (link -> imsg -> type == FTM_POOLREQ) {
- dhcp_failover_pool_rebalance (state);
+ dhcp_failover_pool_reqbalance(state);
} else if (link -> imsg -> type == FTM_POOLRESP) {
log_info ("pool response: %ld leases",
(unsigned long)
return ISC_R_SUCCESS;
}
-int dhcp_failover_pool_rebalance (dhcp_failover_state_t *state)
+/* Entry from timer. */
+void dhcp_failover_pool_rebalance(void *failover_state)
{
- int lts;
+ dhcp_failover_state_t *state;
+
+ state = (dhcp_failover_state_t *)failover_state;
+
+ if (dhcp_failover_pool_dobalance(state))
+ dhcp_failover_send_updates(state);
+}
+
+/* Entry from POOLREQ. */
+static void dhcp_failover_pool_reqbalance(dhcp_failover_state_t *state)
+{
+ int queued;
+
+ queued = dhcp_failover_pool_dobalance(state);
+
+ dhcp_failover_send_poolresp(state, queued);
+
+ if (queued)
+ dhcp_failover_send_updates(state);
+ else
+ log_info("peer %s: Got POOLREQ, answering negatively! "
+ "Peer may be out of leases or database inconsistent.",
+ state->name);
+}
+
+/* Do the meat of the work common to all forms of pool rebalance. */
+static int dhcp_failover_pool_dobalance(dhcp_failover_state_t *state)
+{
+ int lts, total, thresh, hold, pass;
int leases_queued = 0;
+ int reqsent = 0;
struct lease *lp = (struct lease *)0;
struct lease *next = (struct lease *)0;
struct shared_network *s;
binding_state_t peer_lease_state;
binding_state_t my_lease_state;
struct lease **lq;
- int tenper;
- if (state -> me.state != normal || state -> i_am == secondary)
+ if (state -> me.state != normal)
return 0;
- for (s = shared_networks; s; s = s -> next) {
- for (p = s -> pools; p; p = p -> next) {
- if (p -> failover_peer != state)
+ state->last_balance = cur_time;
+ cancel_timeout(dhcp_failover_pool_rebalance, state);
+ state->sched_balance = 0;
+
+ for (s = shared_networks ; s ; s = s->next) {
+ for (p = s->pools ; p ; p = p->next) {
+ if (p->failover_peer != state)
continue;
/* Right now we're giving the peer half of the free leases.
of leases the peer has, will be how many more leases we
have than the peer has. So if we send half that number
to the peer, we should be even. */
- if (p -> failover_peer -> i_am == primary) {
- lts = (p -> free_leases - p -> backup_leases) / 2;
+ if (p->failover_peer->i_am == primary) {
+ lts = (p->free_leases - p->backup_leases) / 2;
peer_lease_state = FTS_BACKUP;
my_lease_state = FTS_FREE;
- lq = &p -> free;
+ lq = &p->free;
} else {
- lts = (p -> backup_leases - p -> free_leases) / 2;
+ lts = (p->backup_leases - p->free_leases) / 2;
peer_lease_state = FTS_FREE;
my_lease_state = FTS_BACKUP;
- lq = &p -> backup;
+ lq = &p->backup;
+ }
+
+ log_info ("pool %lx %s total %d free %d backup %d lts %d",
+ (unsigned long)p,
+ (p->shared_network ?
+ p->shared_network->name : ""), p->lease_count,
+ p->free_leases, p->backup_leases, lts);
+
+ total = p->backup_leases + p->free_leases;
+
+ thresh = ((total * state->max_lease_misbalance) + 50) / 100;
+ hold = ((total * state->max_lease_ownership) + 50) / 100;
+
+ /* If lts is in the negatives (we need leases) more than
+ * negative double the thresh%, panic and send poolreq to
+ * hopefully wake up the peer.
+ */
+ if (!reqsent && (lts < (thresh * -2))) {
+ dhcp_failover_send_poolreq(state);
+ reqsent = 1;
}
- tenper = (p -> backup_leases + p -> free_leases) / 10;
- if (tenper == 0)
- tenper = 1;
- if (lts > tenper) {
- log_info ("pool %lx %s total %d free %d %s %d lts %d",
- (unsigned long)p,
- (p -> shared_network ?
- p -> shared_network -> name : ""), p -> lease_count,
- p -> free_leases, "backup", p -> backup_leases, lts);
+ /* Do not go through the process unless at least we have
+ * more than thresh% more leases than the peer.
+ */
+ if (lts <= thresh) {
+ log_info("pool %lx %s: lts <= max-lease-misbalance "
+ "(%d), pool rebalance event skipped.",
+ (unsigned long)p,
+ (p->shared_network ?
+ p->shared_network->name : ""), thresh);
+
+ /* Recalculate next rebalance event timer. */
+ dhcp_failover_pool_check(p);
+ continue;
+ }
- lease_reference (&lp, *lq, MDL);
+ /* In the first pass, try to allocate leases to the
+ * peer which it would normally be responsible for (if
+ * the lease has a hardware address or client-identifier,
+ * and the load-balance-algorithm chooses the peer to
+ * answer that address), up to a hold% excess in the peer's
+ * favor. In the second pass, just send the oldest (first
+ * on the list) leases up to a hold% excess in our favor.
+ *
+ * This could make for additional pool rebalance
+ * events, but preserving MAC possession should be
+ * worth it.
+ */
+ pass = 0;
+ lease_reference(&lp, *lq, MDL);
- while (lp && lts) {
- /* Remember the next lease in the list. */
+ /* hold may be zero (consider the case where there are 2
+ * leases, both on one server), therefore use >=.
+ */
+ while (lp && (lts >= (pass ? hold : -hold))) {
if (next)
- lease_dereference (&next, MDL);
- if (lp -> next)
- lease_reference (&next, lp -> next, MDL);
-
- --lts;
- ++leases_queued;
- lp -> next_binding_state = peer_lease_state;
- lp -> tstp = cur_time;
- lp -> starts = cur_time;
-
- if (!supersede_lease (lp, (struct lease *)0, 0, 1, 0)
- || !write_lease (lp))
- {
- log_info ("can't commit lease %s on giveaway",
- piaddr (lp -> ip_addr));
+ lease_dereference(&next, MDL);
+ if (lp->next)
+ lease_reference(&next, lp->next, MDL);
+
+ if (pass || peer_wants_lease(lp)) {
+ --lts;
+ ++leases_queued;
+ lp->next_binding_state = peer_lease_state;
+ lp->tstp = cur_time;
+ lp->starts = cur_time;
+
+ if (!supersede_lease(lp, NULL, 0, 1, 0) ||
+ !write_lease(lp))
+ log_error("can't commit lease %s on "
+ "giveaway", piaddr(lp->ip_addr));
}
- lease_dereference (&lp, MDL);
+ lease_dereference(&lp, MDL);
if (next)
- lease_reference (&lp, next, MDL);
- }
- if (next)
- lease_dereference (&next, MDL);
- if (lp)
- lease_dereference (&lp, MDL);
-
- }
- if (lts > 1) {
- log_info ("lease imbalance - lts = %d", lts);
+ lease_reference(&lp, next, MDL);
+ else if (!pass) {
+ pass = 1;
+ lease_reference(&lp, *lq, MDL);
+ }
}
+
+ if (next)
+ lease_dereference(&next, MDL);
+ if (lp)
+ lease_dereference(&lp, MDL);
+
+ if (lts > thresh)
+ log_error("lease imbalance persists - lts = %d", lts);
+
+ /* Recalculate next rebalance event timer. */
+ dhcp_failover_pool_check(p);
}
}
- commit_leases();
- dhcp_failover_send_poolresp (state, leases_queued);
- dhcp_failover_send_updates (state);
+
+ if (leases_queued)
+ commit_leases();
+
return leases_queued;
}
-int dhcp_failover_pool_check (struct pool *pool)
+/* dhcp_failover_pool_check: Called whenever FREE or BACKUP leases change
+ * states, on both servers. Check the scheduled time to rebalance the pool
+ * and lower it if applicable.
+ */
+void
+dhcp_failover_pool_check(struct pool *pool)
{
- int lts;
- struct lease *lp;
- int tenper;
+ dhcp_failover_state_t *peer;
+ TIME est1, est2;
- if (!pool -> failover_peer ||
- pool -> failover_peer -> me.state != normal)
- return 0;
+ peer = pool->failover_peer;
+
+ if(!peer || peer->me.state != normal)
+ return;
- if (pool -> failover_peer -> i_am == primary)
- lts = (pool -> backup_leases - pool -> free_leases) / 2;
+ /* Estimate the time left until lease exhaustion.
+ * The first lease on the backup or free lists is also the oldest
+ * lease. It is reasonable to guess that it will take at least
+ * as much time for a pool to run out of leases, as the present
+ * age of the oldest lease (seconds since it expired).
+ *
+ * Note that this isn't so sane of an assumption if the oldest
+ * lease is a virgin (ends = 0), we wind up sending this against
+ * the max_balance bounds check.
+ */
+ if(pool->free && pool->free->ends < cur_time)
+ est1 = cur_time - pool->free->ends;
else
- lts = (pool -> free_leases - pool -> backup_leases) / 2;
-
- log_info ("pool %lx %s total %d free %d backup %d lts %d",
- (unsigned long)pool,
- pool -> shared_network ? pool -> shared_network -> name : "",
- pool -> lease_count,
- pool -> free_leases, pool -> backup_leases, lts);
-
- tenper = (pool -> backup_leases + pool -> free_leases) / 10;
- if (tenper == 0)
- tenper = 1;
- if (lts > tenper) {
- /* XXX What about multiple pools? */
- if (pool -> failover_peer -> i_am == secondary) {
- /* Ask the primary to send us leases. */
- dhcp_failover_send_poolreq (pool -> failover_peer);
- return 1;
- } else {
- /* Figure out how many leases to skip on the backup
- list. We skip the earliest leases on the list
- to reduce the chance of trying to steal a lease
- that the secondary is about to allocate. */
- int i = pool -> backup_leases - lts;
- log_info ("Taking %d leases from secondary.", lts);
- for (lp = pool -> backup; lp; lp = lp -> next) {
- /* Skip to the last leases on the free
- list, because they are less likely
- to already have been allocated. */
- if (i)
- --i;
- else {
- lp -> desired_binding_state = FTS_FREE;
- dhcp_failover_queue_update (lp, 1);
- --lts;
- }
- }
- if (lts)
- log_info ("failed to take %d leases.", lts);
- }
+ est1 = 0;
+
+ if(pool->backup && pool->backup->ends < cur_time)
+ est2 = cur_time - pool->backup->ends;
+ else
+ est2 = 0;
+
+ /* We don't want to schedule rebalance for when we think we'll run
+ * out of leases, we want to schedule the rebalance for when we think
+ * the disparity will be 'large enough' to warrant action.
+ */
+ est1 = ((est1 * peer->max_lease_misbalance) + 50) / 100;
+ est2 = ((est2 * peer->max_lease_misbalance) + 50) / 100;
+
+ /* Guess when the local system will begin issuing POOLREQ panic
+ * attacks because "max_lease_misbalance*2" has been exceeded.
+ */
+ if(peer->i_am == primary)
+ est1 *= 2;
+ else
+ est2 *= 2;
+
+ /* Select the smallest time. */
+ if(est1 > est2)
+ est1 = est2;
+
+ /* Bounded by the maximum configured value. */
+ if(est1 > peer->max_balance)
+ est1 = peer->max_balance;
+
+ /* Project this time into the future. */
+ est1 += cur_time;
+
+ /* Do not move the time down under the minimum. */
+ est2 = peer->last_balance + peer->min_balance;
+ if(peer->last_balance && (est1 < est2))
+ est1 = est2;
+
+ /* Do not move the time forward, or reset to the same time. */
+ if(peer->sched_balance) {
+ if (est1 >= peer->sched_balance)
+ return;
+
+ /* We are about to schedule the time down, cancel the
+ * current timeout.
+ */
+ cancel_timeout(dhcp_failover_pool_rebalance, peer);
}
- return 0;
+
+ /* The time is different, and lower, use it. */
+ peer->sched_balance = est1;
+
+#if defined(DEBUG_FAILOVER_TIMING)
+ log_info("add_timeout +%d dhcp_failover_pool_rebalance",
+ est1 - cur_time);
+#endif
+ add_timeout(est1, dhcp_failover_pool_rebalance, peer,
+ (tvref_t)dhcp_failover_state_reference,
+ (tvunref_t)dhcp_failover_state_dereference);
}
int dhcp_failover_state_pool_check (dhcp_failover_state_t *state)
for (p = s -> pools; p; p = p -> next) {
if (p -> failover_peer != state)
continue;
- /* Only need to request rebalance on one pool. */
- if (dhcp_failover_pool_check (p))
- return 1;
+ dhcp_failover_pool_check (p);
}
}
return 0;
int reason = FTR_MISC_REJECT;
const char *message;
int new_binding_state;
+ int send_to_backup = 0;
ia.len = sizeof msg -> assigned_addr;
memcpy (ia.iabuf, &msg -> assigned_addr, ia.len);
new_binding_state == FTS_RELEASED ||
new_binding_state == FTS_RESET) {
lt -> next_binding_state = FTS_FREE;
- } else
+
+ /* Mac address affinity. Assign the lease to
+ * BACKUP state if we are the primary and the
+ * peer is more likely to reallocate this lease
+ * to a returning client.
+ */
+ if (state->i_am == primary)
+ send_to_backup = peer_wants_lease(lt);
+ } else {
lt -> next_binding_state = new_binding_state;
+ }
msg -> binding_status = lt -> next_binding_state;
}
message = "database update failed";
bad:
dhcp_failover_send_bind_ack (state, msg, reason, message);
+ goto out;
} else {
dhcp_failover_queue_ack (state, msg);
}
+ /* If it is probably wise, assign lease to backup state if the peer
+ * is not already hoarding leases.
+ */
+ if (send_to_backup && secondary_not_hoarding(state, lt->pool)) {
+ lt->next_binding_state = FTS_BACKUP;
+ lt->tstp = cur_time;
+ lt->starts = cur_time;
+
+ if (!supersede_lease(lt, NULL, 0, 1, 0) ||
+ !write_lease(lt))
+ log_error("can't commit lease %s for mac addr "
+ "affinity", piaddr(lt->ip_addr));
+
+ dhcp_failover_send_updates(state);
+ }
+
out:
if (lt)
lease_dereference (&lt, MDL);
return ISC_R_SUCCESS;
}
+/* This was hairy enough I didn't want to do it all in an if statement.
+ *
+ * Returns: True if the secondary is allowed to get more leases based upon
+ * MAC address affinity. False otherwise.
+ */
+static INLINE int
+secondary_not_hoarding(dhcp_failover_state_t *state, struct pool *p) {
+ int total;
+ int hold;
+ int lts;
+
+ total = p->free_leases + p->backup_leases;
+
+ /* How many leases is one side or the other allowed to "hold"? */
+ hold = ((total * state->max_lease_ownership) + 50) / 100;
+
+ /* If we were to send leases (or if the secondary were to send us
+ * leases in the negative direction), how many would that be?
+ */
+ lts = (p->free_leases - p->backup_leases) / 2;
+
+ /* The peer is not hoarding leases if we would send them more leases
+ * (or they would take fewer leases) than the maximum they are allowed
+ * to hold (the negative hold).
+ */
+ return(lts > -hold);
+}
+
isc_result_t dhcp_failover_process_bind_ack (dhcp_failover_state_t *state,
failover_message_t *msg)
{
lease->next_binding_state = FTS_FREE;
supersede_lease(lease, (struct lease *)0, 0, 0, 0);
write_lease(lease);
+
+ /* Lease has returned to FREE state from the
+ * transitional states. If the lease 'belongs'
+ * to a client that would be served by the
+ * peer, process a binding update now to send
+ * the lease to backup state.
+ */
+ if (state->i_am == primary &&
+ peer_wants_lease(lease)) {
+ lease->next_binding_state = FTS_BACKUP;
+ lease->tstp = cur_time;
+ lease->starts = cur_time;
+
+ if (!supersede_lease(lease, NULL, 0, 1, 0) ||
+ !write_lease(lease))
+ log_error("can't commit lease %s for "
+ "client affinity",
+ piaddr(lease->ip_addr));
+ }
+
if (state->me.state == normal)
commit_leases ();
} else {
return !hm;
}
+/* The inverse of load_balance_mine ("load balance theirs"). We can't
+ * use the regular load_balance_mine() and invert it because of the case
+ * where there might not be an HBA, and we want to indicate false here
+ * in this case only.
+ */
+int
+peer_wants_lease(struct lease *lp)
+{
+ dhcp_failover_state_t *state;
+ unsigned char hbaix;
+ int hm;
+
+ if (!lp->pool)
+ return 0;
+
+ state = lp->pool->failover_peer;
+
+ if (!state || !state->hba)
+ return 0;
+
+ if (lp->uid_len)
+ hbaix = loadb_p_hash (lp->uid, lp->uid_len);
+ else
+ hbaix = loadb_p_hash (lp->hardware_addr.hbuf,
+ lp->hardware_addr.hlen);
+
+ hm = state->hba[(hbaix >> 3) & 0x1F] & (1 << (hbaix & 0x07));
+
+ if (state->i_am == primary)
+ return !hm;
+ else
+ return hm;
+}
+
/* This deals with what to do with bind updates when
we're in the normal state