From: Ronnie Sahlberg Date: Thu, 7 Jun 2007 00:19:24 +0000 (+1000) Subject: distribute the takenover nodes more evenly among the surviving nodes X-Git-Tag: tevent-0.9.20~348^2~2543^2 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e0fbd04697d05edd32e419993a0f0e4a19633626;p=thirdparty%2Fsamba.git distribute the takenover nodes more evenly among the surviving nodes (This used to be ctdb commit 25d18b8bab399cc5d9def081086925896f8de3e9) --- diff --git a/ctdb/takeover/ctdb_takeover.c b/ctdb/takeover/ctdb_takeover.c index a90a1a08c9c..537c3d29603 100644 --- a/ctdb/takeover/ctdb_takeover.c +++ b/ctdb/takeover/ctdb_takeover.c @@ -412,25 +412,44 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) int i, j; int ret; struct ctdb_public_ip ip; - - /* work out which node will look after each public IP */ + int takeover_node = 0; + + /* Work out which node will look after each public IP. + * takeover_node cycles over the nodes and is incremented each time a + * node has been assigned to take over for another node. + * This spreads the failed nodes out across the remaining + * nodes more evenly + */ for (i=0;inum;i++) { if ((nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) && !(nodemap->nodes[i].flags & NODE_FLAGS_DISABLED)) { ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn; } else { + j = takeover_node; + /* assign this dead nodes IP to the next higher node */ - for (j=(i+1)%nodemap->num; - j != i; - j=(j+1)%nodemap->num) { + ctdb->nodes[i]->takeover_vnn = -1; + while(1) { if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) && !(nodemap->nodes[j].flags & NODE_FLAGS_DISABLED) && ctdb_same_subnet(ctdb->nodes[j]->public_address, ctdb->nodes[i]->public_address, ctdb->nodes[j]->public_netmask_bits)) { ctdb->nodes[i]->takeover_vnn = nodemap->nodes[j].vnn; + takeover_node = (takeover_node+1)%nodemap->num; + break; + } + /* ok, try the next node instead then */ + j=(j+1)%nodemap->num; + + /* If we wrapped back to the takeover node node + * without finding any ENABLED nodes + * there is not much to do + */ + if (j==takeover_node) { break; } + } /* if no enabled node can take it, then we @@ -438,10 +457,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) probably means that some subsystem (such as NFS) is sick on all nodes. Best we can do is to keep the other services up. */ - if (j == i) { - for (j=(i+1)%nodemap->num; - j != i; - j=(j+1)%nodemap->num) { + if (ctdb->nodes[i]->takeover_vnn == -1) { + j = takeover_node; + while(1) { if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) && ctdb_same_subnet(ctdb->nodes[j]->public_address, ctdb->nodes[i]->public_address, @@ -449,12 +467,23 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) ctdb->nodes[i]->takeover_vnn = nodemap->nodes[j].vnn; DEBUG(0,("All available nodes disabled for %s - using a connected node\n", ctdb->nodes[i]->public_address)); + takeover_node = (takeover_node+1)%nodemap->num; + break; + } + /* ok, try the next node instead then */ + j=(j+1)%nodemap->num; + + /* If we wrapped back to the takeover node node + * without finding any ENABLED nodes + * there is not much to do + */ + if (j==takeover_node) { break; } } } - if (j == i) { + if (ctdb->nodes[i]->takeover_vnn == -1) { DEBUG(0,(__location__ " No node available on same network to take %s\n", ctdb->nodes[i]->public_address)); ctdb->nodes[i]->takeover_vnn = -1;