* The tree lock must be held for the result to be valid.
*/
static bool
-is_leaf(dns_rbtnode_t *node) {
+is_last_node_on_its_level(dns_rbtnode_t *node) {
return (node->parent != NULL && node->parent->down == node &&
node->left == NULL && node->right == NULL);
}
continue;
}
- if (is_leaf(node) && rbtdb->loop != NULL) {
+ if (is_last_node_on_its_level(node) && rbtdb->loop != NULL) {
send_to_prune_tree(
rbtdb, node,
isc_rwlocktype_write DNS__DB_FLARG_PASS);
if (write_locked) {
/*
- * If this node is the only one in the level it's in, deleting
- * this node may recursively make its parent the only node in
- * the parent level; if so, and if no one is currently using
- * the parent node, this is almost the only opportunity to
- * clean it up. But the recursive cleanup is not that trivial
- * since the child and parent may be in different lock buckets,
- * which would cause a lock order reversal problem. To avoid
- * the trouble, we'll dispatch a separate event for batch
- * cleaning. We need to check whether we're deleting the node
- * as a result of pruning to avoid infinite dispatching.
- * Note: pruning happens only when a loop has been set for the
- * rbtdb. If the user of the rbtdb chooses not to set a loop,
- * it's their responsibility to purge stale leaves (e.g. by
- * periodic walk-through).
+ * If this node is the only one left on its RBTDB level,
+ * attempt pruning the RBTDB (i.e. deleting empty nodes that
+ * are ancestors of 'node' and are not interior nodes) starting
+ * from this node (see prune_tree()). The main reason this is
+ * not done immediately, but asynchronously, is that the
+ * ancestors of 'node' are almost guaranteed to belong to
+ * different node buckets and we don't want to do juggle locks
+ * right now.
+ *
+ * Since prune_tree() also calls dns__rbtdb_decref(), check the
+ * value of the 'pruning' parameter (which is only set to
+ * 'true' in the dns__rbtdb_decref() call present in
+ * prune_tree()) to prevent an infinite loop and to allow a
+ * node sent to prune_tree() to be deleted by the delete_node()
+ * call in the code branch below.
*/
-
- if (!pruning && is_leaf(node) && rbtdb->loop != NULL) {
+ if (!pruning && is_last_node_on_its_level(node) &&
+ rbtdb->loop != NULL)
+ {
send_to_prune_tree(rbtdb, node,
*nlocktypep DNS__DB_FLARG_PASS);
no_reference = false;
} else {
- /* We can now delete the node. */
-
+ /*
+ * The node can now be deleted.
+ */
delete_node(rbtdb, node);
}
} else {
}
/*
- * Prune the tree by cleaning up single leaves. A single execution of this
- * function cleans up a single node; if the parent of the latter becomes a
- * single leaf on its own level as a result, the parent is then also sent to
- * this function.
+ * Prune the RBTDB tree of trees. Start by attempting to delete a node that is
+ * the only one left on its RBTDB level (see the send_to_prune_tree() call in
+ * dns__rbtdb_decref()). Then, if the node has a parent (which can either
+ * exist on the same RBTDB level or on an upper RBTDB level), check whether the
+ * latter is an interior node (i.e. a node with a non-NULL 'down' pointer). If
+ * the parent node is not an interior node, attempt deleting the parent node as
+ * well and then move on to examining the parent node's parent, etc. Continue
+ * traversing the RBTDB tree until a node is encountered that is still an
+ * interior node after the previously-processed node gets deleted.
+ *
+ * It is acceptable for a node sent to this function to NOT be deleted in the
+ * process (e.g. if it gets reactivated in the meantime). Furthermore, node
+ * deletion is not a prerequisite for continuing RBTDB traversal.
+ *
+ * This function gets called once for every "starting node" and it continues
+ * traversing the RBTDB until the stop condition is met. In the worst case,
+ * the number of nodes processed by a single execution of this function is the
+ * number of tree levels, which is at most the maximum number of domain name
+ * labels (127); however, it should be much smaller in practice and deleting
+ * empty RBTDB nodes is critical to keeping the amount of memory used by the
+ * cache memory context within the configured limit anyway.
*/
static void
prune_tree(void *arg) {
isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune));
TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
-
- parent = node->parent;
-
NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
- dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
- true DNS__DB_FILELINE);
- NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
+ do {
+ parent = node->parent;
+ dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
+ true DNS__DB_FILELINE);
- if (parent != NULL && is_leaf(parent)) {
- NODE_WRLOCK(&rbtdb->node_locks[parent->locknum].lock,
- &nlocktype);
- send_to_prune_tree(rbtdb, parent, nlocktype);
- NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock,
- &nlocktype);
- }
+ /*
+ * Check whether the parent is an interior node. Note that it
+ * might have been one before the dns__rbtdb_decref() call on
+ * the previous line, but decrementing the reference count for
+ * 'node' could have caused 'node->parent->down' to become
+ * NULL.
+ */
+ if (parent != NULL && parent->down == NULL) {
+ /*
+ * Keep the node lock if possible; otherwise, release
+ * the old lock and acquire one for the parent.
+ */
+ if (parent->locknum != locknum) {
+ NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+ &nlocktype);
+ locknum = parent->locknum;
+ NODE_WRLOCK(&rbtdb->node_locks[locknum].lock,
+ &nlocktype);
+ }
+ /*
+ * We need to gain a reference to the parent node
+ * before decrementing it in the next iteration.
+ */
+ dns__rbtdb_newref(rbtdb, parent,
+ nlocktype DNS__DB_FLARG_PASS);
+ } else {
+ parent = NULL;
+ }
+
+ node = parent;
+ } while (node != NULL);
+ NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
dns_db_detach((dns_db_t **)&rbtdb);