]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Reduce lock contention during RBTDB tree pruning
authorOndřej Surý <ondrej@isc.org>
Wed, 21 Feb 2024 10:45:36 +0000 (11:45 +0100)
committerOndřej Surý <ondrej@isc.org>
Thu, 29 Feb 2024 10:23:03 +0000 (11:23 +0100)
The log message for commit 24381cc36d8528f5a4046fb2614451aeac4cdfc1
explained:

    In some older BIND 9 branches, the extra queuing overhead eliminated by
    this change could be remotely exploited to cause excessive memory use.
    Due to architectural shift, this branch is not vulnerable to that issue,
    but applying the fix to the latter is nevertheless deemed prudent for
    consistency and to make the code future-proof.

However, it turned out that having a single queue for the nodes to be
pruned increased lock contention to a level where cleaning up nodes from
the RBTDB took too long, causing the amount of memory used by the cache
to grow indefinitely over time.

This commit reverts the change to the pruning mechanism introduced by
commit 24381cc36d8528f5a4046fb2614451aeac4cdfc1 as BIND branches newer
than 9.16 were not affected by the excessive event queueing overhead
issue mentioned in the log message for the above commit.

lib/dns/include/dns/rbt.h
lib/dns/rbt.c
lib/dns/rbtdb.c
lib/dns/rbtdb_p.h

index be15e919538e23e59388da4b4e4ed19656b02a47..766166467a8dfda405d06bbc8f0297b421bb83a0 100644 (file)
@@ -112,12 +112,6 @@ struct dns_rbtnode {
         */
        ISC_LINK(dns_rbtnode_t) deadlink;
 
-       /*%
-        * This linked list is used to store nodes from which tree pruning can
-        * be started.
-        */
-       ISC_LINK(dns_rbtnode_t) prunelink;
-
        /*@{*/
        /*!
         * These values are used in the RBT DB implementation.  The appropriate
index a438d935e284ea412efab0e0b7323047f17b5996..e4b5b5d6efa3eb84ac183479fb043cefa4779693 100644 (file)
@@ -1406,7 +1406,6 @@ rbtnode_new(isc_mem_t *mctx, const dns_name_t *name) {
        };
 
        ISC_LINK_INIT(node, deadlink);
-       ISC_LINK_INIT(node, prunelink);
 
        isc_refcount_init(&node->references, 0);
 
index 9da26c64ce5f63627ccf81be32a67d73892574d1..c9ff795ca34f8bc0047f41005987f4fb2d4e8a68 100644 (file)
@@ -467,7 +467,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
        unsigned int i;
        isc_result_t result;
        char buf[DNS_NAME_FORMATSIZE];
-       dns_rbtnode_t *node = NULL;
        dns_rbt_t **treep = NULL;
        isc_time_t start;
 
@@ -490,6 +489,8 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
         * the overhead of unlinking all nodes here should be negligible.
         */
        for (i = 0; i < rbtdb->node_lock_count; i++) {
+               dns_rbtnode_t *node = NULL;
+
                node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
                while (node != NULL) {
                        ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
@@ -497,12 +498,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
                }
        }
 
-       node = ISC_LIST_HEAD(rbtdb->prunenodes);
-       while (node != NULL) {
-               ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink);
-               node = ISC_LIST_HEAD(rbtdb->prunenodes);
-       }
-
        rbtdb->quantum = (rbtdb->loop != NULL) ? 100 : 0;
 
        for (;;) {
@@ -1181,26 +1176,16 @@ is_leaf(dns_rbtnode_t *node) {
                node->left == NULL && node->right == NULL);
 }
 
-/*%
- * The tree lock must be held when this function is called as it reads and
- * updates rbtdb->prunenodes.
- */
 static void
 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
                   isc_rwlocktype_t locktype DNS__DB_FLARG) {
-       bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);
-
-       INSIST(locktype == isc_rwlocktype_write);
+       prune_t *prune = isc_mem_get(rbtdb->common.mctx, sizeof(*prune));
+       *prune = (prune_t){ .node = node };
 
+       dns_db_attach((dns_db_t *)rbtdb, &prune->db);
        dns__rbtdb_newref(rbtdb, node, locktype DNS__DB_FLARG_PASS);
-       INSIST(!ISC_LINK_LINKED(node, prunelink));
-       ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);
 
-       if (!pruning_queued) {
-               dns_db_t *db = NULL;
-               dns_db_attach((dns_db_t *)rbtdb, &db);
-               isc_async_run(rbtdb->loop, prune_tree, db);
-       }
+       isc_async_run(rbtdb->loop, prune_tree, prune);
 }
 
 /*%
@@ -1498,83 +1483,64 @@ restore_locks:
 }
 
 /*
- * Prune the tree by recursively cleaning up single leaves.  Go through all
- * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
- * case, it will be necessary to traverse a number of tree levels equal to the
- * maximum legal number of domain name labels (127); in practice, the number of
- * tree levels to traverse will virtually always be much smaller (a few levels
- * at most).  While holding the tree lock throughout this entire operation is
- * less than ideal, so is splitting the latter up by queueing a separate
- * prune_tree() run for each node to start pruning from (as queueing requires
- * allocating memory and can therefore potentially be exploited to exhaust
- * available memory).  Also note that actually freeing up the memory used by
- * RBTDB nodes (which is what this function does) is essential to keeping cache
- * memory use in check, so since the tree lock needs to be acquired anyway,
- * freeing as many nodes as possible before the tree lock gets released is
- * prudent.
+ * Prune the tree by recursively cleaning-up single leaves.  In the worst
+ * case, the number of iteration is the number of tree levels, which is at
+ * most the maximum number of domain name labels, i.e, 127.  In practice, this
+ * should be much smaller (only a few times), and even the worst case would be
+ * acceptable for a single event.
  */
 static void
 prune_tree(void *arg) {
-       dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
-       dns_rbtnode_t *node = NULL;
+       prune_t *prune = (prune_t *)arg;
+       dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)prune->db;
+       dns_rbtnode_t *node = prune->node;
        dns_rbtnode_t *parent = NULL;
        unsigned int locknum;
        isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
        isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
 
+       isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune));
+
        TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
+       locknum = node->locknum;
+       NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
+       do {
+               parent = node->parent;
+               dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
+                                 true DNS__DB_FILELINE);
 
-       while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
-               locknum = node->locknum;
-               NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
-               do {
-                       if (ISC_LINK_LINKED(node, prunelink)) {
-                               ISC_LIST_UNLINK(rbtdb->prunenodes, node,
-                                               prunelink);
+               if (parent != NULL && parent->down == NULL) {
+                       /*
+                        * node was the only down child of the parent and has
+                        * just been removed.  We'll then need to examine the
+                        * parent.  Keep the lock if possible; otherwise,
+                        * release the old lock and acquire one for the parent.
+                        */
+                       if (parent->locknum != locknum) {
+                               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+                                           &nlocktype);
+                               locknum = parent->locknum;
+                               NODE_WRLOCK(&rbtdb->node_locks[locknum].lock,
+                                           &nlocktype);
                        }
 
-                       parent = node->parent;
-                       dns__rbtdb_decref(rbtdb, node, 0, &nlocktype,
-                                         &tlocktype, true,
-                                         true DNS__DB_FILELINE);
-
-                       if (parent != NULL && parent->down == NULL) {
-                               /*
-                                * node was the only down child of the parent
-                                * and has just been removed.  We'll then need
-                                * to examine the parent.  Keep the lock if
-                                * possible; otherwise, release the old lock and
-                                * acquire one for the parent.
-                                */
-                               if (parent->locknum != locknum) {
-                                       NODE_UNLOCK(
-                                               &rbtdb->node_locks[locknum].lock,
-                                               &nlocktype);
-                                       locknum = parent->locknum;
-                                       NODE_WRLOCK(
-                                               &rbtdb->node_locks[locknum].lock,
-                                               &nlocktype);
-                               }
-
-                               /*
-                                * We need to gain a reference to the node
-                                * before decrementing it in the next iteration.
-                                */
-                               if (ISC_LINK_LINKED(parent, deadlink)) {
-                                       ISC_LIST_UNLINK(
-                                               rbtdb->deadnodes[locknum],
+                       /*
+                        * We need to gain a reference to the node before
+                        * decrementing it in the next iteration.
+                        */
+                       if (ISC_LINK_LINKED(parent, deadlink)) {
+                               ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
                                                parent, deadlink);
-                               }
-                               dns__rbtdb_newref(rbtdb, parent,
-                                                 nlocktype DNS__DB_FILELINE);
-                       } else {
-                               parent = NULL;
                        }
+                       dns__rbtdb_newref(rbtdb, parent,
+                                         nlocktype DNS__DB_FILELINE);
+               } else {
+                       parent = NULL;
+               }
 
-                       node = parent;
-               } while (node != NULL);
-               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
-       }
+               node = parent;
+       } while (node != NULL);
+       NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
        TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
 
        dns_db_detach((dns_db_t **)&rbtdb);
@@ -3951,8 +3917,6 @@ dns__rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
                ISC_LIST_INIT(rbtdb->deadnodes[i]);
        }
 
-       ISC_LIST_INIT(rbtdb->prunenodes);
-
        rbtdb->active = rbtdb->node_lock_count;
 
        for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
index b772f8953920dca86bcfa41e9b306de095f09750..401bccc00ef3f78214706984ab212666cdac507e 100644 (file)
@@ -152,10 +152,6 @@ struct dns_rbtdb {
         */
        dns_rbtnodelist_t *deadnodes;
 
-       /* List of nodes from which recursive tree pruning can be started from.
-        * Locked by tree_lock. */
-       dns_rbtnodelist_t prunenodes;
-
        /*
         * Heaps.  These are used for TTL based expiry in a cache,
         * or for zone resigning in a zone DB.  hmctx is the memory
@@ -202,6 +198,14 @@ typedef struct {
        isc_stdtime_t now;
 } rbtdb_load_t;
 
+/*%
+ * Prune context
+ */
+typedef struct {
+       dns_db_t *db;
+       dns_rbtnode_t *node;
+} prune_t;
+
 extern dns_dbmethods_t dns__rbtdb_zonemethods;
 extern dns_dbmethods_t dns__rbtdb_cachemethods;