]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Limit isc_task_send() overhead for tree pruning
authorMichał Kępień <michal@isc.org>
Thu, 4 Jan 2024 12:39:27 +0000 (13:39 +0100)
committerMichał Kępień <michal@isc.org>
Fri, 5 Jan 2024 11:40:50 +0000 (12:40 +0100)
Instead of issuing a separate isc_task_send() call for every RBTDB node
that triggers tree pruning, maintain a list of nodes from which tree
pruning can be started from and only issue an isc_task_send() call if
pruning has not yet been triggered by another RBTDB node.

The extra queuing overhead eliminated by this change could be remotely
exploited to cause excessive memory use.

As this change modifies struct dns_rbtnode by adding a new 'prunelink'
member to it, bump MAPAPI to prevent any attempts of loading map-format
zone files created using older BIND 9 versions.

(cherry picked from commit 24381cc36d8528f5a4046fb2614451aeac4cdfc1)

lib/dns/include/dns/rbt.h
lib/dns/mapapi
lib/dns/rbt.c
lib/dns/rbtdb.c

index 4a6b0781c78d9bdcb1637be0e7d611a0e025d8ae..6cfd40ffddfc95b000795dff65a2578bcf8cbcc0 100644 (file)
@@ -140,6 +140,12 @@ struct dns_rbtnode {
         */
        ISC_LINK(dns_rbtnode_t) deadlink;
 
+       /*%
+        * This linked list is used to store nodes from which tree pruning can
+        * be started.
+        */
+       ISC_LINK(dns_rbtnode_t) prunelink;
+
        /*@{*/
        /*!
         * These values are used in the RBT DB implementation.  The appropriate
index 1b502d34cc9d7310bd252d89cdfde75ff9edf49e..a46e1909370ca2de5a33c7e6c3dc48a67ccd9fb0 100644 (file)
@@ -13,4 +13,4 @@
 # Whenever releasing a new major release of BIND9, set this value
 # back to 1.0 when releasing the first alpha.  Map files are *never*
 # compatible across major releases.
-MAPAPI=3.0
+MAPAPI=4.0
index d5d18b836eb8dfc429f573fd89b1ce76e0c57023..780a950a48bae70a9aecd73f49a4c83598ff7178 100644 (file)
@@ -2307,6 +2307,7 @@ create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep) {
        HASHVAL(node) = 0;
 
        ISC_LINK_INIT(node, deadlink);
+       ISC_LINK_INIT(node, prunelink);
 
        LOCKNUM(node) = 0;
        WILD(node) = 0;
index fbb4b84259a8dbaccf0639879805c32871489f10..723d169532d331d277b5f40094c17938cbbeb130 100644 (file)
@@ -521,6 +521,10 @@ struct dns_rbtdb {
         */
        rbtnodelist_t *deadnodes;
 
+       /* List of nodes from which recursive tree pruning can be started from.
+        * Locked by tree_lock. */
+       rbtnodelist_t prunenodes;
+
        /*
         * Heaps.  These are used for TTL based expiry in a cache,
         * or for zone resigning in a zone DB.  hmctx is the memory
@@ -1067,6 +1071,7 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
        unsigned int i;
        isc_result_t result;
        char buf[DNS_NAME_FORMATSIZE];
+       dns_rbtnode_t *node = NULL;
        dns_rbt_t **treep;
        isc_time_t start;
 
@@ -1092,8 +1097,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
         * the overhead of unlinking all nodes here should be negligible.
         */
        for (i = 0; i < rbtdb->node_lock_count; i++) {
-               dns_rbtnode_t *node;
-
                node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
                while (node != NULL) {
                        ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
@@ -1101,6 +1104,12 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
                }
        }
 
+       node = ISC_LIST_HEAD(rbtdb->prunenodes);
+       while (node != NULL) {
+               ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink);
+               node = ISC_LIST_HEAD(rbtdb->prunenodes);
+       }
+
        if (event == NULL) {
                rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
        }
@@ -1935,19 +1944,32 @@ is_leaf(dns_rbtnode_t *node) {
                node->left == NULL && node->right == NULL);
 }
 
+/*%
+ * The tree lock must be held when this function is called as it reads and
+ * updates rbtdb->prunenodes.
+ */
 static void
 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
                   isc_rwlocktype_t locktype) {
-       isc_event_t *ev;
-       dns_db_t *db;
+       bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);
+
+       INSIST(locktype == isc_rwlocktype_write);
 
-       ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE,
-                               prune_tree, node, sizeof(isc_event_t));
        new_reference(rbtdb, node, locktype);
-       db = NULL;
-       attach((dns_db_t *)rbtdb, &db);
-       ev->ev_sender = db;
-       isc_task_send(rbtdb->task, &ev);
+       INSIST(!ISC_LINK_LINKED(node, prunelink));
+       ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);
+
+       if (!pruning_queued) {
+               isc_event_t *ev = NULL;
+               dns_db_t *db = NULL;
+
+               attach((dns_db_t *)rbtdb, &db);
+
+               ev = isc_event_allocate(rbtdb->common.mctx, NULL,
+                                       DNS_EVENT_RBTPRUNE, prune_tree, db,
+                                       sizeof(isc_event_t));
+               isc_task_send(rbtdb->task, &ev);
+       }
 }
 
 /*%
@@ -2222,17 +2244,26 @@ restore_locks:
 }
 
 /*
- * Prune the tree by recursively cleaning-up single leaves.  In the worst
- * case, the number of iteration is the number of tree levels, which is at
- * most the maximum number of domain name labels, i.e, 127.  In practice, this
- * should be much smaller (only a few times), and even the worst case would be
- * acceptable for a single event.
+ * Prune the tree by recursively cleaning up single leaves.  Go through all
+ * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
+ * case, it will be necessary to traverse a number of tree levels equal to the
+ * maximum legal number of domain name labels (127); in practice, the number of
+ * tree levels to traverse will virtually always be much smaller (a few levels
+ * at most).  While holding the tree lock throughout this entire operation is
+ * less than ideal, so is splitting the latter up by queueing a separate
+ * prune_tree() run for each node to start pruning from (as queueing requires
+ * allocating memory and can therefore potentially be exploited to exhaust
+ * available memory).  Also note that actually freeing up the memory used by
+ * RBTDB nodes (which is what this function does) is essential to keeping cache
+ * memory use in check, so since the tree lock needs to be acquired anyway,
+ * freeing as many nodes as possible before the tree lock gets released is
+ * prudent.
  */
 static void
 prune_tree(isc_task_t *task, isc_event_t *event) {
-       dns_rbtdb_t *rbtdb = event->ev_sender;
-       dns_rbtnode_t *node = event->ev_arg;
-       dns_rbtnode_t *parent;
+       dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)event->ev_arg;
+       dns_rbtnode_t *node = NULL;
+       dns_rbtnode_t *parent = NULL;
        unsigned int locknum;
 
        UNUSED(task);
@@ -2240,44 +2271,60 @@ prune_tree(isc_task_t *task, isc_event_t *event) {
        isc_event_free(&event);
 
        RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
-       locknum = node->locknum;
-       NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
-       do {
-               parent = node->parent;
-               decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
-                                   isc_rwlocktype_write, true);
 
-               if (parent != NULL && parent->down == NULL) {
-                       /*
-                        * node was the only down child of the parent and has
-                        * just been removed.  We'll then need to examine the
-                        * parent.  Keep the lock if possible; otherwise,
-                        * release the old lock and acquire one for the parent.
-                        */
-                       if (parent->locknum != locknum) {
-                               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
-                                           isc_rwlocktype_write);
-                               locknum = parent->locknum;
-                               NODE_LOCK(&rbtdb->node_locks[locknum].lock,
-                                         isc_rwlocktype_write);
+       while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
+               locknum = node->locknum;
+               NODE_LOCK(&rbtdb->node_locks[locknum].lock,
+                         isc_rwlocktype_write);
+               do {
+                       if (ISC_LINK_LINKED(node, prunelink)) {
+                               ISC_LIST_UNLINK(rbtdb->prunenodes, node,
+                                               prunelink);
                        }
 
-                       /*
-                        * We need to gain a reference to the node before
-                        * decrementing it in the next iteration.
-                        */
-                       if (ISC_LINK_LINKED(parent, deadlink)) {
-                               ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
+                       parent = node->parent;
+                       decrement_reference(rbtdb, node, 0,
+                                           isc_rwlocktype_write,
+                                           isc_rwlocktype_write, true);
+
+                       if (parent != NULL && parent->down == NULL) {
+                               /*
+                                * node was the only down child of the parent
+                                * and has just been removed.  We'll then need
+                                * to examine the parent.  Keep the lock if
+                                * possible; otherwise, release the old lock and
+                                * acquire one for the parent.
+                                */
+                               if (parent->locknum != locknum) {
+                                       NODE_UNLOCK(
+                                               &rbtdb->node_locks[locknum].lock,
+                                               isc_rwlocktype_write);
+                                       locknum = parent->locknum;
+                                       NODE_LOCK(
+                                               &rbtdb->node_locks[locknum].lock,
+                                               isc_rwlocktype_write);
+                               }
+
+                               /*
+                                * We need to gain a reference to the node
+                                * before decrementing it in the next iteration.
+                                */
+                               if (ISC_LINK_LINKED(parent, deadlink)) {
+                                       ISC_LIST_UNLINK(
+                                               rbtdb->deadnodes[locknum],
                                                parent, deadlink);
+                               }
+                               new_reference(rbtdb, parent,
+                                             isc_rwlocktype_write);
+                       } else {
+                               parent = NULL;
                        }
-                       new_reference(rbtdb, parent, isc_rwlocktype_write);
-               } else {
-                       parent = NULL;
-               }
 
-               node = parent;
-       } while (node != NULL);
-       NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+                       node = parent;
+               } while (node != NULL);
+               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+                           isc_rwlocktype_write);
+       }
        RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
 
        detach((dns_db_t **)&rbtdb);
@@ -8735,6 +8782,8 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
                ISC_LIST_INIT(rbtdb->deadnodes[i]);
        }
 
+       ISC_LIST_INIT(rbtdb->prunenodes);
+
        rbtdb->active = rbtdb->node_lock_count;
 
        for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {