git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Adaptive memory allocation strategy for qp-tries
author: alessio <alessio@isc.org>
Sun, 9 Mar 2025 08:13:16 +0000 (09:13 +0100)
committer: Evan Hunt <each@isc.org>
Thu, 22 May 2025 22:19:27 +0000 (15:19 -0700)
qp-tries allocate their nodes (twigs) in chunks to reduce allocator
pressure and improve memory locality. The choice of chunk size presents
a tradeoff: larger chunks benefit qp-tries with many values (as seen
in large zones and resolvers) but waste memory in smaller use cases.

Previously, our fixed chunk size of 2^10 twigs meant that even an
empty qp-trie would consume 12KB of memory, while reducing this size
would negatively impact resolver performance.

This commit implements an adaptive chunking strategy that:
 - Tracks the size of the most recently allocated chunk.
 - Doubles the chunk size for each new allocation until reaching a
   predefined maximum.

This approach effectively balances memory efficiency for small tries
while maintaining the performance benefits of larger chunk sizes for
bigger data structures.

This commit also splits the callback freeing qpmultis into two
phases, one that frees the underlying qptree, and one that reclaims
the qpmulti memory. In order to prevent races between the qpmulti
destructor and chunk garbage collection jobs, the second phase is
protected by reference counting.

lib/dns/include/dns/qp.h
lib/dns/qp.c
lib/dns/qp_p.h
lib/dns/qpcache.c
lib/isc/include/isc/util.h
lib/isc/mem.c

index 16eca977d991b3832fd76ac3e6d3d37ec4a4f274..d72730c80acb94ea7adcfcee2a846f3b8dcf5906 100644 (file)
 #include <dns/name.h>
 #include <dns/types.h>
 
+/*%
+ * How many bytes a qp-trie might allocate as part of an insert. Needed for
+ * overmem checks.
+ */
+#define QP_SAFETY_MARGIN ((1ul << 12ul) * 12)
+
 /*%
  * A `dns_qp_t` supports single-threaded read/write access.
  */
@@ -306,7 +312,6 @@ typedef struct dns_qp_memusage {
        size_t hold;        /*%< nodes retained for readers */
        size_t free;        /*%< nodes to be reclaimed */
        size_t node_size;   /*%< in bytes */
-       size_t chunk_size;  /*%< nodes per chunk */
        size_t chunk_count; /*%< allocated chunks */
        size_t bytes;       /*%< total memory in chunks and metadata */
        bool   fragmented;  /*%< trie needs compaction */
index 79a0b8e6eebe1573897587ca0c00bec17e5a3f54..afed2a7fcf69b9bbbe58dfe846a1c340bf21b832 100644 (file)
@@ -96,6 +96,19 @@ static atomic_uint_fast64_t rollback_time;
 #define TRACE(...)
 #endif
 
+#if DNS_QPMULTI_TRACE
+ISC_REFCOUNT_STATIC_TRACE_DECL(dns_qpmulti);
+#define dns_qpmulti_ref(ptr) dns_qpmulti__ref(ptr, __func__, __FILE__, __LINE__)
+#define dns_qpmulti_unref(ptr) \
+       dns_qpmulti__unref(ptr, __func__, __FILE__, __LINE__)
+#define dns_qpmulti_attach(ptr, ptrp) \
+       dns_qpmulti__attach(ptr, ptrp, __func__, __FILE__, __LINE__)
+#define dns_qpmulti_detach(ptrp) \
+       dns_qpmulti__detach(ptrp, __func__, __FILE__, __LINE__)
+#else
+ISC_REFCOUNT_STATIC_DECL(dns_qpmulti);
+#endif
+
 /***********************************************************************
  *
  *  converting DNS names to trie keys
@@ -425,7 +438,7 @@ write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) {
 
 #else
 
-#define chunk_get_raw(qp)      isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES)
+#define chunk_get_raw(qp, size) isc_mem_allocate(qp->mctx, size)
 #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr)
 
 #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size)
@@ -454,6 +467,22 @@ cells_immutable(dns_qp_t *qp, dns_qpref_t ref) {
        }
 }
 
+/*
+ * Find the next power that is both bigger than size and prev_capacity,
+ * but still within the chunk min and max sizes.
+ */
+static dns_qpcell_t
+next_capacity(uint32_t prev_capacity, uint32_t size) {
+       /*
+        * Unfortunately builtin_clz is undefined for 0. We work around this
+        * issue by flooring the request size at 2.
+        */
+       size = ISC_MAX3(size, prev_capacity, 2u);
+       uint32_t log2 = 32u - __builtin_clz(size - 1u);
+
+       return 1U << ISC_CLAMP(log2, QP_CHUNK_LOG_MIN, QP_CHUNK_LOG_MAX);
+}
+
 /*
  * Create a fresh bump chunk and allocate some twigs from it.
  */
@@ -462,9 +491,15 @@ chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) {
        INSIST(qp->base->ptr[chunk] == NULL);
        INSIST(qp->usage[chunk].used == 0);
        INSIST(qp->usage[chunk].free == 0);
+       INSIST(qp->chunk_capacity <= QP_CHUNK_SIZE);
 
-       qp->base->ptr[chunk] = chunk_get_raw(qp);
-       qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size };
+       qp->chunk_capacity = next_capacity(qp->chunk_capacity * 2u, size);
+       qp->base->ptr[chunk] =
+               chunk_get_raw(qp, qp->chunk_capacity * sizeof(dns_qpnode_t));
+
+       qp->usage[chunk] = (qp_usage_t){ .exists = true,
+                                        .used = size,
+                                        .capacity = qp->chunk_capacity };
        qp->used_count += size;
        qp->bump = chunk;
        qp->fender = 0;
@@ -544,7 +579,7 @@ alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) {
        dns_qpchunk_t chunk = qp->bump;
        dns_qpcell_t cell = qp->usage[chunk].used;
 
-       if (cell + size <= QP_CHUNK_SIZE) {
+       if (cell + size <= qp->usage[chunk].capacity) {
                qp->usage[chunk].used += size;
                qp->used_count += size;
                return make_ref(chunk, cell);
@@ -697,38 +732,47 @@ reclaim_chunks_cb(struct rcu_head *arg) {
        REQUIRE(QPMULTI_VALID(multi));
 
        LOCK(&multi->mutex);
-
        dns_qp_t *qp = &multi->writer;
-       REQUIRE(QP_VALID(qp));
-
-       unsigned int free = 0;
-       isc_nanosecs_t start = isc_time_monotonic();
 
-       for (unsigned int i = 0; i < rcuctx->count; i++) {
-               dns_qpchunk_t chunk = rcuctx->chunk[i];
-               if (qp->usage[chunk].snapshot) {
-                       /* cleanup when snapshot is destroyed */
-                       qp->usage[chunk].snapfree = true;
-               } else {
-                       chunk_free(qp, chunk);
-                       free++;
-               }
-       }
+       /*
+        * If chunk_max is zero, chunks have already been freed.
+        */
+       if (qp->chunk_max != 0) {
+               unsigned int free = 0;
+               isc_nanosecs_t start = isc_time_monotonic();
 
-       isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
-                            STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
+               INSIST(QP_VALID(qp));
 
-       isc_nanosecs_t time = isc_time_monotonic() - start;
-       recycle_time += time;
+               for (unsigned int i = 0; i < rcuctx->count; i++) {
+                       dns_qpchunk_t chunk = rcuctx->chunk[i];
+                       if (qp->usage[chunk].snapshot) {
+                               /* clean up when snapshot is destroyed */
+                               qp->usage[chunk].snapfree = true;
+                       } else {
+                               chunk_free(qp, chunk);
+                               free++;
+                       }
+               }
 
-       if (free > 0) {
-               LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free);
-               LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u",
-                         qp->leaf_count, qp->used_count - qp->free_count,
-                         qp->used_count, qp->free_count, qp->hold_count);
+               isc_nanosecs_t time = isc_time_monotonic() - start;
+               recycle_time += time;
+
+               if (free > 0) {
+                       LOG_STATS("qp reclaim" PRItime "free %u chunks", time,
+                                 free);
+                       LOG_STATS(
+                               "qp reclaim leaf %u live %u used %u free %u "
+                               "hold %u",
+                               qp->leaf_count, qp->used_count - qp->free_count,
+                               qp->used_count, qp->free_count, qp->hold_count);
+               }
        }
 
        UNLOCK(&multi->mutex);
+
+       dns_qpmulti_detach(&multi);
+       isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
+                            STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
 }
 
 /*
@@ -773,6 +817,11 @@ reclaim_chunks(dns_qpmulti_t *multi) {
                }
        }
 
+       /*
+        * Reference the qpmulti object to keep it from being
+        * freed until reclaim_chunks_cb() runs.
+        */
+       dns_qpmulti_ref(multi);
        call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb);
 
        LOG_STATS("qp will reclaim %u chunks", count);
@@ -1023,12 +1072,13 @@ dns_qp_memusage(dns_qp_t *qp) {
                .hold = qp->hold_count,
                .free = qp->free_count,
                .node_size = sizeof(dns_qpnode_t),
-               .chunk_size = QP_CHUNK_SIZE,
                .fragmented = QP_NEEDGC(qp),
        };
 
+       size_t chunk_usage_bytes = 0;
        for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
                if (qp->base->ptr[chunk] != NULL) {
+                       chunk_usage_bytes += qp->usage[chunk].capacity;
                        memusage.chunk_count += 1;
                }
        }
@@ -1037,7 +1087,7 @@ dns_qp_memusage(dns_qp_t *qp) {
         * XXXFANF does not subtract chunks that have been shrunk,
         * and does not count unreclaimed dns_qpbase_t objects
         */
-       memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES +
+       memusage.bytes = chunk_usage_bytes +
                         qp->chunk_max * sizeof(qp->base->ptr[0]) +
                         qp->chunk_max * sizeof(qp->usage[0]);
 
@@ -1055,7 +1105,7 @@ dns_qpmulti_memusage(dns_qpmulti_t *multi) {
        dns_qp_memusage_t memusage = dns_qp_memusage(qp);
 
        if (qp->transaction_mode == QP_UPDATE) {
-               memusage.bytes -= QP_CHUNK_BYTES;
+               memusage.bytes -= qp->usage[qp->bump].capacity;
                memusage.bytes += qp->usage[qp->bump].used *
                                  sizeof(dns_qpnode_t);
        }
@@ -1440,12 +1490,12 @@ dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
        REQUIRE(qpmp != NULL && *qpmp == NULL);
 
        dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi));
-       *multi = (dns_qpmulti_t){
-               .magic = QPMULTI_MAGIC,
-               .reader_ref = INVALID_REF,
-       };
+       *multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC,
+                                 .reader_ref = INVALID_REF,
+                                 .references = ISC_REFCOUNT_INITIALIZER(1) };
        isc_mutex_init(&multi->mutex);
        ISC_LIST_INIT(multi->snapshots);
+
        /*
         * Do not waste effort allocating a bump chunk that will be thrown
         * away when a transaction is opened. dns_qpmulti_update() always
@@ -1465,11 +1515,13 @@ destroy_guts(dns_qp_t *qp) {
        if (qp->chunk_max == 0) {
                return;
        }
+
        for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
                if (qp->base->ptr[chunk] != NULL) {
                        chunk_free(qp, chunk);
                }
        }
+       qp->chunk_max = 0;
        ENSURE(qp->used_count == 0);
        ENSURE(qp->free_count == 0);
        ENSURE(isc_refcount_current(&qp->base->refcount) == 1);
@@ -1495,7 +1547,26 @@ dns_qp_destroy(dns_qp_t **qptp) {
 }
 
 static void
-qpmulti_destroy_cb(struct rcu_head *arg) {
+qpmulti_free_mem(dns_qpmulti_t *multi) {
+       REQUIRE(QPMULTI_VALID(multi));
+
+       /* reassure thread sanitizer */
+       LOCK(&multi->mutex);
+       dns_qp_t *qp = &multi->writer;
+       UNLOCK(&multi->mutex);
+
+       isc_mutex_destroy(&multi->mutex);
+       isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
+}
+
+#if DNS_QPMULTI_TRACE
+ISC_REFCOUNT_STATIC_TRACE_IMPL(dns_qpmulti, qpmulti_free_mem)
+#else
+ISC_REFCOUNT_STATIC_IMPL(dns_qpmulti, qpmulti_free_mem)
+#endif
+
+static void
+qpmulti_destroy_guts_cb(struct rcu_head *arg) {
        qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
        REQUIRE(QPRCU_VALID(rcuctx));
        /* only nonzero for reclaim_chunks_cb() */
@@ -1514,10 +1585,9 @@ qpmulti_destroy_cb(struct rcu_head *arg) {
 
        UNLOCK(&multi->mutex);
 
-       isc_mutex_destroy(&multi->mutex);
+       dns_qpmulti_detach(&multi);
        isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
                             STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
-       isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
 }
 
 void
@@ -1543,7 +1613,7 @@ dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
                .multi = multi,
        };
        isc_mem_attach(qp->mctx, &rcuctx->mctx);
-       call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb);
+       call_rcu(&rcuctx->rcu_head, qpmulti_destroy_guts_cb);
 }
 
 /***********************************************************************
index 0206c5fd56bcf6c99d198dc48f23cbfb139a5451..add3bdeaeebec9f85dbb726069560ef30a37a0a2 100644 (file)
@@ -19,6 +19,8 @@
 
 #pragma once
 
+#include <isc/refcount.h>
+
 #include <dns/qp.h>
 
 /***********************************************************************
@@ -141,22 +143,29 @@ enum {
  * size to make the allocator work harder.
  */
 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-#define QP_CHUNK_LOG 7
+#define QP_CHUNK_LOG_MIN 6
+#define QP_CHUNK_LOG_MAX 7
 #else
-#define QP_CHUNK_LOG 10
+#define QP_CHUNK_LOG_MIN 6
+#define QP_CHUNK_LOG_MAX 10
 #endif
 
-STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20,
-             "qp-trie chunk size is unreasonable");
+STATIC_ASSERT(6 <= QP_CHUNK_LOG_MIN && QP_CHUNK_LOG_MIN <= QP_CHUNK_LOG_MAX,
+             "qp-trie min chunk size is unreasonable");
+STATIC_ASSERT(6 <= QP_CHUNK_LOG_MAX && QP_CHUNK_LOG_MAX <= 20,
+             "qp-trie max chunk size is unreasonable");
 
-#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG)
+#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG_MAX)
 #define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))
 
+STATIC_ASSERT(QP_SAFETY_MARGIN >= QP_CHUNK_BYTES,
+             "qp-trie safety margin too small");
+
 /*
  * We need a bitfield this big to count how much of a chunk is in use:
- * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG`.
+ * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG_MAX`.
  */
-#define QP_USAGE_BITS (QP_CHUNK_LOG + 1)
+#define QP_USAGE_BITS (QP_CHUNK_LOG_MAX + 1)
 
 /*
  * A chunk needs to be compacted if it is less full than this threshold.
@@ -268,6 +277,8 @@ ref_cell(dns_qpref_t ref) {
 typedef struct qp_usage {
        /*% the allocation point, increases monotonically */
        dns_qpcell_t used : QP_USAGE_BITS;
+       /*% the actual size of the allocation */
+       dns_qpcell_t capacity : QP_USAGE_BITS;
        /*% count of nodes no longer needed, also monotonic */
        dns_qpcell_t free : QP_USAGE_BITS;
        /*% qp->base->ptr[chunk] != NULL */
@@ -322,6 +333,7 @@ typedef struct qp_rcuctx {
        struct rcu_head rcu_head;
        isc_mem_t *mctx;
        dns_qpmulti_t *multi;
+       ISC_LINK(struct qp_rcuctx) link;
        dns_qpchunk_t count;
        dns_qpchunk_t chunk[];
 } qp_rcuctx_t;
@@ -479,6 +491,8 @@ struct dns_qp {
        dns_qpcell_t used_count, free_count;
        /*% free cells that cannot be recovered right now */
        dns_qpcell_t hold_count;
+       /*% capacity of last allocated chunk, for exponential chunk growth */
+       dns_qpcell_t chunk_capacity;
        /*% what kind of transaction was most recently started [MT] */
        enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
        /*% compact the entire trie [MT] */
@@ -523,6 +537,8 @@ struct dns_qpmulti {
        dns_qp_t *rollback;
        /*% all snapshots of this trie */
        ISC_LIST(dns_qpsnap_t) snapshots;
+       /*% refcount for memory reclamation */
+       isc_refcount_t references;
 };
 
 /***********************************************************************
index 032dc47a1ae222029b0c110b7eaedfd0247e40ea..40b7b206c7d1568691486fe77fe97fe3f5c60fc2 100644 (file)
@@ -519,7 +519,7 @@ qpcache_miss(qpcache_t *qpdb, dns_slabheader_t *newheader,
                size_t purgesize =
                        2 * (sizeof(qpcnode_t) +
                             dns_name_size(&HEADERNODE(newheader)->name)) +
-                       rdataset_size(newheader) + 12288;
+                       rdataset_size(newheader) + QP_SAFETY_MARGIN;
 
                expire_lru_headers(qpdb, idx, purgesize, nlocktypep,
                                   tlocktypep DNS__DB_FLARG_PASS);
index 9fed22df60c71b4157018b40ab9ad32b799f1312..971f21dc1ee6aab3cc0ddbe4cf1720580e9ec690 100644 (file)
@@ -69,6 +69,8 @@
 
 #define ISC_CLAMP(v, x, y) ((v) < (x) ? (x) : ((v) > (y) ? (y) : (v)))
 
+#define ISC_MAX3(a, b, c) ISC_MAX(ISC_MAX((a), (b)), (c))
+
 /*%
  * The UNCONST() macro can be used to omit warnings produced by certain
  * compilers when operating with pointers declared with the const type qual-
index ff34485ae0e527cfc3b994f779b0b03fc0eb0562..c1b0ab7331c878f947d4d37a674ad876374cf5a5 100644 (file)
@@ -434,9 +434,10 @@ isc__mem_initialize(void) {
 
 void
 isc__mem_shutdown(void) {
-       /* should be called after an rcu_barrier() */
        bool empty;
 
+       rcu_barrier();
+
        isc__mem_checkdestroyed();
 
        LOCK(&contextslock);