git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Add isc_sieve unit implementing SIEVE-LRU algorithm
author Ondřej Surý <ondrej@isc.org>
Sun, 23 Feb 2025 13:36:35 +0000 (14:36 +0100)
committer Evan Hunt <each@isc.org>
Wed, 26 Mar 2025 22:36:33 +0000 (15:36 -0700)
This is the core implementation of the SIEVE algorithm described in the
following paper:

  Zhang, Yazhuo, Juncheng Yang, Yao Yue, Ymir Vigfusson, and K V
  Rashmi. “SIEVE Is Simpler than LRU: An Efficient Turn-Key Eviction
  Algorithm for Web Caches,” n.d. Available online from
  https://junchengyang.com/publication/nsdi24-SIEVE.pdf

lib/dns/include/dns/rdataslab.h
lib/dns/qpcache.c
lib/dns/rdataslab.c
lib/isc/Makefile.am
lib/isc/include/isc/sieve.h [new file with mode: 0644]
tests/dns/qpdb_test.c

index 7f393b1e56f740054321c3f2fc9681d051090a3a..5f4c2cfba600cc4a38f17df2469f5715ec6b74b7 100644 (file)
@@ -85,14 +85,13 @@ struct dns_slabheader {
         * when the "cyclic" rrset-order is required.
         */
 
-       unsigned int  resign_lsb : 1;
+       /* resigning (zone) and TTL-cleaning (cache) */
+       uint16_t      resign_lsb : 1;
        isc_stdtime_t resign;
+       isc_heap_t   *heap;
        unsigned int  heap_index;
-       /*%<
-        * Used for TTL-based cache cleaning.
-        */
 
-       isc_stdtime_t     last_used;
+       /* Used for stale refresh */
        _Atomic(uint32_t) last_refresh_fail_ts;
 
        dns_slabheader_proof_t *noqname;
@@ -127,7 +126,12 @@ struct dns_slabheader {
         * this rdataset, if any.
         */
 
+       dns_gluelist_t *gluelist;
+
+       /*% Used for SIEVE-LRU (cache) and changed_list (zone) */
        ISC_LINK(struct dns_slabheader) link;
+       /*% Used for SIEVE-LRU */
+       bool visited;
 
        /*%
         * Case vector.  If the bit is set then the corresponding
@@ -135,10 +139,6 @@ struct dns_slabheader {
         * rendering that character upper case.
         */
        unsigned char upper[32];
-
-       isc_heap_t *heap;
-
-       dns_gluelist_t *gluelist;
 };
 
 enum {
index 1391945c023c451b8165530563471a5bf79fddff..1b506082039491ead69ae135a40b9addc1a3bc2e 100644 (file)
@@ -16,7 +16,6 @@
 #include <inttypes.h>
 #include <stdalign.h>
 #include <stdbool.h>
-#include <sys/mman.h>
 
 #include <isc/ascii.h>
 #include <isc/async.h>
@@ -34,6 +33,7 @@
 #include <isc/refcount.h>
 #include <isc/result.h>
 #include <isc/rwlock.h>
+#include <isc/sieve.h>
 #include <isc/stdio.h>
 #include <isc/string.h>
 #include <isc/time.h>
  */
 #define QPDB_VIRTUAL 300
 
-/*%
- * Whether to rate-limit updating the LRU to avoid possible thread contention.
- * Updating LRU requires write locking, so we don't do it every time the
- * record is touched - only after some time passes.
- */
-#ifndef DNS_QPDB_LIMITLRUUPDATE
-#define DNS_QPDB_LIMITLRUUPDATE 1
-#endif
-
-/*% Time after which we update LRU for glue records, 5 minutes */
-#define DNS_QPDB_LRUUPDATE_GLUE 300
-/*% Time after which we update LRU for all other records, 10 minutes */
-#define DNS_QPDB_LRUUPDATE_REGULAR 600
-
 /*
  * This defines the number of headers that we try to expire each time the
  * expire_ttl_headers() is run.  The number should be small enough, so the
 #define DNS_QPDB_EXPIRE_TTL_COUNT 10
 
 /*%
- * This is the structure that is used for each node in the qp trie of trees.
+ * This is the structure that is used for each node in the qp trie of
+ * trees.
  */
 typedef struct qpcnode qpcnode_t;
 struct qpcnode {
@@ -224,11 +211,6 @@ typedef struct qpcache_bucket {
        /* Per-bucket lock. */
        isc_rwlock_t lock;
 
-       /*
-        * Linked list used to implement LRU cache cleaning.
-        */
-       dns_slabheaderlist_t lru;
-
        /*
         * The heap is used for TTL based expiry.  Note that qpcache->hmctx
         * is the memory context to use for heap memory; this differs from
@@ -236,10 +218,14 @@ typedef struct qpcache_bucket {
         */
        isc_heap_t *heap;
 
+       /* SIEVE-LRU cache cleaning state. */
+       ISC_SIEVE(dns_slabheader_t) sieve;
+
        /* Padding to prevent false sharing between locks. */
        uint8_t __padding[ISC_OS_CACHELINE_SIZE -
                          (sizeof(isc_queue_t) + sizeof(isc_rwlock_t) +
-                          sizeof(dns_slabheaderlist_t) + sizeof(isc_heap_t *)) %
+                          sizeof(isc_heap_t *) +
+                          sizeof(ISC_SIEVE(dns_slabheader_t))) %
                                  ISC_OS_CACHELINE_SIZE];
 
 } qpcache_bucket_t;
@@ -285,17 +271,6 @@ struct qpcache {
         */
        uint32_t serve_stale_refresh;
 
-       /*
-        * Start point % node_lock_count for next LRU cleanup.
-        */
-       atomic_uint lru_sweep;
-
-       /*
-        * When performing LRU cleaning limit cleaning to headers that were
-        * last used at or before this.
-        */
-       _Atomic(isc_stdtime_t) last_used;
-
        /* Locked by tree_lock. */
        dns_qp_t *tree;
        dns_qp_t *nsec;
@@ -457,6 +432,9 @@ qpcache__destroy(qpcache_t *qpdb);
 
 static dns_dbmethods_t qpdb_cachemethods;
 
+static void
+cleanup_deadnodes_cb(void *arg);
+
 /*%
  * 'init_count' is used to initialize 'newheader->count' which in turn
  * is used to determine where in the cycle rrset-order cyclic starts.
@@ -480,115 +458,83 @@ static atomic_uint_fast16_t init_count = 0;
  * Failure to follow this hierarchy can result in deadlock.
  */
 
-/*%
- * Routines for LRU-based cache management.
+/*
+ * Cache-eviction routines.
  */
 
-/*%
- * See if a given cache entry that is being reused needs to be updated
- * in the LRU-list.  From the LRU management point of view, this function is
- * expected to return true for almost all cases.  When used with threads,
- * however, this may cause a non-negligible performance penalty because a
- * writer lock will have to be acquired before updating the list.
- * If DNS_QPDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
- * function returns true if the entry has not been updated for some period of
- * time.  We differentiate the NS or glue address case and the others since
- * experiments have shown that the former tends to be accessed relatively
- * infrequently and the cost of cache miss is higher (e.g., a missing NS records
- * may cause external queries at a higher level zone, involving more
- * transactions).
- *
- * Caller must hold the node (read or write) lock.
- */
-static bool
-need_headerupdate(dns_slabheader_t *header, isc_stdtime_t now) {
-       if (DNS_SLABHEADER_GETATTR(header, (DNS_SLABHEADERATTR_NONEXISTENT |
-                                           DNS_SLABHEADERATTR_ANCIENT |
-                                           DNS_SLABHEADERATTR_ZEROTTL)) != 0)
-       {
-               return false;
-       }
+static void
+expireheader(dns_slabheader_t *header, isc_rwlocktype_t *nlocktypep,
+            isc_rwlocktype_t *tlocktypep, dns_expire_t reason DNS__DB_FLARG);
 
-#if DNS_QPDB_LIMITLRUUPDATE
-       if (header->type == dns_rdatatype_ns ||
-           (header->trust == dns_trust_glue &&
-            dns_rdatatype_isaddr(header->type)))
-       {
-               /*
-                * Glue records are updated if at least DNS_QPDB_LRUUPDATE_GLUE
-                * seconds have passed since the previous update time.
-                */
-               return header->last_used + DNS_QPDB_LRUUPDATE_GLUE <= now;
+static size_t
+rdataset_size(dns_slabheader_t *header) {
+       if (EXISTS(header)) {
+               return dns_rdataslab_size(header);
        }
 
-       /*
-        * Other records are updated if DNS_QPDB_LRUUPDATE_REGULAR seconds
-        * have passed.
-        */
-       return header->last_used + DNS_QPDB_LRUUPDATE_REGULAR <= now;
-#else
-       UNUSED(now);
-
-       return true;
-#endif /* if DNS_QPDB_LIMITLRUUPDATE */
+       return sizeof(*header);
 }
 
-/*%
- * Update the timestamp of a given cache entry and move it to the head
- * of the corresponding LRU list.
- *
- * Caller must hold the node (write) lock.
- *
- * Note that the we do NOT touch the heap here, as the TTL has not changed.
- */
 static void
-update_header(qpcache_t *qpdb, dns_slabheader_t *header, isc_stdtime_t now) {
-       /* To be checked: can we really assume this? XXXMLG */
-       INSIST(ISC_LINK_LINKED(header, link));
+expire_lru_headers(qpcache_t *qpdb, uint32_t idx, size_t requested,
+                  isc_rwlocktype_t *nlocktypep,
+                  isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+       size_t expired = 0;
+
+       do {
+               dns_slabheader_t *header =
+                       ISC_SIEVE_NEXT(qpdb->buckets[idx].sieve, visited, link);
+               if (header == NULL) {
+                       return;
+               }
+
+               ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
 
-       ISC_LIST_UNLINK(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
-                       link);
-       header->last_used = now;
-       ISC_LIST_PREPEND(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
-                        link);
+               expired += rdataset_size(header);
+
+               expireheader(header, nlocktypep, tlocktypep,
+                            dns_expire_lru DNS__DB_FLARG_PASS);
+       } while (expired < requested);
 }
 
 static void
-maybe_update_headers(qpcache_t *qpdb, dns_slabheader_t *found,
-                    dns_slabheader_t *foundsig, isc_rwlock_t *nlock,
-                    isc_rwlocktype_t *nlocktypep, isc_stdtime_t now) {
-       if (need_headerupdate(found, now) ||
-           (foundsig != NULL && need_headerupdate(foundsig, now)))
-       {
-               if (*nlocktypep != isc_rwlocktype_write) {
-                       NODE_FORCEUPGRADE(nlock, nlocktypep);
-               }
-               if (need_headerupdate(found, now)) {
-                       update_header(qpdb, found, now);
-               }
-               if (foundsig != NULL && need_headerupdate(foundsig, now)) {
-                       update_header(qpdb, foundsig, now);
-               }
+qpcache_miss(qpcache_t *qpdb, dns_slabheader_t *newheader,
+            isc_rwlocktype_t *nlocktypep,
+            isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+       uint32_t idx = HEADERNODE(newheader)->locknum;
+
+       isc_heap_insert(qpdb->buckets[idx].heap, newheader);
+       newheader->heap = qpdb->buckets[idx].heap;
+
+       if (isc_mem_isovermem(qpdb->common.mctx)) {
+               /*
+                * Maximum estimated size of the data being added: The size
+                * of the rdataset, plus a new QP database node and nodename,
+                * and a possible additional NSEC node and nodename. Also add
+                * a 12k margin for a possible QP-trie chunk allocation.
+                * (It's okay to overestimate, we want to get cache memory
+                * down quickly.)
+                */
+
+               size_t purgesize =
+                       2 * (sizeof(qpcnode_t) +
+                            dns_name_size(&HEADERNODE(newheader)->name)) +
+                       rdataset_size(newheader) + 12288;
+
+               expire_lru_headers(qpdb, idx, purgesize, nlocktypep,
+                                  tlocktypep DNS__DB_FLARG_PASS);
        }
+
+       ISC_SIEVE_INSERT(qpdb->buckets[idx].sieve, newheader, link);
 }
 
-/*
- * Locking:
- * If a routine is going to lock more than one lock in this module, then
- * the locking must be done in the following order:
- *
- *      Tree Lock
- *
- *      Node Lock       (Only one from the set may be locked at one time by
- *                       any caller)
- *
- *      Database Lock
- *
- * Failure to follow this hierarchy can result in deadlock.
- *
- * Deleting Nodes:
- * For zone databases the node for the origin of the zone MUST NOT be deleted.
- */
+static void
+qpcache_hit(qpcache_t *qpdb ISC_ATTR_UNUSED, dns_slabheader_t *header) {
+       /*
+        * On a cache hit, only set the header's 'visited' mark; no relinking.
+        */
+       ISC_SIEVE_MARK(header, visited);
+}
 
 /*
  * DB Routines
@@ -733,9 +679,6 @@ qpcnode_acquire(qpcache_t *qpdb, qpcnode_t *node, isc_rwlocktype_t nlocktype,
                                tlocktype DNS__DB_FLARG_PASS);
 }
 
-static void
-cleanup_deadnodes(void *arg);
-
 /*
  * Decrement the external references to a node. If the counter
  * goes to zero, decrement the node use counter in the qpcache object
@@ -838,7 +781,8 @@ qpcnode_release(qpcache_t *qpdb, qpcnode_t *node, isc_rwlocktype_t *nlocktypep,
                        isc_loop_t *loop = isc_loop_get(qpdb->loopmgr,
                                                        node->locknum);
 
-                       isc_async_run(loop, cleanup_deadnodes, qpdb);
+                       qpcache_ref(qpdb);
+                       isc_async_run(loop, cleanup_deadnodes_cb, qpdb);
                }
        }
 
@@ -921,13 +865,6 @@ setttl(dns_slabheader_t *header, isc_stdtime_t newts) {
 
        header->expire = newts;
 
-       if (header->db == NULL || !dns_db_iscache(header->db)) {
-               return;
-       }
-
-       /*
-        * This is a cache. Adjust the heaps if necessary.
-        */
        if (header->heap == NULL || header->heap_index == 0 || newts == oldts) {
                return;
        }
@@ -1130,9 +1067,11 @@ bindrdatasets(qpcache_t *qpdb, qpcnode_t *qpnode, dns_slabheader_t *found,
              dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
        bindrdataset(qpdb, qpnode, found, now, nlocktype, tlocktype,
                     rdataset DNS__DB_FLARG_PASS);
+       qpcache_hit(qpdb, found);
        if (!NEGATIVE(found) && foundsig != NULL) {
                bindrdataset(qpdb, qpnode, foundsig, now, nlocktype, tlocktype,
                             sigrdataset DNS__DB_FLARG_PASS);
+               qpcache_hit(qpdb, foundsig);
        }
 }
 
@@ -1172,9 +1111,6 @@ setup_delegation(qpc_search_t *search, dns_dbnode_t **nodep,
                              search->zonecut_sigheader, search->now, nlocktype,
                              tlocktype, rdataset,
                              sigrdataset DNS__DB_FLARG_PASS);
-               maybe_update_headers(search->qpdb, search->zonecut_header,
-                                    search->zonecut_sigheader, nlock,
-                                    &nlocktype, search->now);
                NODE_UNLOCK(nlock, &nlocktype);
        }
 
@@ -1411,8 +1347,6 @@ find_deepest_zonecut(qpc_search_t *search, qpcnode_t *node,
                                      search->now, nlocktype,
                                      isc_rwlocktype_none, rdataset,
                                      sigrdataset DNS__DB_FLARG_PASS);
-                       maybe_update_headers(search->qpdb, found, foundsig,
-                                            nlock, &nlocktype, search->now);
                }
 
                NODE_UNLOCK(nlock, &nlocktype);
@@ -1505,8 +1439,6 @@ find_coveringnsec(qpc_search_t *search, const dns_name_t *name,
                bindrdatasets(search->qpdb, node, found, foundsig, search->now,
                              nlocktype, isc_rwlocktype_none, rdataset,
                              sigrdataset DNS__DB_FLARG_PASS);
-               maybe_update_headers(search->qpdb, found, foundsig, nlock,
-                                    &nlocktype, search->now);
                dns_name_copy(fname, foundname);
 
                result = DNS_R_COVERINGNSEC;
@@ -1796,8 +1728,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
                        bindrdatasets(search.qpdb, node, nsecheader, nsecsig,
                                      search.now, nlocktype, tlocktype,
                                      rdataset, sigrdataset DNS__DB_FLARG_PASS);
-                       maybe_update_headers(search.qpdb, nsecheader, nsecsig,
-                                            nlock, &nlocktype, search.now);
                        result = DNS_R_COVERINGNSEC;
                        goto node_exit;
                }
@@ -1831,8 +1761,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
                        bindrdatasets(search.qpdb, node, nsheader, nssig,
                                      search.now, nlocktype, tlocktype,
                                      rdataset, sigrdataset DNS__DB_FLARG_PASS);
-                       maybe_update_headers(search.qpdb, nsheader, nssig,
-                                            nlock, &nlocktype, search.now);
                        result = DNS_R_DELEGATION;
                        goto node_exit;
                }
@@ -1885,8 +1813,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
                bindrdatasets(search.qpdb, node, found, foundsig, search.now,
                              nlocktype, tlocktype, rdataset,
                              sigrdataset DNS__DB_FLARG_PASS);
-               maybe_update_headers(search.qpdb, found, foundsig, nlock,
-                                    &nlocktype, search.now);
        }
 
 node_exit:
@@ -1978,8 +1904,6 @@ seek_ns_headers(qpc_search_t *search, qpcnode_t *node, dns_dbnode_t **nodep,
        bindrdatasets(search->qpdb, node, found, foundsig, search->now,
                      nlocktype, *tlocktype, rdataset,
                      sigrdataset DNS__DB_FLARG_PASS);
-       maybe_update_headers(search->qpdb, found, foundsig, nlock, &nlocktype,
-                            search->now);
 
        NODE_UNLOCK(nlock, &nlocktype);
 
@@ -2115,8 +2039,6 @@ qpcache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
                bindrdatasets(qpdb, qpnode, found, foundsig, search.now,
                              nlocktype, isc_rwlocktype_none, rdataset,
                              sigrdataset DNS__DB_FLARG_PASS);
-               maybe_update_headers(qpdb, found, foundsig, nlock, &nlocktype,
-                                    search.now);
        }
 
        NODE_UNLOCK(nlock, &nlocktype);
@@ -2219,114 +2141,6 @@ expiredata(dns_db_t *db, dns_dbnode_t *node, void *data) {
        INSIST(tlocktype == isc_rwlocktype_none);
 }
 
-static size_t
-rdataset_size(dns_slabheader_t *header) {
-       if (EXISTS(header)) {
-               return dns_rdataslab_size(header);
-       }
-
-       return sizeof(*header);
-}
-
-static size_t
-expire_lru_headers(qpcache_t *qpdb, unsigned int locknum,
-                  isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-                  size_t purgesize DNS__DB_FLARG) {
-       dns_slabheader_t *header = NULL;
-       size_t purged = 0;
-
-       for (header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
-            header != NULL && header->last_used <= qpdb->last_used &&
-            purged <= purgesize;
-            header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru))
-       {
-               size_t header_size = rdataset_size(header);
-
-               /*
-                * Unlink the entry at this point to avoid checking it
-                * again even if it's currently used someone else and
-                * cannot be purged at this moment.  This entry won't be
-                * referenced any more (so unlinking is safe) since the
-                * TTL will be reset to 0.
-                */
-               ISC_LIST_UNLINK(qpdb->buckets[locknum].lru, header, link);
-               expireheader(header, nlocktypep, tlocktypep,
-                            dns_expire_lru DNS__DB_FLARG_PASS);
-               purged += header_size;
-       }
-
-       return purged;
-}
-
-/*%
- * Purge some expired and/or stale (i.e. unused for some period) cache entries
- * due to an overmem condition.  To recover from this condition quickly,
- * we clean up entries up to the size of newly added rdata that triggered
- * the overmem; this is accessible via newheader.
- *
- * The LRU lists tails are processed in LRU order to the nearest second.
- *
- * A write lock on the tree must be held.
- */
-static void
-overmem(qpcache_t *qpdb, dns_slabheader_t *newheader,
-       isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
-       uint32_t locknum_start = qpdb->lru_sweep++ % qpdb->buckets_count;
-       uint32_t locknum = locknum_start;
-       size_t purgesize, purged = 0;
-       isc_stdtime_t min_last_used = 0;
-       size_t max_passes = 8;
-
-       /*
-        * Maximum estimated size of the data being added: The size
-        * of the rdataset, plus a new QP database node and nodename,
-        * and a possible additional NSEC node and nodename. Also add
-        * a 12k margin for a possible QP-trie chunk allocation.
-        * (It's okay to overestimate, we want to get cache memory
-        * down quickly.)
-        */
-       purgesize = 2 * (sizeof(qpcnode_t) +
-                        dns_name_size(&HEADERNODE(newheader)->name)) +
-                   rdataset_size(newheader) + 12288;
-again:
-       do {
-               isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
-               isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
-               NODE_WRLOCK(nlock, &nlocktype);
-
-               purged += expire_lru_headers(
-                       qpdb, locknum, &nlocktype, tlocktypep,
-                       purgesize - purged DNS__DB_FLARG_PASS);
-
-               /*
-                * Work out the oldest remaining last_used values of the list
-                * tails as we walk across the array of lru lists.
-                */
-               dns_slabheader_t *header =
-                       ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
-               if (header != NULL &&
-                   (min_last_used == 0 || header->last_used < min_last_used))
-               {
-                       min_last_used = header->last_used;
-               }
-               NODE_UNLOCK(nlock, &nlocktype);
-               locknum = (locknum + 1) % qpdb->buckets_count;
-       } while (locknum != locknum_start && purged <= purgesize);
-
-       /*
-        * Update qpdb->last_used if we have walked all the list tails and have
-        * not freed the required amount of memory.
-        */
-       if (purged < purgesize) {
-               if (min_last_used != 0) {
-                       qpdb->last_used = min_last_used;
-                       if (max_passes-- > 0) {
-                               goto again;
-                       }
-               }
-       }
-}
-
 /*%
  * These functions allow the heap code to rank the priority of each
  * element.  It returns true if v1 happens "sooner" than v2.
@@ -2385,7 +2199,7 @@ qpcache__destroy(qpcache_t *qpdb) {
        for (i = 0; i < qpdb->buckets_count; i++) {
                NODE_DESTROYLOCK(&qpdb->buckets[i].lock);
 
-               INSIST(ISC_LIST_EMPTY(qpdb->buckets[i].lru));
+               INSIST(ISC_SIEVE_EMPTY(qpdb->buckets[i].sieve));
 
                INSIST(isc_queue_empty(&qpdb->buckets[i].deadnodes));
                isc_queue_destroy(&qpdb->buckets[i].deadnodes);
@@ -2428,9 +2242,7 @@ qpcache_destroy(dns_db_t *arg) {
  * to wait for the tree write lock.
  */
 static void
-cleanup_deadnodes(void *arg) {
-       qpcache_t *qpdb = arg;
-       uint16_t locknum = isc_tid();
+cleanup_deadnodes(qpcache_t *qpdb, uint16_t locknum) {
        isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
        isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
        isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
@@ -2444,8 +2256,7 @@ cleanup_deadnodes(void *arg) {
        TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
        NODE_WRLOCK(nlock, &nlocktype);
 
-       RUNTIME_CHECK(isc_queue_splice(&deadnodes,
-                                      &qpdb->buckets[locknum].deadnodes));
+       isc_queue_splice(&deadnodes, &qpdb->buckets[locknum].deadnodes);
        isc_queue_for_each_entry_safe(&deadnodes, qpnode, qpnext, deadlink) {
                qpcnode_release(qpdb, qpnode, &nlocktype,
                                &tlocktype DNS__DB_FILELINE);
@@ -2455,6 +2266,14 @@ cleanup_deadnodes(void *arg) {
        TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
 }
 
+static void
+cleanup_deadnodes_cb(void *arg) {
+       qpcache_t *qpdb = arg;
+       uint16_t locknum = isc_tid(); /* selects the bucket to clean */
+
+       cleanup_deadnodes(qpdb, locknum);
+       qpcache_unref(qpdb); /* ref taken before isc_async_run() */
+}
 /*
  * This function is assumed to be called when a node is newly referenced
  * and can be in the deadnode list.  In that case the node will be references
@@ -2657,7 +2476,6 @@ add(qpcache_t *qpdb, qpcnode_t *qpnode,
        dns_slabheader_t *prioheader = NULL, *expireheader = NULL;
        dns_typepair_t negtype = 0;
        dns_trust_t trust;
-       int idx;
        uint32_t ntypes = 0;
 
        if ((options & DNS_DBADD_FORCE) != 0) {
@@ -2833,17 +2651,9 @@ find_header:
                        if (header->expire > newheader->expire) {
                                setttl(header, newheader->expire);
                        }
-                       if (header->last_used != now) {
-                               ISC_LIST_UNLINK(
-                                       qpdb->buckets[HEADERNODE(header)->locknum]
-                                               .lru,
-                                       header, link);
-                               header->last_used = now;
-                               ISC_LIST_PREPEND(
-                                       qpdb->buckets[HEADERNODE(header)->locknum]
-                                               .lru,
-                                       header, link);
-                       }
+
+                       qpcache_hit(qpdb, header);
+
                        if (header->noqname == NULL &&
                            newheader->noqname != NULL)
                        {
@@ -2895,17 +2705,9 @@ find_header:
                        if (header->expire > newheader->expire) {
                                setttl(header, newheader->expire);
                        }
-                       if (header->last_used != now) {
-                               ISC_LIST_UNLINK(
-                                       qpdb->buckets[HEADERNODE(header)->locknum]
-                                               .lru,
-                                       header, link);
-                               header->last_used = now;
-                               ISC_LIST_PREPEND(
-                                       qpdb->buckets[HEADERNODE(header)->locknum]
-                                               .lru,
-                                       header, link);
-                       }
+
+                       qpcache_hit(qpdb, header);
+
                        if (header->noqname == NULL &&
                            newheader->noqname != NULL)
                        {
@@ -2927,17 +2729,9 @@ find_header:
                        return ISC_R_SUCCESS;
                }
 
-               idx = HEADERNODE(newheader)->locknum;
-               isc_heap_insert(qpdb->buckets[idx].heap, newheader);
-               newheader->heap = qpdb->buckets[idx].heap;
-               if (ZEROTTL(newheader)) {
-                       newheader->last_used = qpdb->last_used + 1;
-                       ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
-                                       link);
-               } else {
-                       ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
-                                        link);
-               }
+               qpcache_miss(qpdb, newheader, &nlocktype,
+                            &tlocktype DNS__DB_FLARG_PASS);
+
                if (topheader_prev != NULL) {
                        topheader_prev->next = newheader;
                } else {
@@ -2961,17 +2755,8 @@ find_header:
                /* No rdatasets of the given type exist at the node. */
                INSIST(newheader->down == NULL);
 
-               idx = HEADERNODE(newheader)->locknum;
-               isc_heap_insert(qpdb->buckets[idx].heap, newheader);
-               newheader->heap = qpdb->buckets[idx].heap;
-               if (ZEROTTL(newheader)) {
-                       ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
-                                       link);
-               } else {
-                       ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
-                                        link);
-               }
-
+               qpcache_miss(qpdb, newheader, &nlocktype,
+                            &tlocktype DNS__DB_FLARG_PASS);
                if (prio_header(newheader)) {
                        /* This is a priority type, prepend it */
                        newheader->next = qpnode->data;
@@ -3097,7 +2882,7 @@ cleanup:
 static void
 expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
                   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-                  isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG);
+                  isc_stdtime_t now DNS__DB_FLARG);
 
 static isc_result_t
 qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
@@ -3114,7 +2899,6 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
        isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
        isc_rwlock_t *nlock = NULL;
-       bool cache_is_overmem = false;
        dns_fixedname_t fixed;
        dns_name_t *name = NULL;
        isc_stdtime_t now = __now ? __now : isc_stdtime_now();
@@ -3140,8 +2924,6 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        newheader = (dns_slabheader_t *)region.base;
        dns_slabheader_reset(newheader, db, node);
 
-       newheader->last_used = now;
-
        /*
         * By default, dns_rdataslab_fromrdataset() sets newheader->ttl
         * to the rdataset TTL. In the case of the cache, that's wrong;
@@ -3195,34 +2977,17 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        /*
         * Add to the auxiliary NSEC tree if we're adding an NSEC record.
         */
-       TREE_RDLOCK(&qpdb->tree_lock, &tlocktype);
-       if (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
-           rdataset->type == dns_rdatatype_nsec)
-       {
-               newnsec = true;
-       } else {
-               newnsec = false;
-       }
-       TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
+       newnsec = (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
+                  rdataset->type == dns_rdatatype_nsec);
 
        /*
-        * If we're adding a delegation type, adding to the auxiliary NSEC
-        * tree, or the DB is a cache in an overmem state, hold an
-        * exclusive lock on the tree.  In the latter case the lock does
-        * not necessarily have to be acquired but it will help purge
-        * ancient entries more effectively.
+        * If we're adding a delegation type or adding to the auxiliary
+        * NSEC tree, hold an exclusive lock on the tree.
         */
-       if (isc_mem_isovermem(qpdb->common.mctx)) {
-               cache_is_overmem = true;
-       }
-       if (delegating || newnsec || cache_is_overmem) {
+       if (delegating || newnsec) {
                TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
        }
 
-       if (cache_is_overmem) {
-               overmem(qpdb, newheader, &tlocktype DNS__DB_FLARG_PASS);
-       }
-
        nlock = &qpdb->buckets[qpnode->locknum].lock;
 
        NODE_WRLOCK(nlock, &nlocktype);
@@ -3234,27 +2999,15 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
                                  true);
        }
 
-       expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype, now,
-                          cache_is_overmem DNS__DB_FLARG_PASS);
+       expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype,
+                          now DNS__DB_FLARG_PASS);
 
-       /*
-        * If we've been holding a write lock on the tree just for
-        * cleaning, we can release it now.  However, we still need the
-        * node lock.
-        */
-       if (tlocktype == isc_rwlocktype_write && !delegating && !newnsec) {
-               TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
-       }
-
-       result = ISC_R_SUCCESS;
        if (newnsec) {
                qpcnode_t *nsecnode = NULL;
 
                result = dns_qp_getname(qpdb->nsec, name, (void **)&nsecnode,
                                        NULL);
-               if (result == ISC_R_SUCCESS) {
-                       result = ISC_R_SUCCESS;
-               } else {
+               if (result != ISC_R_SUCCESS) {
                        INSIST(nsecnode == NULL);
                        nsecnode = new_qpcnode(qpdb, name);
                        nsecnode->nsec = DNS_DB_NSEC_NSEC;
@@ -3265,11 +3018,9 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
                qpnode->nsec = DNS_DB_NSEC_HAS_NSEC;
        }
 
-       if (result == ISC_R_SUCCESS) {
-               result = add(qpdb, qpnode, name, newheader, options,
-                            addedrdataset, now, nlocktype,
-                            tlocktype DNS__DB_FLARG_PASS);
-       }
+       result = add(qpdb, qpnode, name, newheader, options, addedrdataset, now,
+                    nlocktype, tlocktype DNS__DB_FLARG_PASS);
+
        if (result == ISC_R_SUCCESS && delegating) {
                qpnode->delegating = 1;
        }
@@ -3279,6 +3030,7 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        if (tlocktype != isc_rwlocktype_none) {
                TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
        }
+
        INSIST(tlocktype == isc_rwlocktype_none);
 
        return result;
@@ -3402,7 +3154,7 @@ dns__qpcache_create(isc_mem_t *mctx, const dns_name_t *origin,
 
        dns_rdatasetstats_create(mctx, &qpdb->rrsetstats);
        for (i = 0; i < (int)qpdb->buckets_count; i++) {
-               ISC_LIST_INIT(qpdb->buckets[i].lru);
+               ISC_SIEVE_INIT(qpdb->buckets[i].sieve);
 
                qpdb->buckets[i].heap = NULL;
                isc_heap_create(hmctx, ttl_sooner, set_index, 0,
@@ -3911,6 +3663,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
           void *data) {
        dns_slabheader_t *header = data;
        qpcache_t *qpdb = (qpcache_t *)header->db;
+       int idx = HEADERNODE(header)->locknum;
 
        if (header->heap != NULL && header->heap_index != 0) {
                isc_heap_delete(header->heap, header->heap_index);
@@ -3920,8 +3673,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
                          atomic_load_acquire(&header->attributes), false);
 
        if (ISC_LINK_LINKED(header, link)) {
-               int idx = HEADERNODE(header)->locknum;
-               ISC_LIST_UNLINK(qpdb->buckets[idx].lru, header, link);
+               ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
        }
 
        if (header->noqname != NULL) {
@@ -3938,7 +3690,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
 static void
 expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
                   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-                  isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG) {
+                  isc_stdtime_t now DNS__DB_FLARG) {
        isc_heap_t *heap = qpdb->buckets[locknum].heap;
 
        for (size_t i = 0; i < DNS_QPDB_EXPIRE_TTL_COUNT; i++) {
@@ -3949,12 +3701,7 @@ expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
                        return;
                }
 
-               dns_ttl_t ttl = header->expire;
-
-               if (!cache_is_overmem) {
-                       /* Only account for stale TTL if cache is not overmem */
-                       ttl += STALE_TTL(header, qpdb);
-               }
+               dns_ttl_t ttl = header->expire + STALE_TTL(header, qpdb);
 
                if (ttl >= now - QPDB_VIRTUAL) {
                        /*
index 1aaafd0ba20a6f7e2d18fd1b458896ccdf66162e..e7c5539458ed2514df32dd45ae2991377fbd7e16 100644 (file)
@@ -865,6 +865,7 @@ dns_slabheader_reset(dns_slabheader_t *h, dns_db_t *db, dns_dbnode_t *node) {
        h->heap = NULL;
        h->db = db;
        h->node = node;
+       h->visited = false;
 
        atomic_init(&h->attributes, 0);
        atomic_init(&h->last_refresh_fail_ts, 0);
index 27cfe92f3e2e1135e702bebb67a9274fbf71000d..1cdc540fb24474e5d79eb97271a54493c7f77df1 100644 (file)
@@ -75,6 +75,7 @@ libisc_la_HEADERS =                   \
        include/isc/rwlock.h            \
        include/isc/safe.h              \
        include/isc/serial.h            \
+       include/isc/sieve.h             \
        include/isc/signal.h            \
        include/isc/siphash.h           \
        include/isc/sockaddr.h          \
diff --git a/lib/isc/include/isc/sieve.h b/lib/isc/include/isc/sieve.h
new file mode 100644 (file)
index 0000000..dfebeb5
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
+ *
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, you can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * See the COPYRIGHT file distributed with this work for additional
+ * information regarding copyright ownership.
+ */
+
+#pragma once
+
+/*! \file isc/sieve.h */
+
+/*
+ * Zhang, Yazhuo, Juncheng Yang, Yao Yue, Ymir Vigfusson, and K V Rashmi.
+ * “SIEVE Is Simpler than LRU: An Efficient Turn-Key Eviction Algorithm for
+ * Web Caches,” n.d.
+ *
+ * Algorithm 1 SIEVE
+ *
+ * Input: The request x, doubly-linked queue T, cache size C, hand p
+ *  1: if x is in T then                       ▷ Cache Hit
+ *  2:     x.visited ← 1
+ *  3: else                                    ▷ Cache Miss
+ *  4:     if |T| = C then                     ▷ Cache Full
+ *  5:         o ← p
+ *  6:         if o is NULL then
+ *  7:             o ← tail of T
+ *  8:         while o.visited = 1 do
+ *  9:             o.visited ← 0
+ * 10:             o ← o.prev
+ * 11:             if o is NULL then
+ * 12:                 o ← tail of T
+ * 13:         p ← o.prev
+ * 14:         Discard o in T                  ▷ Eviction
+ * 15:     Insert x in the head of T.
+ * 16:     x.visited ← 0                       ▷ Insertion
+ *
+ * Data structure.  SIEVE requires only one FIFO queue and one pointer
+ * called “hand”.  The queue maintains the insertion order between objects.
+ * Each object in the queue uses one bit to track the visited/non-visited
+ * status.  The hand points to the next eviction candidate in the cache and
+ * moves from the tail to the head.  Note that, unlike existing algorithms,
+ * e.g., LRU, FIFO, and CLOCK, in which the eviction candidate is always
+ * the tail object, the eviction candidate in SIEVE is an object somewhere
+ * in the queue.
+ *
+ * SIEVE operations.  A cache hit in SIEVE changes the visited bit of the
+ * accessed object to 1.  For a popular object whose visited bit is already
+ * 1, SIEVE does not need to perform any operation.  During a cache miss,
+ * SIEVE examines the object pointed by the hand.  If it has been visited,
+ * the visited bit is reset, and the hand moves to the next position (the
+ * retained object stays in the original position of the queue).  It
+ * continues this process until it encounters an object with the visited
+ * bit being 0, and it evicts the object.  After the eviction, the hand
+ * points to the next position (the previous object in the queue).  While
+ * an evicted object is in the middle of the queue most of the time, a new
+ * object is always inserted into the head of the queue.  In other words,
+ * the new objects and the retained objects are not mixed together.
+ *
+ * At first glance, SIEVE is similar to CLOCK/Second Chance/FIFO-Reinsertion.
+ * Each algorithm maintains a single queue in which each object is
+ * associated with a visited bit to track its access status.  Visited
+ * objects are retained (also called "survived") during an eviction.
+ * Notably, new objects are inserted at the head of the queue in both SIEVE
+ * and FIFO-Reinsertion.  However, the hand in SIEVE moves from the tail to
+ * the head over time, whereas the hand in FIFO-Reinsertion stays at the
+ * tail.  The key difference is where a retained object is kept.  SIEVE
+ * keeps it in the old position, while FIFO-Reinsertion inserts it at the
+ * head, together with newly inserted objects.
+ *
+ * We detail the algorithm in Alg. 1.  Line 1 checks whether there is a
+ * hit, and if so, then line 2 sets the visited bit to one.  In the case of
+ * a cache miss (Line 3), Lines 5-12 identify the object to be evicted.
+ *
+ * Lazy promotion and quick demotion.  Despite a simple design, SIEVE
+ * effectively incorporates both lazy promotion and quick demotion.  An
+ * object is only promoted at the eviction time in lazy promotion.  SIEVE
+ * operates in a similar manner.  However, rather than promoting the object
+ * to the head of the queue, SIEVE keeps the object at its original
+ * location.  The "survived" objects are generally more popular than the
+ * evicted ones, thus, they are likely to be accessed again in the future.
+ * By gathering the "survived" objects, the hand in SIEVE can quickly move
+ * from the tail to the area near the head, where most objects are newly
+ * inserted.  These newly inserted objects are quickly examined by the hand
+ * of SIEVE after they are admitted into the cache, thus achieving quick
+ * demotion.  This eviction mechanism makes SIEVE achieve both lazy
+ * promotion and quick demotion without adding too much overhead.
+ *
+ * The key ingredient of SIEVE is the moving hand, which functions like an
+ * adaptive filter that removes unpopular objects from the cache.  This
+ * mechanism enables SIEVE to strike a balance between finding new popular
+ * objects and keeping old popular objects.
+ */
+
+#include <isc/list.h>
+
+#define ISC_SIEVE(type)              /* anonymous struct: queue + hand */ \
+       struct {                     \
+               ISC_LIST(type) list; /* the queue; ISC_SIEVE_INSERT prepends to it */ \
+               type *hand;          /* saved scan position; NULL means "not set" */ \
+       }
+#define ISC_SIEVE_INIT(sieve)                \
+       {                                    \
+               ISC_LIST_INIT((sieve).list); \
+               (sieve).hand = NULL;         \
+       }
+#define ISC_SIEVE_EMPTY(sieve) ISC_LIST_EMPTY((sieve).list) /* emptiness test, delegated to ISC_LIST_EMPTY on the queue */
+
+#define ISC_SIEVE_MARKED(entry, visited) CMM_LOAD_SHARED((entry)->visited)
+#define ISC_SIEVE_MARK(entry, visited)                    \
+       if (!ISC_SIEVE_MARKED(entry, visited)) {          \
+               CMM_STORE_SHARED((entry)->visited, true); \
+       }
+#define ISC_SIEVE_UNMARK(entry, visited) \
+       CMM_STORE_SHARED((entry)->visited, false)
+
+/*
+ * Note: To match the original algorithm design, the
+ * SIEVE queue is iterated from tail to head.
+ */
+#define ISC_SIEVE_NEXT(sieve, visited, link)                                  \
+       ({                                                                    \
+               __typeof__((sieve).hand) __hand = ((sieve).hand);             \
+               if (__hand == NULL && !ISC_LIST_EMPTY((sieve).list)) {        \
+                       __hand = ISC_LIST_TAIL((sieve).list);                 \
+               }                                                             \
+                                                                              \
+               while (__hand != NULL && ISC_SIEVE_MARKED(__hand, visited)) { \
+                       ISC_SIEVE_UNMARK(__hand, visited);                    \
+                                                                              \
+                       __hand = ISC_LIST_PREV(__hand, link);                 \
+                       if (__hand == NULL) {                                 \
+                               /* We know the queue is not empty */          \
+                               __hand = ISC_LIST_TAIL((sieve).list);         \
+                       }                                                     \
+               }                                                             \
+               (sieve).hand = __hand;                                        \
+               __hand;                                                       \
+       })
+
+#define ISC_SIEVE_UNLINK(sieve, entry, link)                                 \
+       ({                                                                   \
+               __typeof__((sieve).hand) __hand = (sieve).hand;              \
+               /* 1. Go to the previous node (possibly head of the list) */ \
+               if (entry == __hand) {                                       \
+                       __hand = ISC_LIST_PREV(entry, link);                 \
+               }                                                            \
+                                                                             \
+               /* 2. Unlink the node from the list */                       \
+               ISC_LIST_UNLINK((sieve).list, entry, link);                  \
+                                                                             \
+               /* 3. We reached head, continue with tail again */           \
+               if (__hand == NULL && !ISC_LIST_EMPTY((sieve).list)) {       \
+                       __hand = ISC_LIST_TAIL((sieve).list);                \
+               }                                                            \
+                                                                             \
+               (sieve).hand = __hand;                                       \
+       })
+
+#define ISC_SIEVE_INSERT(sieve, entry, link) \
+       ISC_LIST_PREPEND((sieve).list, entry, link) /* new entries go in via ISC_LIST_PREPEND; neither the hand nor the entry's visited flag is touched here */
index 24101becb9aadc4e391f67c8add25e208508d57e..4b1de18f7dd08d00831a87a8ca91efb2195281fd 100644 (file)
@@ -112,6 +112,16 @@ overmempurge_addrdataset(dns_db_t *db, isc_stdtime_t now, int idx,
        dns_db_detachnode(db, &node);
 }
 
+/*
+ * Test helper: call cleanup_deadnodes() once for every bucket of the
+ * cache database 'db', bracketed by qpcache_ref()/qpcache_unref().
+ */
+static void
+cleanup_all_deadnodes(dns_db_t *db) {
+       qpcache_t *cache = (qpcache_t *)db;
+       uint16_t bucket = 0;
+
+       qpcache_ref(cache);
+       while (bucket < cache->buckets_count) {
+               cleanup_deadnodes(cache, bucket);
+               bucket++;
+       }
+       qpcache_unref(cache);
+}
+
 ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
        size_t maxcache = 2097152U; /* 2MB - same as DNS_CACHE_MINSIZE */
        size_t hiwater = maxcache - (maxcache >> 3); /* borrowed from cache.c */
@@ -150,6 +160,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
         */
        while (i-- > 0) {
                overmempurge_addrdataset(db, now, i, 50054, 65535, false);
+               cleanup_all_deadnodes(db);
                if (verbose) {
                        print_message("# inuse: %zd max: %zd\n",
                                      isc_mem_inuse(mctx2), maxcache);
@@ -200,6 +211,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
         */
        while (i-- > 0) {
                overmempurge_addrdataset(db, now, i, 50054, 0, true);
+               cleanup_all_deadnodes(db);
                if (verbose) {
                        print_message("# inuse: %zd max: %zd\n",
                                      isc_mem_inuse(mctx2), maxcache);