#include <inttypes.h>
#include <stdalign.h>
#include <stdbool.h>
-#include <sys/mman.h>
#include <isc/ascii.h>
#include <isc/async.h>
#include <isc/refcount.h>
#include <isc/result.h>
#include <isc/rwlock.h>
+#include <isc/sieve.h>
#include <isc/stdio.h>
#include <isc/string.h>
#include <isc/time.h>
*/
#define QPDB_VIRTUAL 300
-/*%
- * Whether to rate-limit updating the LRU to avoid possible thread contention.
- * Updating LRU requires write locking, so we don't do it every time the
- * record is touched - only after some time passes.
- */
-#ifndef DNS_QPDB_LIMITLRUUPDATE
-#define DNS_QPDB_LIMITLRUUPDATE 1
-#endif
-
-/*% Time after which we update LRU for glue records, 5 minutes */
-#define DNS_QPDB_LRUUPDATE_GLUE 300
-/*% Time after which we update LRU for all other records, 10 minutes */
-#define DNS_QPDB_LRUUPDATE_REGULAR 600
-
/*
* This defines the number of headers that we try to expire each time the
* expire_ttl_headers() is run. The number should be small enough, so the
#define DNS_QPDB_EXPIRE_TTL_COUNT 10
/*%
- * This is the structure that is used for each node in the qp trie of trees.
+ * This is the structure that is used for each node in the qp trie of
+ * trees.
*/
typedef struct qpcnode qpcnode_t;
struct qpcnode {
/* Per-bucket lock. */
isc_rwlock_t lock;
- /*
- * Linked list used to implement LRU cache cleaning.
- */
- dns_slabheaderlist_t lru;
-
/*
* The heap is used for TTL based expiry. Note that qpcache->hmctx
* is the memory context to use for heap memory; this differs from
*/
isc_heap_t *heap;
+ /* SIEVE-LRU cache cleaning state. */
+ ISC_SIEVE(dns_slabheader_t) sieve;
+
/* Padding to prevent false sharing between locks. */
uint8_t __padding[ISC_OS_CACHELINE_SIZE -
(sizeof(isc_queue_t) + sizeof(isc_rwlock_t) +
- sizeof(dns_slabheaderlist_t) + sizeof(isc_heap_t *)) %
+ sizeof(isc_heap_t *) +
+ sizeof(ISC_SIEVE(dns_slabheader_t))) %
ISC_OS_CACHELINE_SIZE];
} qpcache_bucket_t;
*/
uint32_t serve_stale_refresh;
- /*
- * Start point % node_lock_count for next LRU cleanup.
- */
- atomic_uint lru_sweep;
-
- /*
- * When performing LRU cleaning limit cleaning to headers that were
- * last used at or before this.
- */
- _Atomic(isc_stdtime_t) last_used;
-
/* Locked by tree_lock. */
dns_qp_t *tree;
dns_qp_t *nsec;
static dns_dbmethods_t qpdb_cachemethods;
+static void
+cleanup_deadnodes_cb(void *arg);
+
/*%
* 'init_count' is used to initialize 'newheader->count' which in turn
* is used to determine where in the cycle rrset-order cyclic starts.
* Failure to follow this hierarchy can result in deadlock.
*/
-/*%
- * Routines for LRU-based cache management.
+/*
+ * Cache-eviction routines.
*/
-/*%
- * See if a given cache entry that is being reused needs to be updated
- * in the LRU-list. From the LRU management point of view, this function is
- * expected to return true for almost all cases. When used with threads,
- * however, this may cause a non-negligible performance penalty because a
- * writer lock will have to be acquired before updating the list.
- * If DNS_QPDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
- * function returns true if the entry has not been updated for some period of
- * time. We differentiate the NS or glue address case and the others since
- * experiments have shown that the former tends to be accessed relatively
- * infrequently and the cost of cache miss is higher (e.g., a missing NS records
- * may cause external queries at a higher level zone, involving more
- * transactions).
- *
- * Caller must hold the node (read or write) lock.
- */
-static bool
-need_headerupdate(dns_slabheader_t *header, isc_stdtime_t now) {
- if (DNS_SLABHEADER_GETATTR(header, (DNS_SLABHEADERATTR_NONEXISTENT |
- DNS_SLABHEADERATTR_ANCIENT |
- DNS_SLABHEADERATTR_ZEROTTL)) != 0)
- {
- return false;
- }
+static void
+expireheader(dns_slabheader_t *header, isc_rwlocktype_t *nlocktypep,
+ isc_rwlocktype_t *tlocktypep, dns_expire_t reason DNS__DB_FLARG);
-#if DNS_QPDB_LIMITLRUUPDATE
- if (header->type == dns_rdatatype_ns ||
- (header->trust == dns_trust_glue &&
- dns_rdatatype_isaddr(header->type)))
- {
- /*
- * Glue records are updated if at least DNS_QPDB_LRUUPDATE_GLUE
- * seconds have passed since the previous update time.
- */
- return header->last_used + DNS_QPDB_LRUUPDATE_GLUE <= now;
+static size_t
+rdataset_size(dns_slabheader_t *header) {
+ if (EXISTS(header)) { /* size used for cache-purge accounting */
+ return dns_rdataslab_size(header);
}
- /*
- * Other records are updated if DNS_QPDB_LRUUPDATE_REGULAR seconds
- * have passed.
- */
- return header->last_used + DNS_QPDB_LRUUPDATE_REGULAR <= now;
-#else
- UNUSED(now);
-
- return true;
-#endif /* if DNS_QPDB_LIMITLRUUPDATE */
+ return sizeof(*header); /* EXISTS() is false: count only the header */
}
-/*%
- * Update the timestamp of a given cache entry and move it to the head
- * of the corresponding LRU list.
- *
- * Caller must hold the node (write) lock.
- *
- * Note that the we do NOT touch the heap here, as the TTL has not changed.
- */
static void
-update_header(qpcache_t *qpdb, dns_slabheader_t *header, isc_stdtime_t now) {
- /* To be checked: can we really assume this? XXXMLG */
- INSIST(ISC_LINK_LINKED(header, link));
+expire_lru_headers(qpcache_t *qpdb, uint32_t idx, size_t requested,
+ isc_rwlocktype_t *nlocktypep,
+ isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+ size_t expired = 0; /* bytes accounted to headers expired so far */
+
+ do {
+ dns_slabheader_t *header =
+ ISC_SIEVE_NEXT(qpdb->buckets[idx].sieve, visited, link);
+ if (header == NULL) { /* no candidate left in bucket 'idx' */
+ return;
+ }
+
+ ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
- ISC_LIST_UNLINK(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
- link);
- header->last_used = now;
- ISC_LIST_PREPEND(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
- link);
+ expired += rdataset_size(header); /* measured before expiring */
+
+ expireheader(header, nlocktypep, tlocktypep,
+ dns_expire_lru DNS__DB_FLARG_PASS);
+ } while (expired < requested); /* at least one candidate is tried */
}
static void
-maybe_update_headers(qpcache_t *qpdb, dns_slabheader_t *found,
- dns_slabheader_t *foundsig, isc_rwlock_t *nlock,
- isc_rwlocktype_t *nlocktypep, isc_stdtime_t now) {
- if (need_headerupdate(found, now) ||
- (foundsig != NULL && need_headerupdate(foundsig, now)))
- {
- if (*nlocktypep != isc_rwlocktype_write) {
- NODE_FORCEUPGRADE(nlock, nlocktypep);
- }
- if (need_headerupdate(found, now)) {
- update_header(qpdb, found, now);
- }
- if (foundsig != NULL && need_headerupdate(foundsig, now)) {
- update_header(qpdb, foundsig, now);
- }
+qpcache_miss(qpcache_t *qpdb, dns_slabheader_t *newheader,
+ isc_rwlocktype_t *nlocktypep,
+ isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+ uint32_t idx = HEADERNODE(newheader)->locknum; /* bucket of the new header */
+
+ isc_heap_insert(qpdb->buckets[idx].heap, newheader);
+ newheader->heap = qpdb->buckets[idx].heap;
+
+ if (isc_mem_isovermem(qpdb->common.mctx)) {
+ /*
+ * Maximum estimated size of the data being added: The size
+ * of the rdataset, plus a new QP database node and nodename,
+ * and a possible additional NSEC node and nodename. Also add
+ * a 12k margin for a possible QP-trie chunk allocation.
+ * (It's okay to overestimate, we want to get cache memory
+ * down quickly.)
+ *
+ * Only the SIEVE list of the new header's own bucket (idx)
+ * is purged here, and the new header is linked into that
+ * list only after the purge has run.
+ */
+
+ size_t purgesize =
+ 2 * (sizeof(qpcnode_t) +
+ dns_name_size(&HEADERNODE(newheader)->name)) +
+ rdataset_size(newheader) + 12288;
+
+ expire_lru_headers(qpdb, idx, purgesize, nlocktypep,
+ tlocktypep DNS__DB_FLARG_PASS);
}
+
+ ISC_SIEVE_INSERT(qpdb->buckets[idx].sieve, newheader, link);
}
-/*
- * Locking:
- * If a routine is going to lock more than one lock in this module, then
- * the locking must be done in the following order:
- *
- * Tree Lock
- *
- * Node Lock (Only one from the set may be locked at one time by
- * any caller)
- *
- * Database Lock
- *
- * Failure to follow this hierarchy can result in deadlock.
- *
- * Deleting Nodes:
- * For zone databases the node for the origin of the zone MUST NOT be deleted.
- */
+static void
+qpcache_hit(qpcache_t *qpdb ISC_ATTR_UNUSED, dns_slabheader_t *header) {
+ /*
+ * On cache hit, we only mark the header as seen.
+ *
+ * ISC_SIEVE_MARK() stores true into header->visited unless the
+ * flag is already set. The header is not moved within its
+ * bucket's SIEVE list here, and 'qpdb' is not used.
+ */
+ ISC_SIEVE_MARK(header, visited);
+}
/*
* DB Routines
tlocktype DNS__DB_FLARG_PASS);
}
-static void
-cleanup_deadnodes(void *arg);
-
/*
* Decrement the external references to a node. If the counter
* goes to zero, decrement the node use counter in the qpcache object
isc_loop_t *loop = isc_loop_get(qpdb->loopmgr,
node->locknum);
- isc_async_run(loop, cleanup_deadnodes, qpdb);
+ qpcache_ref(qpdb);
+ isc_async_run(loop, cleanup_deadnodes_cb, qpdb);
}
}
header->expire = newts;
- if (header->db == NULL || !dns_db_iscache(header->db)) {
- return;
- }
-
- /*
- * This is a cache. Adjust the heaps if necessary.
- */
if (header->heap == NULL || header->heap_index == 0 || newts == oldts) {
return;
}
dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
bindrdataset(qpdb, qpnode, found, now, nlocktype, tlocktype,
rdataset DNS__DB_FLARG_PASS);
+ qpcache_hit(qpdb, found);
if (!NEGATIVE(found) && foundsig != NULL) {
bindrdataset(qpdb, qpnode, foundsig, now, nlocktype, tlocktype,
sigrdataset DNS__DB_FLARG_PASS);
+ qpcache_hit(qpdb, foundsig);
}
}
search->zonecut_sigheader, search->now, nlocktype,
tlocktype, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search->qpdb, search->zonecut_header,
- search->zonecut_sigheader, nlock,
- &nlocktype, search->now);
NODE_UNLOCK(nlock, &nlocktype);
}
search->now, nlocktype,
isc_rwlocktype_none, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search->qpdb, found, foundsig,
- nlock, &nlocktype, search->now);
}
NODE_UNLOCK(nlock, &nlocktype);
bindrdatasets(search->qpdb, node, found, foundsig, search->now,
nlocktype, isc_rwlocktype_none, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search->qpdb, found, foundsig, nlock,
- &nlocktype, search->now);
dns_name_copy(fname, foundname);
result = DNS_R_COVERINGNSEC;
bindrdatasets(search.qpdb, node, nsecheader, nsecsig,
search.now, nlocktype, tlocktype,
rdataset, sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search.qpdb, nsecheader, nsecsig,
- nlock, &nlocktype, search.now);
result = DNS_R_COVERINGNSEC;
goto node_exit;
}
bindrdatasets(search.qpdb, node, nsheader, nssig,
search.now, nlocktype, tlocktype,
rdataset, sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search.qpdb, nsheader, nssig,
- nlock, &nlocktype, search.now);
result = DNS_R_DELEGATION;
goto node_exit;
}
bindrdatasets(search.qpdb, node, found, foundsig, search.now,
nlocktype, tlocktype, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search.qpdb, found, foundsig, nlock,
- &nlocktype, search.now);
}
node_exit:
bindrdatasets(search->qpdb, node, found, foundsig, search->now,
nlocktype, *tlocktype, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(search->qpdb, found, foundsig, nlock, &nlocktype,
- search->now);
NODE_UNLOCK(nlock, &nlocktype);
bindrdatasets(qpdb, qpnode, found, foundsig, search.now,
nlocktype, isc_rwlocktype_none, rdataset,
sigrdataset DNS__DB_FLARG_PASS);
- maybe_update_headers(qpdb, found, foundsig, nlock, &nlocktype,
- search.now);
}
NODE_UNLOCK(nlock, &nlocktype);
INSIST(tlocktype == isc_rwlocktype_none);
}
-static size_t
-rdataset_size(dns_slabheader_t *header) {
- if (EXISTS(header)) {
- return dns_rdataslab_size(header);
- }
-
- return sizeof(*header);
-}
-
-static size_t
-expire_lru_headers(qpcache_t *qpdb, unsigned int locknum,
- isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
- size_t purgesize DNS__DB_FLARG) {
- dns_slabheader_t *header = NULL;
- size_t purged = 0;
-
- for (header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
- header != NULL && header->last_used <= qpdb->last_used &&
- purged <= purgesize;
- header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru))
- {
- size_t header_size = rdataset_size(header);
-
- /*
- * Unlink the entry at this point to avoid checking it
- * again even if it's currently used someone else and
- * cannot be purged at this moment. This entry won't be
- * referenced any more (so unlinking is safe) since the
- * TTL will be reset to 0.
- */
- ISC_LIST_UNLINK(qpdb->buckets[locknum].lru, header, link);
- expireheader(header, nlocktypep, tlocktypep,
- dns_expire_lru DNS__DB_FLARG_PASS);
- purged += header_size;
- }
-
- return purged;
-}
-
-/*%
- * Purge some expired and/or stale (i.e. unused for some period) cache entries
- * due to an overmem condition. To recover from this condition quickly,
- * we clean up entries up to the size of newly added rdata that triggered
- * the overmem; this is accessible via newheader.
- *
- * The LRU lists tails are processed in LRU order to the nearest second.
- *
- * A write lock on the tree must be held.
- */
-static void
-overmem(qpcache_t *qpdb, dns_slabheader_t *newheader,
- isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
- uint32_t locknum_start = qpdb->lru_sweep++ % qpdb->buckets_count;
- uint32_t locknum = locknum_start;
- size_t purgesize, purged = 0;
- isc_stdtime_t min_last_used = 0;
- size_t max_passes = 8;
-
- /*
- * Maximum estimated size of the data being added: The size
- * of the rdataset, plus a new QP database node and nodename,
- * and a possible additional NSEC node and nodename. Also add
- * a 12k margin for a possible QP-trie chunk allocation.
- * (It's okay to overestimate, we want to get cache memory
- * down quickly.)
- */
- purgesize = 2 * (sizeof(qpcnode_t) +
- dns_name_size(&HEADERNODE(newheader)->name)) +
- rdataset_size(newheader) + 12288;
-again:
- do {
- isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
- isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
- NODE_WRLOCK(nlock, &nlocktype);
-
- purged += expire_lru_headers(
- qpdb, locknum, &nlocktype, tlocktypep,
- purgesize - purged DNS__DB_FLARG_PASS);
-
- /*
- * Work out the oldest remaining last_used values of the list
- * tails as we walk across the array of lru lists.
- */
- dns_slabheader_t *header =
- ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
- if (header != NULL &&
- (min_last_used == 0 || header->last_used < min_last_used))
- {
- min_last_used = header->last_used;
- }
- NODE_UNLOCK(nlock, &nlocktype);
- locknum = (locknum + 1) % qpdb->buckets_count;
- } while (locknum != locknum_start && purged <= purgesize);
-
- /*
- * Update qpdb->last_used if we have walked all the list tails and have
- * not freed the required amount of memory.
- */
- if (purged < purgesize) {
- if (min_last_used != 0) {
- qpdb->last_used = min_last_used;
- if (max_passes-- > 0) {
- goto again;
- }
- }
- }
-}
-
/*%
* These functions allow the heap code to rank the priority of each
* element. It returns true if v1 happens "sooner" than v2.
for (i = 0; i < qpdb->buckets_count; i++) {
NODE_DESTROYLOCK(&qpdb->buckets[i].lock);
- INSIST(ISC_LIST_EMPTY(qpdb->buckets[i].lru));
+ INSIST(ISC_SIEVE_EMPTY(qpdb->buckets[i].sieve));
INSIST(isc_queue_empty(&qpdb->buckets[i].deadnodes));
isc_queue_destroy(&qpdb->buckets[i].deadnodes);
* to wait for the tree write lock.
*/
static void
-cleanup_deadnodes(void *arg) {
- qpcache_t *qpdb = arg;
- uint16_t locknum = isc_tid();
+cleanup_deadnodes(qpcache_t *qpdb, uint16_t locknum) {
isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
NODE_WRLOCK(nlock, &nlocktype);
- RUNTIME_CHECK(isc_queue_splice(&deadnodes,
- &qpdb->buckets[locknum].deadnodes));
+ isc_queue_splice(&deadnodes, &qpdb->buckets[locknum].deadnodes);
isc_queue_for_each_entry_safe(&deadnodes, qpnode, qpnext, deadlink) {
qpcnode_release(qpdb, qpnode, &nlocktype,
&tlocktype DNS__DB_FILELINE);
TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
}
+/*
+ * Asynchronous-job wrapper around cleanup_deadnodes().
+ *
+ * 'arg' is the qpcache_t handed to isc_async_run().  The bucket to clean
+ * is selected by the id of the thread running the callback; once the
+ * cleanup is done the cache is unreferenced.
+ */
+static void
+cleanup_deadnodes_cb(void *arg) {
+	qpcache_t *cache = arg;
+
+	cleanup_deadnodes(cache, (uint16_t)isc_tid());
+	qpcache_unref(cache);
+}
/*
* This function is assumed to be called when a node is newly referenced
* and can be in the deadnode list. In that case the node will be references
dns_slabheader_t *prioheader = NULL, *expireheader = NULL;
dns_typepair_t negtype = 0;
dns_trust_t trust;
- int idx;
uint32_t ntypes = 0;
if ((options & DNS_DBADD_FORCE) != 0) {
if (header->expire > newheader->expire) {
setttl(header, newheader->expire);
}
- if (header->last_used != now) {
- ISC_LIST_UNLINK(
- qpdb->buckets[HEADERNODE(header)->locknum]
- .lru,
- header, link);
- header->last_used = now;
- ISC_LIST_PREPEND(
- qpdb->buckets[HEADERNODE(header)->locknum]
- .lru,
- header, link);
- }
+
+ qpcache_hit(qpdb, header);
+
if (header->noqname == NULL &&
newheader->noqname != NULL)
{
if (header->expire > newheader->expire) {
setttl(header, newheader->expire);
}
- if (header->last_used != now) {
- ISC_LIST_UNLINK(
- qpdb->buckets[HEADERNODE(header)->locknum]
- .lru,
- header, link);
- header->last_used = now;
- ISC_LIST_PREPEND(
- qpdb->buckets[HEADERNODE(header)->locknum]
- .lru,
- header, link);
- }
+
+ qpcache_hit(qpdb, header);
+
if (header->noqname == NULL &&
newheader->noqname != NULL)
{
return ISC_R_SUCCESS;
}
- idx = HEADERNODE(newheader)->locknum;
- isc_heap_insert(qpdb->buckets[idx].heap, newheader);
- newheader->heap = qpdb->buckets[idx].heap;
- if (ZEROTTL(newheader)) {
- newheader->last_used = qpdb->last_used + 1;
- ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
- link);
- } else {
- ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
- link);
- }
+ qpcache_miss(qpdb, newheader, &nlocktype,
+ &tlocktype DNS__DB_FLARG_PASS);
+
if (topheader_prev != NULL) {
topheader_prev->next = newheader;
} else {
/* No rdatasets of the given type exist at the node. */
INSIST(newheader->down == NULL);
- idx = HEADERNODE(newheader)->locknum;
- isc_heap_insert(qpdb->buckets[idx].heap, newheader);
- newheader->heap = qpdb->buckets[idx].heap;
- if (ZEROTTL(newheader)) {
- ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
- link);
- } else {
- ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
- link);
- }
-
+ qpcache_miss(qpdb, newheader, &nlocktype,
+ &tlocktype DNS__DB_FLARG_PASS);
if (prio_header(newheader)) {
/* This is a priority type, prepend it */
newheader->next = qpnode->data;
static void
expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
- isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG);
+ isc_stdtime_t now DNS__DB_FLARG);
static isc_result_t
qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
isc_rwlock_t *nlock = NULL;
- bool cache_is_overmem = false;
dns_fixedname_t fixed;
dns_name_t *name = NULL;
isc_stdtime_t now = __now ? __now : isc_stdtime_now();
newheader = (dns_slabheader_t *)region.base;
dns_slabheader_reset(newheader, db, node);
- newheader->last_used = now;
-
/*
* By default, dns_rdataslab_fromrdataset() sets newheader->ttl
* to the rdataset TTL. In the case of the cache, that's wrong;
/*
* Add to the auxiliary NSEC tree if we're adding an NSEC record.
*/
- TREE_RDLOCK(&qpdb->tree_lock, &tlocktype);
- if (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
- rdataset->type == dns_rdatatype_nsec)
- {
- newnsec = true;
- } else {
- newnsec = false;
- }
- TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
+ newnsec = (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
+ rdataset->type == dns_rdatatype_nsec);
/*
- * If we're adding a delegation type, adding to the auxiliary NSEC
- * tree, or the DB is a cache in an overmem state, hold an
- * exclusive lock on the tree. In the latter case the lock does
- * not necessarily have to be acquired but it will help purge
- * ancient entries more effectively.
+ * If we're adding a delegation type or adding to the auxiliary
+ * NSEC tree, hold an exclusive lock on the tree.
*/
- if (isc_mem_isovermem(qpdb->common.mctx)) {
- cache_is_overmem = true;
- }
- if (delegating || newnsec || cache_is_overmem) {
+ if (delegating || newnsec) {
TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
}
- if (cache_is_overmem) {
- overmem(qpdb, newheader, &tlocktype DNS__DB_FLARG_PASS);
- }
-
nlock = &qpdb->buckets[qpnode->locknum].lock;
NODE_WRLOCK(nlock, &nlocktype);
true);
}
- expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype, now,
- cache_is_overmem DNS__DB_FLARG_PASS);
+ expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype,
+ now DNS__DB_FLARG_PASS);
- /*
- * If we've been holding a write lock on the tree just for
- * cleaning, we can release it now. However, we still need the
- * node lock.
- */
- if (tlocktype == isc_rwlocktype_write && !delegating && !newnsec) {
- TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
- }
-
- result = ISC_R_SUCCESS;
if (newnsec) {
qpcnode_t *nsecnode = NULL;
result = dns_qp_getname(qpdb->nsec, name, (void **)&nsecnode,
NULL);
- if (result == ISC_R_SUCCESS) {
- result = ISC_R_SUCCESS;
- } else {
+ if (result != ISC_R_SUCCESS) {
INSIST(nsecnode == NULL);
nsecnode = new_qpcnode(qpdb, name);
nsecnode->nsec = DNS_DB_NSEC_NSEC;
qpnode->nsec = DNS_DB_NSEC_HAS_NSEC;
}
- if (result == ISC_R_SUCCESS) {
- result = add(qpdb, qpnode, name, newheader, options,
- addedrdataset, now, nlocktype,
- tlocktype DNS__DB_FLARG_PASS);
- }
+ result = add(qpdb, qpnode, name, newheader, options, addedrdataset, now,
+ nlocktype, tlocktype DNS__DB_FLARG_PASS);
+
if (result == ISC_R_SUCCESS && delegating) {
qpnode->delegating = 1;
}
if (tlocktype != isc_rwlocktype_none) {
TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
}
+
INSIST(tlocktype == isc_rwlocktype_none);
return result;
dns_rdatasetstats_create(mctx, &qpdb->rrsetstats);
for (i = 0; i < (int)qpdb->buckets_count; i++) {
- ISC_LIST_INIT(qpdb->buckets[i].lru);
+ ISC_SIEVE_INIT(qpdb->buckets[i].sieve);
qpdb->buckets[i].heap = NULL;
isc_heap_create(hmctx, ttl_sooner, set_index, 0,
void *data) {
dns_slabheader_t *header = data;
qpcache_t *qpdb = (qpcache_t *)header->db;
+ int idx = HEADERNODE(header)->locknum;
if (header->heap != NULL && header->heap_index != 0) {
isc_heap_delete(header->heap, header->heap_index);
atomic_load_acquire(&header->attributes), false);
if (ISC_LINK_LINKED(header, link)) {
- int idx = HEADERNODE(header)->locknum;
- ISC_LIST_UNLINK(qpdb->buckets[idx].lru, header, link);
+ ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
}
if (header->noqname != NULL) {
static void
expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
- isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG) {
+ isc_stdtime_t now DNS__DB_FLARG) {
isc_heap_t *heap = qpdb->buckets[locknum].heap;
for (size_t i = 0; i < DNS_QPDB_EXPIRE_TTL_COUNT; i++) {
return;
}
- dns_ttl_t ttl = header->expire;
-
- if (!cache_is_overmem) {
- /* Only account for stale TTL if cache is not overmem */
- ttl += STALE_TTL(header, qpdb);
- }
+ dns_ttl_t ttl = header->expire + STALE_TTL(header, qpdb);
if (ttl >= now - QPDB_VIRTUAL) {
/*
--- /dev/null
+/*
+ * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
+ *
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, you can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * See the COPYRIGHT file distributed with this work for additional
+ * information regarding copyright ownership.
+ */
+
+#pragma once
+
+/*! \file isc/sieve.h */
+
+/*
+ * Zhang, Yazhuo, Juncheng Yang, Yao Yue, Ymir Vigfusson, and K V Rashmi.
+ * “SIEVE Is Simpler than LRU: An Efficient Turn-Key Eviction Algorithm for
+ * Web Caches,” n.d.
+ *
+ * Algorithm 1 SIEVE
+ *
+ * Input: The request x, doubly-linked queue T, cache size C, hand p
+ *  1: if x is in T then                      ▷ Cache Hit
+ *  2:     x.visited ← 1
+ *  3: else                                   ▷ Cache Miss
+ *  4:     if |T| = C then                    ▷ Cache Full
+ *  5:         o ← p
+ *  6:         if o is NULL then
+ *  7:             o ← tail of T
+ *  8:         while o.visited = 1 do
+ *  9:             o.visited ← 0
+ * 10:             o ← o.prev
+ * 11:             if o is NULL then
+ * 12:                 o ← tail of T
+ * 13:         p ← o.prev
+ * 14:         Discard o in T                 ▷ Eviction
+ * 15:     Insert x in the head of T
+ * 16:     x.visited ← 0                      ▷ Insertion
+ *
+ * Data structure. SIEVE requires only one FIFO queue and one pointer
+ * called “hand”. The queue maintains the insertion order between objects.
+ * Each object in the queue uses one bit to track the visited/non-visited
+ * status. The hand points to the next eviction candidate in the cache and
+ * moves from the tail to the head. Note that, unlike existing algorithms,
+ * e.g., LRU, FIFO, and CLOCK, in which the eviction candidate is always
+ * the tail object, the eviction candidate in SIEVE is an object somewhere
+ * in the queue.
+ *
+ * SIEVE operations. A cache hit in SIEVE changes the visited bit of the
+ * accessed object to 1. For a popular object whose visited bit is already
+ * 1, SIEVE does not need to perform any operation. During a cache miss,
+ * SIEVE examines the object pointed by the hand. If it has been visited,
+ * the visited bit is reset, and the hand moves to the next position (the
+ * retained object stays in the original position of the queue). It
+ * continues this process until it encounters an object with the visited
+ * bit being 0, and it evicts the object. After the eviction, the hand
+ * points to the next position (the previous object in the queue). While
+ * an evicted object is in the middle of the queue most of the time, a new
+ * object is always inserted into the head of the queue. In other words,
+ * the new objects and the retained objects are not mixed together.
+ *
+ * At first glance, SIEVE is similar to CLOCK/Second Chance/FIFO-Reinsertion.
+ * Each algorithm maintains a single queue in which each object is
+ * associated with a visited bit to track its access status. Visited
+ * objects are retained (also called "survived") during an eviction.
+ * Notably, new objects are inserted at the head of the queue in both SIEVE
+ * and FIFO-Reinsertion. However, the hand in SIEVE moves from the tail to
+ * the head over time, whereas the hand in FIFO-Reinsertion stays at the
+ * tail. The key difference is where a retained object is kept. SIEVE
+ * keeps it in the old position, while FIFO-Reinsertion inserts it at the
+ * head, together with newly inserted objects.
+ *
+ * We detail the algorithm in Alg. 1. Line 1 checks whether there is a
+ * hit, and if so, then line 2 sets the visited bit to one. In the case of
+ * a cache miss (Line 3), Lines 5-12 identify the object to be evicted.
+ *
+ * Lazy promotion and quick demotion. Despite a simple design, SIEVE
+ * effectively incorporates both lazy promotion and quick demotion. An
+ * object is only promoted at the eviction time in lazy promotion. SIEVE
+ * operates in a similar manner. However, rather than promoting the object
+ * to the head of the queue, SIEVE keeps the object at its original
+ * location. The "survived" objects are generally more popular than the
+ * evicted ones, thus, they are likely to be accessed again in the future.
+ * By gathering the "survived" objects, the hand in SIEVE can quickly move
+ * from the tail to the area near the head, where most objects are newly
+ * inserted. These newly inserted objects are quickly examined by the hand
+ * of SIEVE after they are admitted into the cache, thus achieving quick
+ * demotion. This eviction mechanism makes SIEVE achieve both lazy
+ * promotion and quick demotion without adding too much overhead.
+ *
+ * The key ingredient of SIEVE is the moving hand, which functions like an
+ * adaptive filter that removes unpopular objects from the cache. This
+ * mechanism enables SIEVE to strike a balance between finding new popular
+ * objects and keeping old popular objects.
+ */
+
+#include <isc/list.h>
+
+#define ISC_SIEVE(type) \
+ struct { \
+ ISC_LIST(type) list; \
+ type *hand; \
+ }
+#define ISC_SIEVE_INIT(sieve) \
+ { \
+ ISC_LIST_INIT((sieve).list); \
+ (sieve).hand = NULL; \
+ }
+#define ISC_SIEVE_EMPTY(sieve) ISC_LIST_EMPTY((sieve).list)
+
+#define ISC_SIEVE_MARKED(entry, visited) CMM_LOAD_SHARED((entry)->visited)
+#define ISC_SIEVE_MARK(entry, visited) \
+ if (!ISC_SIEVE_MARKED(entry, visited)) { \
+ CMM_STORE_SHARED((entry)->visited, true); \
+ }
+#define ISC_SIEVE_UNMARK(entry, visited) \
+ CMM_STORE_SHARED((entry)->visited, false)
+
+/*
+ * Pick the next eviction candidate and leave the hand pointing at it.
+ *
+ * Note: To match the original algorithm design, the SIEVE queue is
+ * iterated from tail to head.  Starting at the hand (or at the tail when
+ * there is no hand), every entry whose 'visited' flag is set gets the
+ * flag cleared and is skipped; walking past the head wraps around to the
+ * tail.  The expression evaluates to the first entry found with the flag
+ * clear, or to NULL when the queue is empty.
+ */
+#define ISC_SIEVE_NEXT(sieve, visited, link) \
+	({ \
+		__typeof__((sieve).hand) __victim = (sieve).hand; \
+		if (__victim == NULL && !ISC_LIST_EMPTY((sieve).list)) { \
+			__victim = ISC_LIST_TAIL((sieve).list); \
+		} \
+		\
+		for (;;) { \
+			if (__victim == NULL || \
+			    !ISC_SIEVE_MARKED(__victim, visited)) \
+			{ \
+				break; \
+			} \
+			\
+			/* Survivor: clear the flag, step towards the head */ \
+			ISC_SIEVE_UNMARK(__victim, visited); \
+			__victim = ISC_LIST_PREV(__victim, link); \
+			if (__victim == NULL) { \
+				/* Passed the head; the queue is not empty */ \
+				__victim = ISC_LIST_TAIL((sieve).list); \
+			} \
+		} \
+		(sieve).hand = __victim; \
+		__victim; \
+	})
+
+/*
+ * Remove 'entry' from 'sieve', keeping the hand valid.
+ *
+ * If the hand points at 'entry' it is first moved to the previous
+ * element (towards the head).  After the removal, a NULL hand is reset
+ * to the tail of the queue unless the queue is now empty, in which case
+ * the hand stays NULL.
+ *
+ * 'entry' is evaluated exactly once, so it may be an arbitrary pointer
+ * expression.
+ */
+#define ISC_SIEVE_UNLINK(sieve, entry, link) \
+	({ \
+		__typeof__(entry) __sieve_entry = (entry); \
+		__typeof__((sieve).hand) __hand = (sieve).hand; \
+		/* 1. Go to the previous node (possibly head of the list) */ \
+		if (__sieve_entry == __hand) { \
+			__hand = ISC_LIST_PREV(__sieve_entry, link); \
+		} \
+		\
+		/* 2. Unlink the node from the list */ \
+		ISC_LIST_UNLINK((sieve).list, __sieve_entry, link); \
+		\
+		/* 3. We reached head, continue with tail again */ \
+		if (__hand == NULL && !ISC_LIST_EMPTY((sieve).list)) { \
+			__hand = ISC_LIST_TAIL((sieve).list); \
+		} \
+		\
+		(sieve).hand = __hand; \
+	})
+
+/*
+ * Link 'entry' at the head of the queue.  The hand and the entry's
+ * visited flag are not touched here.
+ *
+ * NOTE(review): Algorithm 1 clears x.visited on insertion; presumably
+ * callers pass entries whose flag is already clear -- confirm.
+ */
+#define ISC_SIEVE_INSERT(sieve, entry, link) \
+	ISC_LIST_PREPEND((sieve).list, entry, link)