]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
2475. [bug] LRU cache cleanup under overmem condition could purge
authorMark Andrews <marka@isc.org>
Mon, 27 Oct 2008 03:52:43 +0000 (03:52 +0000)
committerMark Andrews <marka@isc.org>
Mon, 27 Oct 2008 03:52:43 +0000 (03:52 +0000)
                        particular entries more aggresively. [RT #17628]

CHANGES
lib/dns/rbtdb.c

diff --git a/CHANGES b/CHANGES
index 099d3ca434fcf03d547aef002e4cbcfa8962c3cf..4575338d2b61043f02cdb2b30a2d4d6d39f52e7e 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,6 @@
+2475.  [bug]           LRU cache cleanup under overmem condition could purge
+                       particular entries more aggresively. [RT #17628]
+
 2474.  [bug]           ACL structures could be allocated with insufficient
                        space, causing an array overrun. [RT #18765]
 
index fa813f99b8fe6fc0d520abe5734fb7f93691ef7a..41d04cbe25e8285dd44f75708ea08b9272846617 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: rbtdb.c,v 1.265 2008/10/24 00:11:17 marka Exp $ */
+/* $Id: rbtdb.c,v 1.266 2008/10/27 03:52:43 marka Exp $ */
 
 /*! \file */
 
@@ -322,7 +322,26 @@ struct acachectl {
        (((header)->attributes & RDATASET_ATTR_OPTOUT) != 0)
 
 #define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
-#define DEFAULT_CACHE_NODE_LOCK_COUNT   1009    /*%< Should be prime. */
+
+/*%
+ * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
+ * There is a tradeoff issue about configuring this value: if this is too
+ * small, it may cause heavier contention between threads; if this is too large,
+ * LRU purge algorithm won't work well (entries tend to be purged prematurely).
+ * The default value should work well for most environments, but this can
+ * also be configurable at compilation time via the
+ * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
+ * 1 due to the assumption of overmem_purge().
+ */
+#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
+#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
+#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger 1"
+#else
+#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
+#endif
+#else
+#define DEFAULT_CACHE_NODE_LOCK_COUNT   16
+#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
 
 typedef struct {
        nodelock_t                      lock;
@@ -499,8 +518,10 @@ static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
                                              isc_stdtime_t now);
 static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
                          isc_stdtime_t now);
-static void check_stale_cache(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode,
-                             isc_stdtime_t now, isc_boolean_t tree_locked);
+static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
+                         isc_boolean_t tree_locked);
+static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
+                         isc_stdtime_t now, isc_boolean_t tree_locked);
 static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
                                  rdatasetheader_t *newheader);
 
@@ -5776,6 +5797,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        rbtdb_version_t *rbtversion = version;
        isc_region_t region;
        rdatasetheader_t *newheader;
+       rdatasetheader_t *header;
        isc_result_t result;
        isc_boolean_t delegating;
        isc_boolean_t tree_locked = ISC_FALSE;
@@ -5871,6 +5893,9 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
                RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
        }
 
+       if (IS_CACHE(rbtdb) && rbtdb->overmem)
+               overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked);
+
        NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
                  isc_rwlocktype_write);
 
@@ -5882,7 +5907,10 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
        if (IS_CACHE(rbtdb)) {
                if (tree_locked)
                        cleanup_dead_nodes(rbtdb, rbtnode->locknum);
-               check_stale_cache(rbtdb, rbtnode, now, tree_locked);
+
+               header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
+               if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL)
+                       expire_header(rbtdb, header, tree_locked);
 
                /*
                 * If we've been holding a write lock on the tree just for
@@ -8323,79 +8351,70 @@ update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 }
 
 /*%
- * Examine the tail entry of the LRU list to see if it expires or is stale
- * (unused for some period).  If so, it's marked as stale and possibly freed.
- * If the DB is in the overmem condition, the tail and the next to tail entries
- * will be unconditionally marked.  We don't care about a race on 'overmem'
- * at the risk of causing some collateral damage or a small delay in starting
- * cleanup, so we don't bother to lock rbtdb.
- *
- * Caller must hold the node (write) lock.
- *
- * We can get away with locking only one node here, since it will lock all
- * other nodes in that lock pool bucket.
+ * Purge some expired and/or stale (i.e. unused for some period) cache entries
+ * under an overmem condition.  To recover from this condition quickly, up to
+ * 2 entries will be purged.  This process is triggered while adding a new
+ * entry, and we specifically avoid purging entries in the same LRU bucket as
+ * the one to which the new entry will belong.  Otherwise, we might purge
+ * entries of the same name of different RR types while adding RRsets from a
+ * single response (consider the case where we're adding A and AAAA glue records
+ * of the same NS name).
  */
 static void
-check_stale_cache(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode,
-                 isc_stdtime_t now, isc_boolean_t tree_locked)
+overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
+             isc_stdtime_t now, isc_boolean_t tree_locked)
 {
-       rdatasetheader_t *victim;
-       isc_boolean_t overmem = rbtdb->overmem;
-       int victims = 0;
-
-       /*
-        * Check for TTL-based expiry.
-        */
-       victim = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
-       if (victim != NULL && victim->rdh_ttl <= now - RBTDB_VIRTUAL) {
-               INSIST(victim->node->locknum == rbtnode->locknum);
-               victims++;
-
-               set_ttl(rbtdb, victim, 0);
-               victim->attributes |= RDATASET_ATTR_STALE;
-               victim->node->dirty = 1;
+       rdatasetheader_t *header, *header_prev;
+       unsigned int locknum;
+       int purgecount = 2;
+
+       for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
+            locknum != locknum_start && purgecount > 0;
+            locknum = (locknum + 1) % rbtdb->node_lock_count) {
+               NODE_LOCK(&rbtdb->node_locks[locknum].lock,
+                         isc_rwlocktype_write);
+
+               header = isc_heap_element(rbtdb->heaps[locknum], 1);
+               if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) {
+                       expire_header(rbtdb, header, tree_locked);
+                       purgecount--;
+               }
 
-               if (dns_rbtnode_refcurrent(victim->node) == 0) {
-                       INSIST(rbtnode != victim->node);
-                       /*
-                        * If no one else is using the node, we can
-                        * clean it up now.  We first need to gain
-                        * a new reference to the node to meet a
-                        * requirement of decrement_reference().
-                        */
-                       new_reference(rbtdb, victim->node);
-                       decrement_reference(rbtdb, victim->node, 0,
-                                           isc_rwlocktype_write,
-                                           tree_locked ? isc_rwlocktype_write :
-                                           isc_rwlocktype_none);
+               for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
+                    header != NULL && purgecount > 0;
+                    header = header_prev) {
+                       header_prev = ISC_LIST_PREV(header, lru_link);
+                       expire_header(rbtdb, header, tree_locked);
+                       purgecount--;
                }
+
+               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+                                   isc_rwlocktype_write);
        }
+}
+
+static void
+expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
+             isc_boolean_t tree_locked)
+{
+       set_ttl(rbtdb, header, 0);
+       header->attributes |= RDATASET_ATTR_STALE;
+       header->node->dirty = 1;
 
        /*
-        * If we are over memory, delete the end entry from the LRU.
+        * Caller must hold the node (write) lock.
         */
-       victim = ISC_LIST_TAIL(rbtdb->rdatasets[rbtnode->locknum]);
-       if (victim != NULL && overmem) {
-               INSIST(victim->node->locknum == rbtnode->locknum);
-               victims++;
-
-               set_ttl(rbtdb, victim, 0);
-               victim->attributes |= RDATASET_ATTR_STALE;
-               victim->node->dirty = 1;
 
-               if (dns_rbtnode_refcurrent(victim->node) == 0) {
-                       INSIST(rbtnode != victim->node);
-                       /*
-                        * If no one else is using the node, we can
-                        * clean it up now.  We first need to gain
-                        * a new reference to the node to meet a
-                        * requirement of decrement_reference().
-                        */
-                       new_reference(rbtdb, victim->node);
-                       decrement_reference(rbtdb, victim->node, 0,
-                                           isc_rwlocktype_write,
-                                           tree_locked ? isc_rwlocktype_write :
-                                           isc_rwlocktype_none);
-               }
+       if (dns_rbtnode_refcurrent(header->node) == 0) {
+               /*
+                * If no one else is using the node, we can clean it up now.
+                * We first need to gain a new reference to the node to meet a
+                * requirement of decrement_reference().
+                */
+               new_reference(rbtdb, header->node);
+               decrement_reference(rbtdb, header->node, 0,
+                                   isc_rwlocktype_write,
+                                   tree_locked ? isc_rwlocktype_write :
+                                   isc_rwlocktype_none);
        }
 }