]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
2596. [bug] Stale tree nodes of cache/dynamic rbtdb could stay
authorTatuya JINMEI 神明達哉 <jinmei@isc.org>
Wed, 6 May 2009 23:34:30 +0000 (23:34 +0000)
committerTatuya JINMEI 神明達哉 <jinmei@isc.org>
Wed, 6 May 2009 23:34:30 +0000 (23:34 +0000)
long, leading to inefficient memory usage or rejecting
newer cache entries in the worst case. [RT #19563]

CHANGES
lib/dns/cache.c
lib/dns/include/dns/events.h
lib/dns/rbtdb.c
lib/isc/hash.c

diff --git a/CHANGES b/CHANGES
index 04d5e01b6a8722e0040df45dcf66ff2468201409..f6d76172eaaf9df9e98ee7b0558f8b10d18ab4e1 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+2596.  [bug]           Stale tree nodes of cache/dynamic rbtdb could stay
+                       long, leading to inefficient memory usage or rejecting
+                       newer cache entries in the worst case. [RT #19563]
+
 2595.  [bug]           Fix unknown extended rcodes in dig. [RT #19625]
 
 2592.  [bug]           Treat "any" as a type in nsupdate. [RT #19455]
index 4c19788d5b8e4f51fabb191f390b4cf332721713..aee824e5135c74671c70f0dc4bcdcc24ce9c5c3e 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: cache.c,v 1.80.50.2 2009/01/18 23:47:35 tbox Exp $ */
+/* $Id: cache.c,v 1.80.50.3 2009/05/06 23:34:30 jinmei Exp $ */
 
 /*! \file */
 
@@ -174,6 +174,7 @@ dns_cache_create(isc_mem_t *mctx, isc_taskmgr_t *taskmgr,
        isc_result_t result;
        dns_cache_t *cache;
        int i;
+       isc_task_t *dbtask;
 
        REQUIRE(cachep != NULL);
        REQUIRE(*cachep == NULL);
@@ -229,6 +230,14 @@ dns_cache_create(isc_mem_t *mctx, isc_taskmgr_t *taskmgr,
        result = cache_create_db(cache, &cache->db);
        if (result != ISC_R_SUCCESS)
                goto cleanup_dbargv;
+       if (taskmgr != NULL) {
+               dbtask = NULL;
+               result = isc_task_create(taskmgr, 1, &dbtask);
+               if (result != ISC_R_SUCCESS)
+                       goto cleanup_db;
+               dns_db_settask(cache->db, dbtask);
+               isc_task_detach(&dbtask);
+       }
 
        cache->filename = NULL;
 
index 7692c1b69639c62fb826df30915c326ebad5a6fb..c272b36d0bb31de9e29a222c00d060f1d2a9998b 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: events.h,v 1.49 2007/06/19 23:47:16 tbox Exp $ */
+/* $Id: events.h,v 1.49.332.1 2009/05/06 23:34:30 jinmei Exp $ */
 
 #ifndef DNS_EVENTS_H
 #define DNS_EVENTS_H 1
@@ -68,6 +68,7 @@
 #define DNS_EVENT_ACACHECONTROL                        (ISC_EVENTCLASS_DNS + 38)
 #define DNS_EVENT_ACACHECLEAN                  (ISC_EVENTCLASS_DNS + 39)
 #define DNS_EVENT_ACACHEOVERMEM                        (ISC_EVENTCLASS_DNS + 40)
+#define DNS_EVENT_RBTPRUNE                     (ISC_EVENTCLASS_DNS + 41)
 
 #define DNS_EVENT_FIRSTEVENT                   (ISC_EVENTCLASS_DNS + 0)
 #define DNS_EVENT_LASTEVENT                    (ISC_EVENTCLASS_DNS + 65535)
index 2506c1148b252d0918071b5f5f4b089fff752e22..9741c157f7840e2691ef635a590754004a63a546 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: rbtdb.c,v 1.270.12.5 2009/04/07 02:50:11 jinmei Exp $ */
+/* $Id: rbtdb.c,v 1.270.12.6 2009/05/06 23:34:30 jinmei Exp $ */
 
 /*! \file */
 
@@ -533,6 +533,7 @@ static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
                          isc_stdtime_t now, isc_boolean_t tree_locked);
 static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
                                  rdatasetheader_t *newheader);
+static void prune_tree(isc_task_t *task, isc_event_t *event);
 
 static dns_rdatasetmethods_t rdataset_methods = {
        rdataset_disassociate,
@@ -1551,7 +1552,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
 /*
  * This function is assumed to be called when a node is newly referenced
  * and can be in the deadnode list.  In that case the node must be retrieved
- * from the list because the it is going to be used.  In addition, if the caller
+ * from the list because it is going to be used.  In addition, if the caller
  * happens to hold a write lock on the tree, it's a good chance to purge dead
  * nodes.
  * Note: while a new reference is gained in multiple places, there are only very
@@ -1604,13 +1605,15 @@ reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
 static isc_boolean_t
 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
                    rbtdb_serial_t least_serial,
-                   isc_rwlocktype_t nlock, isc_rwlocktype_t tlock)
+                   isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
+                   isc_boolean_t pruning)
 {
        isc_result_t result;
        isc_boolean_t write_locked;
        rbtdb_nodelock_t *nodelock;
        unsigned int refs, nrefs;
        int bucket = node->locknum;
+       isc_boolean_t no_reference;
 
        nodelock = &rbtdb->node_locks[bucket];
 
@@ -1693,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
        } else
                write_locked = ISC_TRUE;
 
+       no_reference = ISC_TRUE;
        if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
                /*
                 * We can now delete the node if the reference counter is
@@ -1701,35 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
                 * current thread locks the tree (e.g., in findnode()).
                 */
 
-               if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
-                       char printname[DNS_NAME_FORMATSIZE];
+               /*
+                * If this node is the only one in the level it's in, deleting
+                * this node may recursively make its parent the only node in
+                * the parent level; if so, and if no one is currently using
+                * the parent node, this is almost the only opportunity to
+                * clean it up.  But the recursive cleanup is not that trivial
+                * since the child and parent may be in different lock buckets,
+                * which would cause a lock order reversal problem.  To avoid
+                * the trouble, we'll dispatch a separate event for batch
+                * cleaning.  We need to check whether we're deleting the node
+                * as a result of pruning to avoid infinite dispatching.
+                * Note: pruning happens only when a task has been set for the
+                * rbtdb.  If the user of the rbtdb chooses not to set a task,
+                * it's their responsibility to purge stale leaves (e.g. by
+                * periodic walk-through).
+                */
+               if (!pruning && node->parent != NULL &&
+                   node->parent->down == node && node->left == NULL &&
+                   node->right == NULL && rbtdb->task != NULL) {
+                       isc_event_t *ev;
+                       dns_db_t *db;
+
+                       ev = isc_event_allocate(rbtdb->common.mctx, NULL,
+                                               DNS_EVENT_RBTPRUNE,
+                                               prune_tree, node,
+                                               sizeof(isc_event_t));
+                       if (ev != NULL) {
+                               new_reference(rbtdb, node);
+                               db = NULL;
+                               attach((dns_db_t *)rbtdb, &db);
+                               ev->ev_sender = db;
+                               isc_task_send(rbtdb->task, &ev);
+                               no_reference = ISC_FALSE;
+                       } else {
+                               /*
+                                * XXX: this is a weird situation.  We could
+                                * ignore this error case, but then the stale
+                                * node will unlikely be purged except via a
+                                * rare condition such as manual cleanup.  So
+                                * we queue it in the deadnodes list, hoping
+                                * the memory shortage is temporary and the node
+                                * will be deleted later.
+                                */
+                               isc_log_write(dns_lctx,
+                                             DNS_LOGCATEGORY_DATABASE,
+                                             DNS_LOGMODULE_CACHE,
+                                             ISC_LOG_INFO,
+                                             "decrement_reference: failed to "
+                                             "allocate pruning event");
+                               INSIST(!ISC_LINK_LINKED(node, deadlink));
+                               ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
+                                               deadlink);
+                       }
+               } else {
+                       if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
+                               char printname[DNS_NAME_FORMATSIZE];
+
+                               isc_log_write(dns_lctx,
+                                             DNS_LOGCATEGORY_DATABASE,
+                                             DNS_LOGMODULE_CACHE,
+                                             ISC_LOG_DEBUG(1),
+                                             "decrement_reference: "
+                                             "delete from rbt: %p %s",
+                                             node,
+                                             dns_rbt_formatnodename(node,
+                                                       printname,
+                                                       sizeof(printname)));
+                       }
 
-                       isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
-                                     DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
-                                     "decrement_reference: "
-                                     "delete from rbt: %p %s",
-                                     node,
-                                     dns_rbt_formatnodename(node, printname,
-                                                          sizeof(printname)));
+                       INSIST(!ISC_LINK_LINKED(node, deadlink));
+                       if (node->nsec3)
+                               result = dns_rbt_deletenode(rbtdb->nsec3, node,
+                                                           ISC_FALSE);
+                       else
+                               result = dns_rbt_deletenode(rbtdb->tree, node,
+                                                           ISC_FALSE);
+                       if (result != ISC_R_SUCCESS) {
+                               isc_log_write(dns_lctx,
+                                             DNS_LOGCATEGORY_DATABASE,
+                                             DNS_LOGMODULE_CACHE,
+                                             ISC_LOG_WARNING,
+                                             "decrement_reference: "
+                                             "dns_rbt_deletenode: %s",
+                                             isc_result_totext(result));
+                       }
                }
-
-               INSIST(!ISC_LINK_LINKED(node, deadlink));
-               if (node->nsec3)
-                       result = dns_rbt_deletenode(rbtdb->nsec3, node,
-                                                   ISC_FALSE);
-               else
-                       result = dns_rbt_deletenode(rbtdb->tree, node,
-                                                   ISC_FALSE);
-               if (result != ISC_R_SUCCESS)
-                       isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
-                                     DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
-                                     "decrement_reference: "
-                                     "dns_rbt_deletenode: %s",
-                                     isc_result_totext(result));
        } else if (dns_rbtnode_refcurrent(node) == 0) {
                INSIST(!ISC_LINK_LINKED(node, deadlink));
                ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
-       }
+       } else
+               no_reference = ISC_FALSE;
 
        /* Restore the lock? */
        if (nlock == isc_rwlocktype_read)
@@ -1746,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
                if (write_locked)
                        isc_rwlock_downgrade(&rbtdb->tree_lock);
 
-       return (ISC_TRUE);
+       return (no_reference);
+}
+
+/*
+ * Prune the tree by recursively cleaning-up single leaves.  In the worst
+ * case, the number of iteration is the number of tree levels, which is at
+ * most the maximum number of domain name labels, i.e, 127.  In practice, this
+ * should be much smaller (only a few times), and even the worst case would be
+ * acceptable for a single event.
+ */
+static void
+prune_tree(isc_task_t *task, isc_event_t *event) {
+       dns_rbtdb_t *rbtdb = event->ev_sender;
+       dns_rbtnode_t *node = event->ev_arg;
+       dns_rbtnode_t *parent;
+       unsigned int locknum;
+
+       UNUSED(task);
+
+       isc_event_free(&event);
+
+       RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
+       locknum = node->locknum;
+       NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+       do {
+               parent = node->parent;
+               decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
+                                   isc_rwlocktype_write, ISC_TRUE);
+
+               if (parent != NULL && parent->down == NULL) {
+                       /*
+                        * node was the only down child of the parent and has
+                        * just been removed.  We'll then need to examine the
+                        * parent.  Keep the lock if possible; otherwise,
+                        * release the old lock and acquire one for the parent.
+                        */
+                       if (parent->locknum != locknum) {
+                               NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+                                           isc_rwlocktype_write);
+                               locknum = parent->locknum;
+                               NODE_LOCK(&rbtdb->node_locks[locknum].lock,
+                                         isc_rwlocktype_write);
+                       }
+
+                       /*
+                        * We need to gain a reference to the node before
+                        * decrementing it in the next iteration.  In addition,
+                        * if the node is in the dead-nodes list, extract it
+                        * from the list beforehand as we do in
+                        * reactivate_node().
+                        */
+                       new_reference(rbtdb, parent);
+                       if (ISC_LINK_LINKED(parent, deadlink)) {
+                               ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
+                                               parent, deadlink);
+                       }
+               } else
+                       parent = NULL;
+
+               node = parent;
+       } while (node != NULL);
+       NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+       RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
+
+       detach((dns_db_t **)&rbtdb);
 }
 
 static inline void
@@ -2163,8 +2293,8 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
                        NODE_UNLOCK(lock, isc_rwlocktype_write);
                }
                decrement_reference(rbtdb, header->node, least_serial,
-                                   isc_rwlocktype_write,
-                                   isc_rwlocktype_none);
+                                   isc_rwlocktype_write, isc_rwlocktype_none,
+                                   ISC_FALSE);
        }
 
        if (!EMPTY(cleanup_list)) {
@@ -2197,7 +2327,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
                                rollback_node(rbtnode, serial);
                        decrement_reference(rbtdb, rbtnode, least_serial,
                                            isc_rwlocktype_write,
-                                           isc_rwlocktype_write);
+                                           isc_rwlocktype_write, ISC_FALSE);
 
                        NODE_UNLOCK(lock, isc_rwlocktype_write);
 
@@ -3747,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
 
                NODE_LOCK(lock, isc_rwlocktype_read);
                decrement_reference(search.rbtdb, node, 0,
-                                   isc_rwlocktype_read, isc_rwlocktype_none);
+                                   isc_rwlocktype_read, isc_rwlocktype_none,
+                                   ISC_FALSE);
                NODE_UNLOCK(lock, isc_rwlocktype_read);
        }
 
@@ -4525,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
 
                NODE_LOCK(lock, isc_rwlocktype_read);
                decrement_reference(search.rbtdb, node, 0,
-                                   isc_rwlocktype_read, isc_rwlocktype_none);
+                                   isc_rwlocktype_read, isc_rwlocktype_none,
+                                   ISC_FALSE);
                NODE_UNLOCK(lock, isc_rwlocktype_read);
        }
 
@@ -4743,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
        NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
 
        if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
-                               isc_rwlocktype_none)) {
+                               isc_rwlocktype_none, ISC_FALSE)) {
                if (isc_refcount_current(&nodelock->references) == 0 &&
                    nodelock->exiting) {
                        inactive = ISC_TRUE;
@@ -7469,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
        lock = &rbtdb->node_locks[node->locknum].lock;
        NODE_LOCK(lock, isc_rwlocktype_read);
        decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
-                           rbtdbiter->tree_locked);
+                           rbtdbiter->tree_locked, ISC_FALSE);
        NODE_UNLOCK(lock, isc_rwlocktype_read);
 
        rbtdbiter->node = NULL;
@@ -7510,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
                        NODE_LOCK(lock, isc_rwlocktype_read);
                        decrement_reference(rbtdb, node, 0,
                                            isc_rwlocktype_read,
-                                           rbtdbiter->tree_locked);
+                                           rbtdbiter->tree_locked, ISC_FALSE);
                        NODE_UNLOCK(lock, isc_rwlocktype_read);
                }
 
@@ -8446,6 +8578,6 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
                decrement_reference(rbtdb, header->node, 0,
                                    isc_rwlocktype_write,
                                    tree_locked ? isc_rwlocktype_write :
-                                   isc_rwlocktype_none);
+                                   isc_rwlocktype_none, ISC_FALSE);
        }
 }
index 41bb88428431380202c86d7feeba6cf60a9b99ae..ce057c79eae847c18dfe66edb12f183928b071a5 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: hash.c,v 1.13 2007/06/19 23:47:17 tbox Exp $ */
+/* $Id: hash.c,v 1.13.332.1 2009/05/06 23:34:30 jinmei Exp $ */
 
 /*! \file
  * Some portion of this code was derived from universal hash function
@@ -293,6 +293,7 @@ static void
 destroy(isc_hash_t **hctxp) {
        isc_hash_t *hctx;
        isc_mem_t *mctx;
+       unsigned char canary0[4], canary1[4];
 
        REQUIRE(hctxp != NULL && *hctxp != NULL);
        hctx = *hctxp;
@@ -312,7 +313,10 @@ destroy(isc_hash_t **hctxp) {
 
        DESTROYLOCK(&hctx->lock);
 
+       memcpy(canary0, hctx + 1, sizeof(canary0));
        memset(hctx, 0, sizeof(isc_hash_t));
+       memcpy(canary1, hctx + 1, sizeof(canary1));
+       INSIST(memcmp(canary0, canary1, sizeof(canary0)) == 0);
        isc_mem_put(mctx, hctx, sizeof(isc_hash_t));
        isc_mem_detach(&mctx);
 }