From: Tatuya JINMEI 神明達哉 Date: Wed, 11 Aug 2010 23:09:18 +0000 (+0000) Subject: 2937. [bug] Worked around an apparent race condition in over X-Git-Tag: v9.5.3rc1~23 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6650f6308ec2ff123c94efe50c36d7f166991279;p=thirdparty%2Fbind9.git 2937. [bug] Worked around an apparent race condition in over memory conditions. Without this fix a DNS cache DB or ADB could incorrectly stay in an over memory state, effectively refusing further caching, which subsequently made a BIND 9 caching server unworkable. This fix prevents this problem from happening by polling the state of the memory context, rather than making a copy of the state, which appeared to cause a race. This is a "workaround" in that it doesn't solve the possible race per se, but several experiments proved this change solves the symptom. Also, the polling overhead hasn't been reported to be an issue. This bug should only affect a caching server that specifies a finite max-cache-size. It's also quite likely that the bug happens only when enabling threads, but it's not confirmed yet. [RT #21818] --- diff --git a/CHANGES b/CHANGES index d4b8229885c..e058853fadb 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,19 @@ +2937. [bug] Worked around an apparent race condition in over + memory conditions. Without this fix a DNS cache DB or + ADB could incorrectly stay in an over memory state, + effectively refusing further caching, which + subsequently made a BIND 9 caching server unworkable. + This fix prevents this problem from happening by + polling the state of the memory context, rather than + making a copy of the state, which appeared to cause + a race. This is a "workaround" in that it doesn't + solve the possible race per se, but several experiments + proved this change solves the symptom. Also, the + polling overhead hasn't been reported to be an issue. + This bug should only affect a caching server that + specifies a finite max-cache-size. It's also quite + likely that the bug happens only when enabling threads, + but it's not confirmed yet. [RT #21818] --- 9.5.3b1 released --- diff --git a/lib/dns/adb.c b/lib/dns/adb.c index cb45c82d01b..9b9ccc62ee5 100644 --- a/lib/dns/adb.c +++ b/lib/dns/adb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: adb.c,v 1.233.36.14 2009/02/03 22:35:00 jinmei Exp $ */ +/* $Id: adb.c,v 1.233.36.15 2010/08/11 23:09:18 jinmei Exp $ */ /*! \file * @@ -118,7 +118,6 @@ struct dns_adb { isc_taskmgr_t *taskmgr; isc_task_t *task; - isc_boolean_t overmem; isc_interval_t tick_interval; int next_cleanbucket; @@ -294,8 +293,8 @@ static inline void inc_adb_irefcnt(dns_adb_t *); static inline void inc_adb_erefcnt(dns_adb_t *); static inline void inc_entry_refcnt(dns_adb_t *, dns_adbentry_t *, isc_boolean_t); -static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, dns_adbentry_t *, - isc_boolean_t); +static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, isc_boolean_t, + dns_adbentry_t *, isc_boolean_t); static inline void violate_locking_hierarchy(isc_mutex_t *, isc_mutex_t *); static isc_boolean_t clean_namehooks(dns_adb_t *, dns_adbnamehooklist_t *); static void clean_target(dns_adb_t *, dns_name_t *); @@ -777,7 +776,7 @@ link_entry(dns_adb_t *adb, int bucket, dns_adbentry_t *entry) { int i; dns_adbentry_t *e; - if (adb->overmem) { + if (isc_mem_isovermem(adb->mctx)) { for (i = 0; i < 2; i++) { e = ISC_LIST_TAIL(adb->entries[bucket]); if (e == NULL) @@ -943,6 +942,7 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { dns_adbnamehook_t *namehook; int addr_bucket; isc_boolean_t result = ISC_FALSE; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); addr_bucket = DNS_ADB_INVALIDBUCKET; namehook = ISC_LIST_HEAD(*namehooks); @@ -963,7 +963,8 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { LOCK(&adb->entrylocks[addr_bucket]); } - result = dec_entry_refcnt(adb, entry, ISC_FALSE); + result = dec_entry_refcnt(adb, overmem, entry, + ISC_FALSE); } /* @@ -1235,7 +1236,9 @@ inc_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { } static inline isc_boolean_t -dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { +dec_entry_refcnt(dns_adb_t *adb, isc_boolean_t overmem, dns_adbentry_t *entry, + isc_boolean_t lock) +{ int bucket; isc_boolean_t destroy_entry; isc_boolean_t result = ISC_FALSE; @@ -1250,7 +1253,7 @@ dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { destroy_entry = ISC_FALSE; if (entry->refcnt == 0 && - (adb->entry_sd[bucket] || entry->expires == 0 || adb->overmem || + (adb->entry_sd[bucket] || entry->expires == 0 || overmem || (entry->flags & ENTRY_IS_DEAD) != 0)) { destroy_entry = ISC_TRUE; result = unlink_entry(adb, entry); @@ -1852,7 +1855,7 @@ check_stale_name(dns_adb_t *adb, int bucket, isc_stdtime_t now) { int victims, max_victims; isc_boolean_t result; dns_adbname_t *victim, *next_victim; - isc_boolean_t overmem = adb->overmem; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); int scans = 0; INSIST(bucket != DNS_ADB_INVALIDBUCKET); @@ -2049,7 +2052,6 @@ dns_adb_create(isc_mem_t *mem, dns_view_t *view, isc_timermgr_t *timermgr, adb, NULL, NULL); adb->cevent_sent = ISC_FALSE; adb->shutting_down = ISC_FALSE; - adb->overmem = ISC_FALSE; ISC_LIST_INIT(adb->whenshutdown); isc_mem_attach(mem, &adb->mctx); @@ -2616,6 +2618,7 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { dns_adbaddrinfo_t *ai; int bucket; dns_adb_t *adb; + isc_boolean_t overmem; REQUIRE(findp != NULL && DNS_ADBFIND_VALID(*findp)); find = *findp; @@ -2640,13 +2643,14 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { * Return the find to the memory pool, and decrement the adb's * reference count. */ + overmem = isc_mem_isovermem(adb->mctx); ai = ISC_LIST_HEAD(find->list); while (ai != NULL) { ISC_LIST_UNLINK(find->list, ai, publink); entry = ai->entry; ai->entry = NULL; INSIST(DNS_ADBENTRY_VALID(entry)); - RUNTIME_CHECK(dec_entry_refcnt(adb, entry, ISC_TRUE) == + RUNTIME_CHECK(dec_entry_refcnt(adb, overmem, entry, ISC_TRUE) == ISC_FALSE); free_adbaddrinfo(adb, &ai); ai = ISC_LIST_HEAD(find->list); @@ -3509,6 +3513,7 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { int bucket; isc_stdtime_t now; isc_boolean_t want_check_exit = ISC_FALSE; + isc_boolean_t overmem; REQUIRE(DNS_ADB_VALID(adb)); REQUIRE(addrp != NULL); @@ -3520,13 +3525,14 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { isc_stdtime_get(&now); *addrp = NULL; + overmem = isc_mem_isovermem(adb->mctx); bucket = addr->entry->lock_bucket; LOCK(&adb->entrylocks[bucket]); entry->expires = now + ADB_ENTRY_WINDOW; - want_check_exit = dec_entry_refcnt(adb, entry, ISC_FALSE); + want_check_exit = dec_entry_refcnt(adb, overmem, entry, ISC_FALSE); UNLOCK(&adb->entrylocks[bucket]); @@ -3591,6 +3597,14 @@ dns_adb_flushname(dns_adb_t *adb, dns_name_t *name) { static void water(void *arg, int mark) { + /* + * We're going to change the way to handle overmem condition: use + * isc_mem_isovermem() instead of storing the state via this callback, + * since the latter way tends to cause race conditions. + * To minimize the change, and in case we re-enable the callback + * approach, however, keep this function at the moment. + */ + dns_adb_t *adb = arg; isc_boolean_t overmem = ISC_TF(mark == ISC_MEM_HIWATER); @@ -3598,17 +3612,6 @@ water(void *arg, int mark) { DP(ISC_LOG_DEBUG(1), "adb reached %s water mark", overmem ? "high" : "low"); - - /* - * We can't use adb->lock as there is potential for water - * to be called when adb->lock is held. - */ - LOCK(&adb->overmemlock); - if (adb->overmem != overmem) { - adb->overmem = overmem; - isc_mem_waterack(adb->mctx, mark); - } - UNLOCK(&adb->overmemlock); } void diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 55d5bcb05fa..5b1fbb2c170 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.248.12.27 2010/05/10 01:46:05 marka Exp $ */ +/* $Id: rbtdb.c,v 1.248.12.28 2010/08/11 23:09:18 jinmei Exp $ */ /*! \file */ @@ -393,7 +393,6 @@ typedef struct { rbtdb_version_t * current_version; rbtdb_version_t * future_version; rbtdb_versionlist_t open_versions; - isc_boolean_t overmem; isc_task_t * task; dns_dbnode_t *soanode; dns_dbnode_t *nsnode; @@ -4549,7 +4548,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { if (now == 0) isc_stdtime_get(&now); - if (rbtdb->overmem) { + if (isc_mem_isovermem(rbtdb->common.mctx)) { isc_uint32_t val; isc_random_get(&val); @@ -4559,8 +4558,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0); /* - * Note that 'log' can be true IFF rbtdb->overmem is also true. - * rbtdb->overmem can currently only be true for cache + * Note that 'log' can be true IFF overmem is also true. + * overmem can currently only be true for cache * databases -- hence all of the "overmem cache" log strings. */ log = ISC_TF(isc_log_wouldlog(dns_lctx, level)); @@ -4605,7 +4604,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { "reprieve by RETAIN() %s", printname); } - } else if (rbtdb->overmem && log) + } else if (isc_mem_isovermem(rbtdb->common.mctx) && log) isc_log_write(dns_lctx, category, module, level, "overmem cache: saved %s", printname); @@ -4617,10 +4616,12 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { static void overmem(dns_db_t *db, isc_boolean_t overmem) { - dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + /* This is an empty callback. See adb.c:water() */ - if (IS_CACHE(rbtdb)) - rbtdb->overmem = overmem; + UNUSED(db); + UNUSED(overmem); + + return; } static void @@ -5474,6 +5475,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, isc_result_t result; isc_boolean_t delegating; isc_boolean_t tree_locked = ISC_FALSE; + isc_boolean_t cache_is_overmem = ISC_FALSE; REQUIRE(VALID_RBTDB(rbtdb)); @@ -5535,12 +5537,14 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * the lock does not necessarily have to be acquired but it will help * purge stale entries more effectively. */ - if (delegating || (IS_CACHE(rbtdb) && rbtdb->overmem)) { + if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) + cache_is_overmem = ISC_TRUE; + if (delegating || cache_is_overmem) { tree_locked = ISC_TRUE; RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } - if (IS_CACHE(rbtdb) && rbtdb->overmem) + if (cache_is_overmem) overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked); NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, @@ -6356,7 +6360,6 @@ dns_rbtdb_create } rbtdb->attributes = 0; rbtdb->secure = ISC_FALSE; - rbtdb->overmem = ISC_FALSE; rbtdb->task = NULL; /* diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index 92bcd62632f..cc57d34a970 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.h,v 1.72.128.8 2010/03/04 23:46:49 tbox Exp $ */ +/* $Id: mem.h,v 1.72.128.9 2010/08/11 23:09:18 jinmei Exp $ */ #ifndef ISC_MEM_H #define ISC_MEM_H 1 @@ -334,6 +334,14 @@ isc_mem_inuse(isc_mem_t *mctx); * allocated from the system but not yet used. */ +isc_boolean_t +isc_mem_isovermem(isc_mem_t *mctx); +/*%< + * Return true iff the memory context is in "over memory" state, i.e., + * a hiwater mark has been set and the used amount of memory has exceeds + * the mark. + */ + void isc_mem_setwater(isc_mem_t *mctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); diff --git a/lib/isc/mem.c b/lib/isc/mem.c index 445ee2806ad..ef9513f61bb 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.c,v 1.137.16.11 2010/05/12 00:52:36 marka Exp $ */ +/* $Id: mem.c,v 1.137.16.12 2010/08/11 23:09:18 jinmei Exp $ */ /*! \file */ @@ -141,6 +141,7 @@ struct isc_mem { size_t hi_water; size_t lo_water; isc_boolean_t hi_called; + isc_boolean_t is_overmem; isc_mem_water_t water; void * water_arg; ISC_LIST(isc_mempool_t) pools; @@ -765,6 +766,7 @@ isc_mem_createx2(size_t init_max_size, size_t target_size, ctx->hi_water = 0; ctx->lo_water = 0; ctx->hi_called = ISC_FALSE; + ctx->is_overmem = ISC_FALSE; ctx->water = NULL; ctx->water_arg = NULL; ctx->magic = MEM_MAGIC; @@ -1103,6 +1105,10 @@ isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { } ADD_TRACE(ctx, ptr, size, file, line); + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { call_water = ISC_TRUE; @@ -1160,6 +1166,10 @@ isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { if (ctx->water != NULL) @@ -1346,6 +1356,11 @@ isc__mem_allocate(isc_mem_t *ctx, size_t size FLARG) { #if ISC_MEM_TRACKLINES ADD_TRACE(ctx, si, si[-1].u.size, file, line); #endif + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } + if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { ctx->hi_called = ISC_TRUE; @@ -1434,6 +1449,11 @@ isc__mem_free(isc_mem_t *ctx, void *ptr FLARG) { * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } + if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { ctx->hi_called = ISC_FALSE; @@ -1560,6 +1580,18 @@ isc_mem_setwater(isc_mem_t *ctx, isc_mem_water_t water, void *water_arg, (oldwater)(oldwater_arg, ISC_MEM_LOWATER); } +isc_boolean_t +isc_mem_isovermem(isc_mem_t *ctx) { + REQUIRE(VALID_CONTEXT(ctx)); + + /* + * We don't bother to lock the context because 100% accuracy isn't + * necessary (and even if we locked the context the returned value + * could be different from the actual state when it's used anyway) + */ + return (ctx->is_overmem); +} + void isc_mem_setname(isc_mem_t *ctx, const char *name, void *tag) { REQUIRE(VALID_CONTEXT(ctx));