From: Ondřej Surý <ondrej@isc.org>
Date: Tue, 30 May 2023 06:46:17 +0000 (+0200)
Subject: Improve RBT overmem cache cleaning
X-Git-Tag: v9.19.14~3^2~2
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=da0eafcdee52147e72d407cc3b9f179378ee1d3a;p=thirdparty%2Fbind9.git

Improve RBT overmem cache cleaning

When cache memory usage is over the configured cache size (overmem) and
we are cleaning unused entries, it might not be enough to clean just two
entries if the entries to be expired are smaller than the newly added
rdata.  This could be abused by an attacker to cause a remote Denial of
Service by possibly running out of the operating system memory.

Currently, the addrdataset() tries to do a single TTL-based cleaning
considering the serve-stale TTL and then optionally moves to overmem
cleaning if we are in that condition.  Then the overmem_purge() tries to
do another single TTL based cleaning from the TTL heap and then continue
with LRU-based cleaning up to 2 entries cleaned.

Squash the TTL-cleaning mechanism into single call from addrdataset(),
but ignore the serve-stale TTL if we are currently overmem.

Then instead of having a fixed number of entries to clean, pass the size
of newly added rdatasetheader to the overmem_purge() function and
cleanup at least the size of the newly added data.  This prevents the
cache going over the configured memory limit (`max-cache-size`).

Additionally, refactor the overmem_purge() function to reduce for-loop
nesting for readability.
---

diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c
index 2bc92f38359..6887b9fcc4e 100644
--- a/lib/dns/rbtdb.c
+++ b/lib/dns/rbtdb.c
@@ -622,7 +622,7 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 	      isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
 	      expire_t reason DNS__DB_FLARG);
 static void
-overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, isc_stdtime_t now,
+overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
 	      isc_rwlocktype_t *tlocktypep DNS__DB_FLARG);
 static void
 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
@@ -6878,6 +6878,16 @@ cleanup:
 
 static dns_dbmethods_t zone_methods;
 
+static size_t
+rdataset_size(rdatasetheader_t *header) {
+	if (!NONEXISTENT(header)) {
+		return (dns_rdataslab_size((unsigned char *)header,
+					   sizeof(*header)));
+	}
+
+	return (sizeof(*header));
+}
+
 static isc_result_t
 addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
 	    isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
@@ -7042,7 +7052,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
 	}
 
 	if (cache_is_overmem) {
-		overmem_purge(rbtdb, rbtnode->locknum, now,
+		overmem_purge(rbtdb, rbtnode->locknum, rdataset_size(newheader),
 			      &tlocktype DNS__DB_FLARG_PASS);
 	}
 
@@ -7062,12 +7072,19 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
 		}
 
 		header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
-		if (header != NULL &&
-		    header->rdh_ttl + STALE_TTL(header, rbtdb) <
-			    now - RBTDB_VIRTUAL)
-		{
-			expire_header(rbtdb, header, &nlocktype, &tlocktype,
-				      expire_ttl DNS__DB_FLARG_PASS);
+		if (header != NULL) {
+			dns_ttl_t rdh_ttl = header->rdh_ttl;
+
+			/* Only account for stale TTL if cache is not overmem */
+			if (!cache_is_overmem) {
+				rdh_ttl += STALE_TTL(header, rbtdb);
+			}
+
+			if (rdh_ttl < now - RBTDB_VIRTUAL) {
+				expire_header(rbtdb, header, &nlocktype,
+					      &tlocktype,
+					      expire_ttl DNS__DB_FLARG_PASS);
+			}
 		}
 
 		/*
@@ -9971,54 +9988,61 @@ update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now) {
 	ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], header, link);
 }
 
+static size_t
+expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum,
+		   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
+		   size_t purgesize DNS__DB_FLARG) {
+	rdatasetheader_t *header, *header_prev;
+	size_t purged = 0;
+
+	for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
+	     header != NULL && purged <= purgesize; header = header_prev)
+	{
+		header_prev = ISC_LIST_PREV(header, link);
+		/*
+		 * Unlink the entry at this point to avoid checking it
+		 * again even if it's currently used someone else and
+		 * cannot be purged at this moment.  This entry won't be
+		 * referenced any more (so unlinking is safe) since the
+		 * TTL was reset to 0.
+		 */
+		ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, link);
+		size_t header_size = rdataset_size(header);
+		expire_header(rbtdb, header, nlocktypep, tlocktypep,
+			      expire_lru DNS__DB_FLARG_PASS);
+		purged += header_size;
+	}
+
+	return (purged);
+}
+
 /*%
- * Purge some expired and/or stale (i.e. unused for some period) cache entries
- * under an overmem condition.  To recover from this condition quickly, up to
- * 2 entries will be purged.  This process is triggered while adding a new
- * entry, and we specifically avoid purging entries in the same LRU bucket as
- * the one to which the new entry will belong.  Otherwise, we might purge
- * entries of the same name of different RR types while adding RRsets from a
- * single response (consider the case where we're adding A and AAAA glue records
- * of the same NS name).
+ * Purge some stale (i.e. unused for some period - LRU based cleaning) cache
+ * entries under the overmem condition.  To recover from this condition quickly,
+ * we cleanup entries up to the size of newly added rdata (passed as purgesize).
+ *
+ * This process is triggered while adding a new entry, and we specifically avoid
+ * purging entries in the same LRU bucket as the one to which the new entry will
+ * belong.  Otherwise, we might purge entries of the same name of different RR
+ * types while adding RRsets from a single response (consider the case where
+ * we're adding A and AAAA glue records of the same NS name).
  */
 static void
-overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, isc_stdtime_t now,
+overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
 	      isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
-	rdatasetheader_t *header, *header_prev;
 	unsigned int locknum;
-	int purgecount = 2;
+	size_t purged = 0;
 
 	for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
-	     locknum != locknum_start && purgecount > 0;
+	     locknum != locknum_start && purged <= purgesize;
 	     locknum = (locknum + 1) % rbtdb->node_lock_count)
 	{
 		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
 		NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
 
-		header = isc_heap_element(rbtdb->heaps[locknum], 1);
-		if (header && header->rdh_ttl < now - RBTDB_VIRTUAL) {
-			expire_header(rbtdb, header, &nlocktype, tlocktypep,
-				      expire_ttl DNS__DB_FLARG_PASS);
-			purgecount--;
-		}
-
-		for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
-		     header != NULL && purgecount > 0; header = header_prev)
-		{
-			header_prev = ISC_LIST_PREV(header, link);
-			/*
-			 * Unlink the entry at this point to avoid checking it
-			 * again even if it's currently used someone else and
-			 * cannot be purged at this moment.  This entry won't be
-			 * referenced any more (so unlinking is safe) since the
-			 * TTL was reset to 0.
-			 */
-			ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header,
-					link);
-			expire_header(rbtdb, header, &nlocktype, tlocktypep,
-				      expire_lru DNS__DB_FLARG_PASS);
-			purgecount--;
-		}
+		purged += expire_lru_headers(
+			rbtdb, locknum, &nlocktype, tlocktypep,
+			purgesize - purged DNS__DB_FLARG_PASS);
 
 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
 	}
@@ -10037,15 +10061,14 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 	INSIST(*nlocktypep == isc_rwlocktype_write);
 
 	if (isc_refcount_current(&header->node->references) == 0) {
-		isc_rwlocktype_t nlocktype = isc_rwlocktype_write;
 		/*
 		 * If no one else is using the node, we can clean it up now.
 		 * We first need to gain a new reference to the node to meet a
 		 * requirement of decrement_reference().
 		 */
 		new_reference(rbtdb, header->node,
-			      nlocktype DNS__DB_FLARG_PASS);
-		decrement_reference(rbtdb, header->node, 0, &nlocktype,
+			      *nlocktypep DNS__DB_FLARG_PASS);
+		decrement_reference(rbtdb, header->node, 0, nlocktypep,
 				    tlocktypep, true, false DNS__DB_FLARG_PASS);
 
 		if (rbtdb->cachestats == NULL) {