From fe732f041ed36756679e3cf28f5407fb53c5453c Mon Sep 17 00:00:00 2001
From: Justin Lebar <justin.lebar@gmail.com>
Date: Mon, 29 Nov 2010 20:24:06 +0100
Subject: [PATCH] Improve speed of temporal macro search

---
 hashutil.c  | 168 ++++++++++++++++++++--------------------------------
 macroskip.h |  56 ++++++++++++++++++
 test.sh     |  39 ------------
 3 files changed, 120 insertions(+), 143 deletions(-)
 create mode 100644 macroskip.h

diff --git a/hashutil.c b/hashutil.c
index 0ec714dd5..d9fadef6e 100644
--- a/hashutil.c
+++ b/hashutil.c
@@ -19,6 +19,7 @@
 #include "ccache.h"
 #include "hashutil.h"
 #include "murmurhashneutral2.h"
+#include "macroskip.h"
 
 unsigned
 hash_from_string(void *str)
@@ -45,125 +46,84 @@ file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2)
 		&& fh1->size == fh2->size;
 }
 
-#define HASH(ch) \
-	do {\
-		hashbuf[hashbuflen] = ch; \
-		hashbuflen++; \
-		if (hashbuflen == sizeof(hashbuf)) {\
-			hash_buffer(hash, hashbuf, sizeof(hashbuf)); \
-			hashbuflen = 0; \
-		} \
-	} while (0)
-
 /*
- * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_*
- * results.
+ * Search for the strings "__DATE__" and "__TIME__" in str.
+ *
+ * Returns a bitmask with HASH_SOURCE_CODE_FOUND_DATE and
+ * HASH_SOURCE_CODE_FOUND_TIME set appropriately.
  */
-int
-hash_source_code_string(
-	struct mdfour *hash, const char *str, size_t len, const char *path)
+static int
+check_for_temporal_macros(const char *str, size_t len)
 {
-	const char *p;
-	const char *end;
-	char hashbuf[64];
-	size_t hashbuflen = 0;
-	int result = HASH_SOURCE_CODE_OK;
-	extern unsigned sloppiness;
+	int result = 0;
 
-	p = str;
-	end = str + len;
-	while (1) {
-		if (p >= end) {
-			goto end;
-		}
-		switch (*p) {
-		/* Potential start of comment. */
-		case '/':
-			if (p+1 == end) {
-				break;
-			}
-			switch (*(p+1)) {
-			case '*':
-				HASH(' '); /* Don't paste tokens together when removing the comment. */
-				p += 2;
-				while (p+1 < end
-				       && (*p != '*' || *(p+1) != '/')) {
-					if (*p == '\n') {
-						/* Keep line numbers. */
-						HASH('\n');
-					}
-					p++;
-				}
-				if (p+1 == end) {
-					goto end;
-				}
-				p += 2;
-				continue;
+	/*
+	 * We're using the Boyer-Moore-Horspool algorithm, which searches
+	 * starting from the *end* of the needle.  Our needles are 8 characters
+	 * long, so i starts at 7.
+	 */
+	size_t i = 7;
 
-			case '/':
-				p += 2;
-				while (p < end
-				       && (*p != '\n' || *(p-1) == '\\')) {
-					p++;
-				}
-				continue;
+	while (i < len) {
+		/*
+		 * Check whether the substring ending at str[i] has the form
+		 * '__...E__'.  On the assumption that 'E' is less common in
+		 * source than '_', we check str[i-2] first.
+		 */
+		if (str[i - 2] == 'E' &&
+		    str[i - 0] == '_' &&
+		    str[i - 7] == '_' &&
+		    str[i - 1] == '_' &&
+		    str[i - 6] == '_') {
 
-			default:
-				break;
-			}
-			break;
+			/*
+			 * Check the remaining characters to see if the
+			 * substring is '__DATE__' or '__TIME__'.
+			 */
 
-		/* Start of string. */
-		case '"':
-			HASH(*p);
-			p++;
-			while (p < end && (*p != '"' || *(p-1) == '\\')) {
-				HASH(*p);
-				p++;
-			}
-			if (p == end) {
-				goto end;
+			if (str[i - 5] == 'D' && str[i - 4] == 'A' &&
+			    str[i - 3] == 'T') {
+				result |= HASH_SOURCE_CODE_FOUND_DATE;
 			}
-			break;
-
-		/* Potential start of volatile macro. */
-		case '_':
-			if (p + 7 < end
-			    && p[1] == '_' && p[5] == 'E'
-			    && p[6] == '_' && p[7] == '_') {
-				if (p[2] == 'D' && p[3] == 'A'
-				    && p[4] == 'T') {
-					result |= HASH_SOURCE_CODE_FOUND_DATE;
-				} else if (p[2] == 'T' && p[3] == 'I'
-				           && p[4] == 'M') {
-					result |= HASH_SOURCE_CODE_FOUND_TIME;
-				}
-				/*
-				 * Of course, we can't be sure that we have found a __{DATE,TIME}__
-				 * that's actually used, but better safe than sorry. And if you do
-				 * something like
-				 *
-				 * #define TIME __TI ## ME__
-				 *
-				 * in your code, you deserve to get a false cache hit.
-				 */
+			else if (str[i - 5] == 'T' && str[i - 4] == 'I' &&
+				 str[i - 3] == 'M') {
+				result |= HASH_SOURCE_CODE_FOUND_TIME;
 			}
-			break;
-
-		default:
-			break;
 		}
 
-		HASH(*p);
-		p++;
+		/*
+		 * macro_skip tells us how far we can skip forward upon seeing
+		 * str[i] at the end of a substring.
+		 */
+		i += macro_skip[(uint8_t)str[i]];
 	}
 
-end:
-	hash_buffer(hash, hashbuf, hashbuflen);
+	return result;
+}
 
-	if (sloppiness & SLOPPY_TIME_MACROS) {
-		return 0;
+/*
+ * Hash a string. Returns a bitmask of HASH_SOURCE_CODE_* results.
+ */
+int
+hash_source_code_string(
+	struct mdfour *hash, const char *str, size_t len, const char *path)
+{
+	int result = HASH_SOURCE_CODE_OK;
+	extern unsigned sloppiness;
+
+	/*
+	 * Check for __DATE__ and __TIME__ if the sloppiness argument tells us
+	 * we have to.
+	 */
+	if (!(sloppiness & SLOPPY_TIME_MACROS)) {
+		result |= check_for_temporal_macros(str, len);
 	}
+
+	/*
+	 * Hash the source string.
+	 */
+	hash_buffer(hash, str, len);
+
 	if (result & HASH_SOURCE_CODE_FOUND_DATE) {
 		/*
 		 * Make sure that the hash sum changes if the (potential) expansion of
diff --git a/macroskip.h b/macroskip.h
new file mode 100644
index 000000000..14522018a
--- /dev/null
+++ b/macroskip.h
@@ -0,0 +1,56 @@
+/*
+ * A Boyer-Moore-Horspool skip table used for searching for the strings
+ * "__TIME__" and "__DATE__".
+ *
+ * macro_skip[c] = 8 for all c not in "__TIME__" and "__DATE__".
+ *
+ * The other characters map as follows:
+ *
+ *   _ -> 1
+ *   A -> 5
+ *   D -> 6
+ *   E -> 3
+ *   I -> 5
+ *   M -> 4
+ *   T -> 4
+ *
+ *
+ * This was generated with the following Python script:
+ *
+ * m = {'_': 1,
+ *      'A': 5,
+ *      'D': 6,
+ *      'E': 3,
+ *      'I': 5,
+ *      'M': 4,
+ *      'T': 4}
+ *
+ * for i in range(0, 256):
+ *     if chr(i) in m:
+ *         num = m[chr(i)]
+ *     else:
+ *         num = 8
+ *     print ("%d, " % num),
+ *
+ *     if i % 16 == 15:
+ *         print ""
+ */
+
+static const uint32_t macro_skip[] = {
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  5,  8,  8,  6,  3,  8,  8,  8,  5,  8,  8,  8,  4,  8,  8,
+	8,  8,  8,  8,  4,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  1,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+};
diff --git a/test.sh b/test.sh
index 74fc5ae0e..c8520b45f 100755
--- a/test.sh
+++ b/test.sh
@@ -866,45 +866,6 @@ EOF
     checkstat 'cache miss' 1
     checkfile stderr-mf.txt "`cat stderr-orig.txt`"
 
-    ##################################################################
-    # Check that changes in comments are ignored when hashing.
-    testname="changes in comments"
-    $CCACHE -C >/dev/null
-    $CCACHE -z >/dev/null
-    cat <<EOF >comments.h
-/*
- * /* foo comment
- */
-EOF
-    backdate comments.h
-    cat <<'EOF' >comments.c
-#include "comments.h"
-char test[] = "\
-/* apple */ // banana"; // foo comment
-EOF
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 0
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    sed_in_place 's/foo/ignored/' comments.h comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    # Check that comment-like string contents are hashed.
-    sed_in_place 's/apple/orange/' comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 2
-
     ##################################################################
     # Check that it is possible to compile and cache an empty source code file.
     testname="empty source file"
-- 
2.47.3