Improve speed of temporal macro search

author Justin Lebar <justin.lebar@gmail.com>

Mon, 29 Nov 2010 19:24:06 +0000 (20:24 +0100)

committer Joel Rosdahl <joel@rosdahl.net>

Mon, 29 Nov 2010 19:35:19 +0000 (20:35 +0100)
author Justin Lebar <justin.lebar@gmail.com>
Mon, 29 Nov 2010 19:24:06 +0000 (20:24 +0100)
committer Joel Rosdahl <joel@rosdahl.net>
Mon, 29 Nov 2010 19:35:19 +0000 (20:35 +0100)
diff --git a/hashutil.c b/hashutil.c

index 0ec714dd5be39dc0f21fa6fe1739ca2f59e85c22..d9fadef6ea8586f4eb63382da5100c50470f0c5a 100644 (file)
--- a/hashutil.c
+++ b/hashutil.c
@@ -19,6 +19,7 @@
  #include "ccache.h"
  #include "hashutil.h"
  #include "murmurhashneutral2.h"
+#include "macroskip.h"
  
  unsigned
  hash_from_string(void *str)
@@ -45,125 +46,84 @@ file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2)
                 && fh1->size == fh2->size;
  }
  
-#define HASH(ch) \
-       do {\
-               hashbuf[hashbuflen] = ch; \
-               hashbuflen++; \
-               if (hashbuflen == sizeof(hashbuf)) {\
-                       hash_buffer(hash, hashbuf, sizeof(hashbuf)); \
-                       hashbuflen = 0; \
-               } \
-       } while (0)
-
  /*
- * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_*
- * results.
+ * Search for the strings "__DATE__" and "__TIME__" in str.
+ *
+ * Returns a bitmask with HASH_SOURCE_CODE_FOUND_DATE and
+ * HASH_SOURCE_CODE_FOUND_TIME set appropriately.
   */
-int
-hash_source_code_string(
-       struct mdfour *hash, const char *str, size_t len, const char *path)
+static int
+check_for_temporal_macros(const char *str, size_t len)
  {
-       const char *p;
-       const char *end;
-       char hashbuf[64];
-       size_t hashbuflen = 0;
-       int result = HASH_SOURCE_CODE_OK;
-       extern unsigned sloppiness;
+       int result = 0;
  
-       p = str;
-       end = str + len;
-       while (1) {
-               if (p >= end) {
-                       goto end;
-               }
-               switch (*p) {
-               /* Potential start of comment. */
-               case '/':
-                       if (p+1 == end) {
-                               break;
-                       }
-                       switch (*(p+1)) {
-                       case '*':
-                               HASH(' '); /* Don't paste tokens together when removing the comment. */
-                               p += 2;
-                               while (p+1 < end
-                                      && (*p != '*' || *(p+1) != '/')) {
-                                       if (*p == '\n') {
-                                               /* Keep line numbers. */
-                                               HASH('\n');
-                                       }
-                                       p++;
-                               }
-                               if (p+1 == end) {
-                                       goto end;
-                               }
-                               p += 2;
-                               continue;
+       /*
+        * We're using the Boyer-Moore-Horspool algorithm, which searches
+        * starting from the *end* of the needle.  Our needles are 8 characters
+        * long, so i starts at 7.
+        */
+       size_t i = 7;
  
-                       case '/':
-                               p += 2;
-                               while (p < end
-                                      && (*p != '\n' || *(p-1) == '\\')) {
-                                       p++;
-                               }
-                               continue;
+       while (i < len) {
+               /*
+                * Check whether the substring ending at str[i] has the form
+                * '__...E__'.  On the assumption that 'E' is less common in
+                * source than '_', we check str[i-2] first.
+                */
+               if (str[i - 2] == 'E' &&
+                   str[i - 0] == '_' &&
+                   str[i - 7] == '_' &&
+                   str[i - 1] == '_' &&
+                   str[i - 6] == '_') {
  
-                       default:
-                               break;
-                       }
-                       break;
+                       /*
+                        * Check the remaining characters to see if the
+                        * substring is '__DATE__' or '__TIME__'.
+                        */
  
-               /* Start of string. */
-               case '"':
-                       HASH(*p);
-                       p++;
-                       while (p < end && (*p != '"' || *(p-1) == '\\')) {
-                               HASH(*p);
-                               p++;
-                       }
-                       if (p == end) {
-                               goto end;
+                       if (str[i - 5] == 'D' && str[i - 4] == 'A' &&
+                           str[i - 3] == 'T') {
+                               result |= HASH_SOURCE_CODE_FOUND_DATE;
                         }
-                       break;
-
-               /* Potential start of volatile macro. */
-               case '_':
-                       if (p + 7 < end
-                           && p[1] == '_' && p[5] == 'E'
-                           && p[6] == '_' && p[7] == '_') {
-                               if (p[2] == 'D' && p[3] == 'A'
-                                   && p[4] == 'T') {
-                                       result |= HASH_SOURCE_CODE_FOUND_DATE;
-                               } else if (p[2] == 'T' && p[3] == 'I'
-                                          && p[4] == 'M') {
-                                       result |= HASH_SOURCE_CODE_FOUND_TIME;
-                               }
-                               /*
-                                * Of course, we can't be sure that we have found a __{DATE,TIME}__
-                                * that's actually used, but better safe than sorry. And if you do
-                                * something like
-                                *
-                                * #define TIME __TI ## ME__
-                                *
-                                * in your code, you deserve to get a false cache hit.
-                                */
+                       else if (str[i - 5] == 'T' && str[i - 4] == 'I' &&
+                                str[i - 3] == 'M') {
+                               result |= HASH_SOURCE_CODE_FOUND_TIME;
                         }
-                       break;
-
-               default:
-                       break;
                 }
  
-               HASH(*p);
-               p++;
+               /*
+                * macro_skip tells us how far we can skip forward upon seeing
+                * str[i] at the end of a substring.
+                */
+               i += macro_skip[(uint8_t)str[i]];
         }
  
-end:
-       hash_buffer(hash, hashbuf, hashbuflen);
+       return result;
+}
  
-       if (sloppiness & SLOPPY_TIME_MACROS) {
-               return 0;
+/*
+ * Hash a string. Returns a bitmask of HASH_SOURCE_CODE_* results.
+ */
+int
+hash_source_code_string(
+       struct mdfour *hash, const char *str, size_t len, const char *path)
+{
+       int result = HASH_SOURCE_CODE_OK;
+       extern unsigned sloppiness;
+
+       /*
+        * Check for __DATE__ and __TIME__ if the sloppiness argument tells us
+        * we have to.
+        */
+       if (!(sloppiness & SLOPPY_TIME_MACROS)) {
+               result |= check_for_temporal_macros(str, len);
         }
+
+       /*
+        * Hash the source string.
+        */
+       hash_buffer(hash, str, len);
+
         if (result & HASH_SOURCE_CODE_FOUND_DATE) {
                 /*
                  * Make sure that the hash sum changes if the (potential) expansion of
diff --git a/macroskip.h b/macroskip.h

new file mode 100644 (file)

index 0000000..1452201
--- /dev/null
+++ b/macroskip.h
@@ -0,0 +1,56 @@
+/*
+ * A Boyer-Moore-Horspool skip table used for searching for the strings
+ * "__TIME__" and "__DATE__".
+ *
+ * macro_skip[c] = 8 for all c not in "__TIME__" and "__DATE__".
+ *
+ * The other characters map to 7 minus the index of their last occurrence
+ * among the first seven needle characters (minimum over both needles):
+ *   _ -> 1
+ *   A -> 4
+ *   D -> 5
+ *   E -> 2
+ *   I -> 4
+ *   M -> 3
+ *   T -> 3
+ *
+ *
+ * This was generated with the following Python script:
+ *
+ * m = {'_': 1,
+ *      'A': 4,
+ *      'D': 5,
+ *      'E': 2,
+ *      'I': 4,
+ *      'M': 3,
+ *      'T': 3}
+ *
+ * for i in range(0, 256):
+ *     if chr(i) in m:
+ *         num = m[chr(i)]
+ *     else:
+ *         num = 8
+ *     print ("%d, " % num),
+ *
+ *     if i % 16 == 15:
+ *         print ""
+ */
+
+static const uint32_t macro_skip[] = {
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  4,  8,  8,  5,  2,  8,  8,  8,  4,  8,  8,  8,  3,  8,  8,
+       8,  8,  8,  8,  3,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  1,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+};
diff --git a/test.sh b/test.sh

index 74fc5ae0e7a4b5d609b80ae6644699656724efb9..c8520b45ffdbaeab86cea295a9f8dca84619728c 100755 (executable)
--- a/test.sh
+++ b/test.sh
@@ -866,45 +866,6 @@ EOF
      checkstat 'cache miss' 1
      checkfile stderr-mf.txt "`cat stderr-orig.txt`"
  
-    ##################################################################
-    # Check that changes in comments are ignored when hashing.
-    testname="changes in comments"
-    $CCACHE -C >/dev/null
-    $CCACHE -z >/dev/null
-    cat <<EOF >comments.h
-/*
- * /* foo comment
- */
-EOF
-    backdate comments.h
-    cat <<'EOF' >comments.c
-#include "comments.h"
-char test[] = "\
-/* apple */ // banana"; // foo comment
-EOF
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 0
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    sed_in_place 's/foo/ignored/' comments.h comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    # Check that comment-like string contents are hashed.
-    sed_in_place 's/apple/orange/' comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 2
-
      ##################################################################
      # Check that it is possible to compile and cache an empty source code file.
      testname="empty source file"
author	Justin Lebar <justin.lebar@gmail.com>
	Mon, 29 Nov 2010 19:24:06 +0000 (20:24 +0100)
committer	Joel Rosdahl <joel@rosdahl.net>
	Mon, 29 Nov 2010 19:35:19 +0000 (20:35 +0100)
hashutil.c		patch \| blob \| blame \| history
macroskip.h	[new file with mode: 0644]	patch \| blob
test.sh		patch \| blob \| blame \| history