]> git.ipfire.org Git - thirdparty/ccache.git/commitdiff
Fix comment scanning bug in hash_source_code_string
authorJoel Rosdahl <joel@rosdahl.net>
Fri, 20 Feb 2015 19:32:33 +0000 (20:32 +0100)
committerJoel Rosdahl <joel@rosdahl.net>
Sat, 7 Mar 2015 14:55:48 +0000 (15:55 +0100)
hash_source_code_string tries to ignore __DATE__/__TIME strings in
comments, but fails to parse code that contains character literal of a
double quote. This could result in false cache hits when the source code
happens to contain '"' followed by " /*" or " //" (with variations).

The fix is to do like it's already done in ccache 3.2: Don't try to be
overly clever about __DATE__/__TIME__, just check for those strings
anywhere in the source code string.

NEWS.txt
dev.mk.in
hashutil.c
hashutil.h
macroskip.h [new file with mode: 0644]
test.sh
test/test_hashutil.c

index 4876b233658daa8b99daf051c7f41cb42cc78ad8..935ef429b628eee3efa79ac33e20b2fd39756cf4 100644 (file)
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -8,6 +8,9 @@ Unreleased
 Bug fixes
 ~~~~~~~~~
 
+- Fixed bug which could result in false cache hits when source code contains
+  `'"'` followed by `" /*"` or `" //"` (with variations).
+
 - Made hash of cached result created with and without `CCACHE_CPP2` different.
   This makes it possible to rebuild with `CCACHE_CPP2` set without having to
   clear the cache to get new results.
index 1fb1f902d52652e76acd5fd6c86bda96c50cabab..d3964508997ccf4965a667b0ee063b113c94aea9 100644 (file)
--- a/dev.mk.in
+++ b/dev.mk.in
@@ -25,7 +25,7 @@ built_dist_files = $(generated_docs)
 headers = \
     ccache.h hashtable.h hashtable_itr.h hashtable_private.h hashutil.h \
     manifest.h mdfour.h counters.h murmurhashneutral2.h getopt_long.h \
-    language.h system.h compopt.h \
+    language.h macroskip.h system.h compopt.h \
     test/framework.h test/suites.h test/util.h
 
 files_to_clean += *.tar.bz2 *.tar.gz *.tar.xz *.xml .deps/*
index 6cc9e6cb7adc4a5f417f39c7e4a354e95e7a267e..1ead7680e0c439c463e0514aad68c70b3d871b3f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009-2010, 2012 Joel Rosdahl
+ * Copyright (C) 2009-2015 Joel Rosdahl
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -18,6 +18,7 @@
 
 #include "ccache.h"
 #include "hashutil.h"
+#include "macroskip.h"
 #include "murmurhashneutral2.h"
 
 unsigned
@@ -56,124 +57,82 @@ file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2)
        } while (0)
 
 /*
- * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_*
- * results.
+ * Search for the strings "__DATE__" and "__TIME__" in str.
+ *
+ * Returns a bitmask with HASH_SOURCE_CODE_FOUND_DATE and
+ * HASH_SOURCE_CODE_FOUND_TIME set appropriately.
  */
 int
-hash_source_code_string(
-       struct mdfour *hash, const char *str, size_t len, const char *path)
+check_for_temporal_macros(const char *str, size_t len)
 {
-       const char *p;
-       const char *end;
-       char hashbuf[64];
-       size_t hashbuflen = 0;
-       int result = HASH_SOURCE_CODE_OK;
-       extern unsigned sloppiness;
-       bool seen_backslash;
+       int result = 0;
 
-       p = str;
-       end = str + len;
-       while (1) {
-               if (p >= end) {
-                       goto end;
-               }
-               switch (*p) {
-               /* Potential start of comment. */
-               case '/':
-                       if (p+1 == end) {
-                               break;
-                       }
-                       switch (*(p+1)) {
-                       case '*':
-                               HASH(' '); /* Don't paste tokens together when removing the comment. */
-                               p += 2;
-                               while (p+1 < end && (*p != '*' || *(p+1) != '/')) {
-                                       if (*p == '\n') {
-                                               /* Keep line numbers. */
-                                               HASH('\n');
-                                       }
-                                       p++;
-                               }
-                               if (p+1 == end) {
-                                       goto end;
-                               }
-                               p += 2;
-                               continue;
+       /*
+        * We're using the Boyer-Moore-Horspool algorithm, which searches starting
+        * from the *end* of the needle. Our needles are 8 characters long, so i
+        * starts at 7.
+        */
+       size_t i = 7;
 
-                       case '/':
-                               p += 2;
-                               while (p < end && (*p != '\n' || *(p-1) == '\\')) {
-                                       p++;
-                               }
-                               continue;
-
-                       default:
-                               break;
-                       }
-                       break;
-
-               /* Start of string. */
-               case '"':
-                       HASH(*p);
-                       p++;
-                       seen_backslash = false;
-                       while (p < end) {
-                               if (seen_backslash) {
-                                       seen_backslash = false;
-                               } else if (*p == '"') {
-                                       /* Found end of string. */
-                                       HASH(*p);
-                                       p++;
-                                       break;
-                               } else if (*p == '\\') {
-                                       seen_backslash = true;
-                               }
-                               HASH(*p);
-                               p++;
-                       }
-                       if (p == end) {
-                               goto end;
+       while (i < len) {
+               /*
+                * Check whether the substring ending at str[i] has the form "__...E__". On
+                * the assumption that 'E' is less common in source than '_', we check
+                * str[i-2] first.
+                */
+               if (str[i - 2] == 'E' &&
+                   str[i - 0] == '_' &&
+                   str[i - 7] == '_' &&
+                   str[i - 1] == '_' &&
+                   str[i - 6] == '_') {
+                       /*
+                        * Check the remaining characters to see if the substring is "__DATE__"
+                        * or "__TIME__".
+                        */
+                       if (str[i - 5] == 'D' && str[i - 4] == 'A' &&
+                           str[i - 3] == 'T') {
+                               result |= HASH_SOURCE_CODE_FOUND_DATE;
                        }
-                       break;
-
-               /* Potential start of volatile macro. */
-               case '_':
-                       if (p + 7 < end
-                           && p[1] == '_' && p[5] == 'E'
-                           && p[6] == '_' && p[7] == '_') {
-                               if (p[2] == 'D' && p[3] == 'A'
-                                   && p[4] == 'T') {
-                                       result |= HASH_SOURCE_CODE_FOUND_DATE;
-                               } else if (p[2] == 'T' && p[3] == 'I'
-                                          && p[4] == 'M') {
-                                       result |= HASH_SOURCE_CODE_FOUND_TIME;
-                               }
-                               /*
-                                * Of course, we can't be sure that we have found a __{DATE,TIME}__
-                                * that's actually used, but better safe than sorry. And if you do
-                                * something like
-                                *
-                                * #define TIME __TI ## ME__
-                                *
-                                * in your code, you deserve to get a false cache hit.
-                                */
+                       else if (str[i - 5] == 'T' && str[i - 4] == 'I' &&
+                                str[i - 3] == 'M') {
+                               result |= HASH_SOURCE_CODE_FOUND_TIME;
                        }
-                       break;
-
-               default:
-                       break;
                }
 
-               HASH(*p);
-               p++;
+               /*
+                * macro_skip tells us how far we can skip forward upon seeing str[i] at
+                * the end of a substring.
+                */
+               i += macro_skip[(uint8_t)str[i]];
        }
 
-end:
-       hash_buffer(hash, hashbuf, hashbuflen);
+       return result;
+}
 
-       if (sloppiness & SLOPPY_TIME_MACROS) {
-               return 0;
+/*
+ * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_*
+ * results.
+ */
+int
+hash_source_code_string(
+       struct mdfour *hash, const char *str, size_t len, const char *path)
+{
+       int result = HASH_SOURCE_CODE_OK;
+       extern unsigned sloppiness;
+
+       /*
+        * Check for __DATE__ and __TIME__ if the sloppiness configuration tells us
+        * we should.
+        */
+       if (!(sloppiness & SLOPPY_TIME_MACROS)) {
+               result |= check_for_temporal_macros(str, len);
        }
+
+       /*
+        * Hash the source string.
+        */
+       hash_buffer(hash, str, len);
+
        if (result & HASH_SOURCE_CODE_FOUND_DATE) {
                /*
                 * Make sure that the hash sum changes if the (potential) expansion of
index a897b3e12dff3a96c05fce9b17d9be92f048457c..ec4e059fdac9876a6a431041f84a8e2b4d7aed25 100644 (file)
@@ -20,6 +20,7 @@ int file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2);
 #define        HASH_SOURCE_CODE_FOUND_DATE 2
 #define        HASH_SOURCE_CODE_FOUND_TIME 4
 
+int check_for_temporal_macros(const char *str, size_t len);
 int hash_source_code_string(
        struct mdfour *hash, const char *str, size_t len, const char *path);
 int hash_source_code_file(struct mdfour *hash, const char *path);
diff --git a/macroskip.h b/macroskip.h
new file mode 100644 (file)
index 0000000..cb32ec4
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * A Boyer-Moore-Horspool skip table used for searching for the strings
+ * "__TIME__" and "__DATE__".
+ *
+ * macro_skip[c] = 8 for all c not in "__TIME__" and "__DATE__".
+ *
+ * The other characters map as follows:
+ *
+ *   _ -> 1
+ *   A -> 4
+ *   D -> 5
+ *   E -> 2
+ *   I -> 4
+ *   M -> 3
+ *   T -> 3
+ *
+ *
+ * This was generated with the following Python script:
+ *
+ * m = {'_': 1,
+ *      'A': 4,
+ *      'D': 5,
+ *      'E': 2,
+ *      'I': 4,
+ *      'M': 3,
+ *      'T': 3}
+ *
+ * for i in range(0, 256):
+ *     if chr(i) in m:
+ *         num = m[chr(i)]
+ *     else:
+ *         num = 8
+ *     print ("%d, " % num),
+ *
+ *     if i % 16 == 15:
+ *         print ""
+ */
+
+static const uint32_t macro_skip[] = {
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  4,  8,  8,  5,  2,  8,  8,  8,  4,  8,  8,  8,  3,  8,  8,
+       8,  8,  8,  8,  3,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  1,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+       8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+};
diff --git a/test.sh b/test.sh
index 62854cfd60530a246d630d56f8da38d9ac852b83..5efbca4f78b6e2a8e122c1694c72101df019c96e 100755 (executable)
--- a/test.sh
+++ b/test.sh
@@ -3,7 +3,7 @@
 # A simple test suite for ccache.
 #
 # Copyright (C) 2002-2007 Andrew Tridgell
-# Copyright (C) 2009-2014 Joel Rosdahl
+# Copyright (C) 2009-2015 Joel Rosdahl
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
@@ -903,45 +903,6 @@ EOF
     checkstat 'cache miss' 1
     checkfile stderr-mf.txt "`cat stderr-orig.txt`"
 
-    ##################################################################
-    # Check that changes in comments are ignored when hashing.
-    testname="changes in comments"
-    $CCACHE -C >/dev/null
-    $CCACHE -z >/dev/null
-    cat <<EOF >comments.h
-/*
- * /* foo comment
- */
-EOF
-    backdate comments.h
-    cat <<'EOF' >comments.c
-#include "comments.h"
-char test[] = "\
-/* apple */ // banana"; // foo comment
-EOF
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 0
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    sed_in_place 's/foo/ignored/' comments.h comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 1
-
-    # Check that comment-like string contents are hashed.
-    sed_in_place 's/apple/orange/' comments.c
-    backdate comments.h
-
-    $CCACHE $COMPILER -c comments.c
-    checkstat 'cache hit (direct)' 1
-    checkstat 'cache hit (preprocessed)' 0
-    checkstat 'cache miss' 2
-
     ##################################################################
     # Check that it's possible to compile and cache an empty source code file.
     testname="empty source file"
@@ -1209,6 +1170,23 @@ EOF
     checkstat 'cache hit (direct)' 2
     checkstat 'cache hit (preprocessed)' 0
     checkstat 'cache miss' 2
+
+    testname="comment in strings"
+    $CCACHE -Cz >/dev/null
+    echo 'char *comment = " /* \\\\u" "foo" " */";' >comment.c
+    $CCACHE $COMPILER -c comment.c
+    checkstat 'cache hit (direct)' 0
+    checkstat 'cache hit (preprocessed)' 0
+    checkstat 'cache miss' 1
+    $CCACHE $COMPILER -c comment.c
+    checkstat 'cache hit (direct)' 1
+    checkstat 'cache hit (preprocessed)' 0
+    checkstat 'cache miss' 1
+    echo 'char *comment = " /* \\\\u" "goo" " */";' >comment.c
+    $CCACHE $COMPILER -c comment.c
+    checkstat 'cache hit (direct)' 1
+    checkstat 'cache hit (preprocessed)' 0
+    checkstat 'cache miss' 2
 }
 
 basedir_suite() {
index 6ee9d4b8286d3f49b84598349b6840d629883df4..243af69af1651ec6f9a40f1eb39df71250220f10 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010, 2012 Joel Rosdahl
+ * Copyright (C) 2010-2015 Joel Rosdahl
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -109,83 +109,82 @@ TEST(hash_source_code_simple_case)
        CHECK_STR_EQ_FREE2("a448017aaf21d8525fc10ae87aa6729d-3", hash_result(&h));
 }
 
-TEST(hash_source_code_with_c_style_comment)
+TEST(check_for_temporal_macros)
 {
-       struct mdfour h;
-       char input[] = "a/*b*/c";
-       size_t input_len = strlen(input);
-
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("1c2c87080ee03418fb1279e3b1f09a68-3", hash_result(&h));
-
-       input[3] = 'd';
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("1c2c87080ee03418fb1279e3b1f09a68-3", hash_result(&h));
-}
-
-TEST(hash_source_code_with_cplusplus_style_comment)
-{
-       struct mdfour h;
-       char input[] = "a//b\nc";
-       size_t input_len = strlen(input);
-
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("4a3fbbe3c140fa193227dba3814db6e6-3", hash_result(&h));
-
-       input[3] = 'd';
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("4a3fbbe3c140fa193227dba3814db6e6-3", hash_result(&h));
-}
-
-TEST(hash_source_code_with_comment_inside_string)
-{
-       struct mdfour h;
-       char input[] = "a\"//b\"c";
-       size_t input_len = strlen(input);
-
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("4c2fa74b0843d8f93df5c04c98ccb0a4-7", hash_result(&h));
-
-       input[4] = 'd';
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("f0069218ec640008cbfa2d150c1061bb-7", hash_result(&h));
-}
-
-TEST(hash_source_code_with_quote_in_string)
-{
-       struct mdfour h;
-       char input[] = "a\"\\\"b//c\""; /* a"\"b//c" */
-       size_t input_len = strlen(input);
-
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("c4e45e7a7f6f29b000a51f187dc4cf06-9", hash_result(&h));
-
-       hash_start(&h);
-       input[7] = 'd';
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("bef8fb852dddcee189b91b068a621c55-9", hash_result(&h));
-}
-
-TEST(hash_source_code_with_backslash_at_string_end)
-{
-       struct mdfour h;
-       char input[] = "a\"\\\\\"b//c"; /* a"\\"b//c */
-       size_t input_len = strlen(input);
-
-       hash_start(&h);
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("7f3ccf27edadad1b90cb2cffb59775d6-6", hash_result(&h));
-
-       input[input_len - 1] = 'd';
-       hash_source_code_string(&h, input, input_len, "");
-       CHECK_STR_EQ_FREE2("7f3ccf27edadad1b90cb2cffb59775d6-6", hash_result(&h));
+       const char time_start[] =
+               "__TIME__\n"
+               "int a;\n";
+       const char time_middle[] =
+               "#define a __TIME__\n"
+               "int a;\n";
+       const char time_end[] =
+               "#define a __TIME__";
+
+       const char date_start[] =
+               "__DATE__\n"
+               "int ab;\n";
+       const char date_middle[] =
+               "#define ab __DATE__\n"
+               "int ab;\n";
+       const char date_end[] =
+               "#define ab __DATE__";
+
+       const char no_temporal[] =
+               "#define ab _ _DATE__\n"
+               "#define ab __ DATE__\n"
+               "#define ab __D ATE__\n"
+               "#define ab __DA TE__\n"
+               "#define ab __DAT E__\n"
+               "#define ab __DATE __\n"
+               "#define ab __DATE_ _\n"
+               "#define ab _ _TIME__\n"
+               "#define ab __ TIME__\n"
+               "#define ab __T IME__\n"
+               "#define ab __TI ME__\n"
+               "#define ab __TIM E__\n"
+               "#define ab __TIME __\n"
+               "#define ab __TIME_ _\n";
+
+       CHECK(check_for_temporal_macros(time_start + 0, sizeof(time_start) - 0));
+       CHECK(!check_for_temporal_macros(time_start + 1, sizeof(time_start) - 1));
+
+       CHECK(check_for_temporal_macros(time_middle + 0, sizeof(time_middle) - 0));
+       CHECK(check_for_temporal_macros(time_middle + 1, sizeof(time_middle) - 1));
+       CHECK(check_for_temporal_macros(time_middle + 2, sizeof(time_middle) - 2));
+       CHECK(check_for_temporal_macros(time_middle + 3, sizeof(time_middle) - 3));
+       CHECK(check_for_temporal_macros(time_middle + 4, sizeof(time_middle) - 4));
+       CHECK(check_for_temporal_macros(time_middle + 5, sizeof(time_middle) - 5));
+       CHECK(check_for_temporal_macros(time_middle + 6, sizeof(time_middle) - 6));
+       CHECK(check_for_temporal_macros(time_middle + 7, sizeof(time_middle) - 7));
+
+       CHECK(check_for_temporal_macros(time_end + 0, sizeof(time_end) - 0));
+       CHECK(check_for_temporal_macros(time_end + sizeof(time_end) - 9, 9));
+       CHECK(!check_for_temporal_macros(time_end + sizeof(time_end) - 8, 8));
+
+       CHECK(check_for_temporal_macros(date_start + 0, sizeof(date_start) - 0));
+       CHECK(!check_for_temporal_macros(date_start + 1, sizeof(date_start) - 1));
+
+       CHECK(check_for_temporal_macros(date_middle + 0, sizeof(date_middle) - 0));
+       CHECK(check_for_temporal_macros(date_middle + 1, sizeof(date_middle) - 1));
+       CHECK(check_for_temporal_macros(date_middle + 2, sizeof(date_middle) - 2));
+       CHECK(check_for_temporal_macros(date_middle + 3, sizeof(date_middle) - 3));
+       CHECK(check_for_temporal_macros(date_middle + 4, sizeof(date_middle) - 4));
+       CHECK(check_for_temporal_macros(date_middle + 5, sizeof(date_middle) - 5));
+       CHECK(check_for_temporal_macros(date_middle + 6, sizeof(date_middle) - 6));
+       CHECK(check_for_temporal_macros(date_middle + 7, sizeof(date_middle) - 7));
+
+       CHECK(check_for_temporal_macros(date_end + 0, sizeof(date_end) - 0));
+       CHECK(check_for_temporal_macros(date_end + sizeof(date_end) - 9, 9));
+       CHECK(!check_for_temporal_macros(date_end + sizeof(date_end) - 8, 8));
+
+       CHECK(!check_for_temporal_macros(no_temporal + 0, sizeof(no_temporal) - 0));
+       CHECK(!check_for_temporal_macros(no_temporal + 1, sizeof(no_temporal) - 1));
+       CHECK(!check_for_temporal_macros(no_temporal + 2, sizeof(no_temporal) - 2));
+       CHECK(!check_for_temporal_macros(no_temporal + 3, sizeof(no_temporal) - 3));
+       CHECK(!check_for_temporal_macros(no_temporal + 4, sizeof(no_temporal) - 4));
+       CHECK(!check_for_temporal_macros(no_temporal + 5, sizeof(no_temporal) - 5));
+       CHECK(!check_for_temporal_macros(no_temporal + 6, sizeof(no_temporal) - 6));
+       CHECK(!check_for_temporal_macros(no_temporal + 7, sizeof(no_temporal) - 7));
 }
 
 TEST_SUITE_END