From fe732f041ed36756679e3cf28f5407fb53c5453c Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 29 Nov 2010 20:24:06 +0100 Subject: [PATCH] Improve speed of temporal macro search --- hashutil.c | 168 ++++++++++++++++++++-------------------------------- macroskip.h | 56 ++++++++++++++++++ test.sh | 39 ------------ 3 files changed, 120 insertions(+), 143 deletions(-) create mode 100644 macroskip.h diff --git a/hashutil.c b/hashutil.c index 0ec714dd5..d9fadef6e 100644 --- a/hashutil.c +++ b/hashutil.c @@ -19,6 +19,7 @@ #include "ccache.h" #include "hashutil.h" #include "murmurhashneutral2.h" +#include "macroskip.h" unsigned hash_from_string(void *str) @@ -45,125 +46,84 @@ file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2) && fh1->size == fh2->size; } -#define HASH(ch) \ - do {\ - hashbuf[hashbuflen] = ch; \ - hashbuflen++; \ - if (hashbuflen == sizeof(hashbuf)) {\ - hash_buffer(hash, hashbuf, sizeof(hashbuf)); \ - hashbuflen = 0; \ - } \ - } while (0) - /* - * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_* - * results. + * Search for the strings "__DATE__" and "__TIME__" in str. + * + * Returns a bitmask with HASH_SOURCE_CODE_FOUND_DATE and + * HASH_SOURCE_CODE_FOUND_TIME set appropriately. */ -int -hash_source_code_string( - struct mdfour *hash, const char *str, size_t len, const char *path) +static int +check_for_temporal_macros(const char *str, size_t len) { - const char *p; - const char *end; - char hashbuf[64]; - size_t hashbuflen = 0; - int result = HASH_SOURCE_CODE_OK; - extern unsigned sloppiness; + int result = 0; - p = str; - end = str + len; - while (1) { - if (p >= end) { - goto end; - } - switch (*p) { - /* Potential start of comment. */ - case '/': - if (p+1 == end) { - break; - } - switch (*(p+1)) { - case '*': - HASH(' '); /* Don't paste tokens together when removing the comment. */ - p += 2; - while (p+1 < end - && (*p != '*' || *(p+1) != '/')) { - if (*p == '\n') { - /* Keep line numbers. */ - HASH('\n'); - } - p++; - } - if (p+1 == end) { - goto end; - } - p += 2; - continue; + /* + * We're using the Boyer-Moore-Horspool algorithm, which searches + * starting from the *end* of the needle. Our needles are 8 characters + * long, so i starts at 7. + */ + size_t i = 7; - case '/': - p += 2; - while (p < end - && (*p != '\n' || *(p-1) == '\\')) { - p++; - } - continue; + while (i < len) { + /* + * Check whether the substring ending at str[i] has the form + * '__...E__'. On the assumption that 'E' is less common in + * source than '_', we check str[i-2] first. + */ + if (str[i - 2] == 'E' && + str[i - 0] == '_' && + str[i - 7] == '_' && + str[i - 1] == '_' && + str[i - 6] == '_') { - default: - break; - } - break; + /* + * Check the remaining characters to see if the + * substring is '__DATE__' or '__TIME__'. + */ - /* Start of string. */ - case '"': - HASH(*p); - p++; - while (p < end && (*p != '"' || *(p-1) == '\\')) { - HASH(*p); - p++; - } - if (p == end) { - goto end; + if (str[i - 5] == 'D' && str[i - 4] == 'A' && + str[i - 3] == 'T') { + result |= HASH_SOURCE_CODE_FOUND_DATE; } - break; - - /* Potential start of volatile macro. */ - case '_': - if (p + 7 < end - && p[1] == '_' && p[5] == 'E' - && p[6] == '_' && p[7] == '_') { - if (p[2] == 'D' && p[3] == 'A' - && p[4] == 'T') { - result |= HASH_SOURCE_CODE_FOUND_DATE; - } else if (p[2] == 'T' && p[3] == 'I' - && p[4] == 'M') { - result |= HASH_SOURCE_CODE_FOUND_TIME; - } - /* - * Of course, we can't be sure that we have found a __{DATE,TIME}__ - * that's actually used, but better safe than sorry. And if you do - * something like - * - * #define TIME __TI ## ME__ - * - * in your code, you deserve to get a false cache hit. - */ + else if (str[i - 5] == 'T' && str[i - 4] == 'I' && + str[i - 3] == 'M') { + result |= HASH_SOURCE_CODE_FOUND_TIME; } - break; - - default: - break; } - HASH(*p); - p++; + /* + * macro_skip tells us how far we can skip forward upon seeing + * str[i] at the end of a substring. + */ + i += macro_skip[(uint8_t)str[i]]; } -end: - hash_buffer(hash, hashbuf, hashbuflen); + return result; +} - if (sloppiness & SLOPPY_TIME_MACROS) { - return 0; +/* + * Hash a string. Returns a bitmask of HASH_SOURCE_CODE_* results. + */ +int +hash_source_code_string( + struct mdfour *hash, const char *str, size_t len, const char *path) +{ + int result = HASH_SOURCE_CODE_OK; + extern unsigned sloppiness; + + /* + * Check for __DATE__ and __TIME__ if the sloppiness argument tells us + * we have to. + */ + if (!(sloppiness & SLOPPY_TIME_MACROS)) { + result |= check_for_temporal_macros(str, len); } + + /* + * Hash the source string. + */ + hash_buffer(hash, str, len); + if (result & HASH_SOURCE_CODE_FOUND_DATE) { /* * Make sure that the hash sum changes if the (potential) expansion of diff --git a/macroskip.h b/macroskip.h new file mode 100644 index 000000000..14522018a --- /dev/null +++ b/macroskip.h @@ -0,0 +1,56 @@ +/* + * A Boyer-Moore-Horspool skip table used for searching for the strings + * "__TIME__" and "__DATE__". + * + * macro_skip[c] = 8 for all c not in "__TIME__" and "__DATE__". + * + * The other characters map as follows: + * + * _ -> 1 + * A -> 5 + * D -> 6 + * E -> 3 + * I -> 5 + * M -> 4 + * T -> 4 + * + * + * This was generated with the following Python script: + * + * m = {'_': 1, + * 'A': 5, + * 'D': 6, + * 'E': 3, + * 'I': 5, + * 'M': 4, + * 'T': 4} + * + * for i in range(0, 256): + * if chr(i) in m: + * num = m[chr(i)] + * else: + * num = 8 + * print ("%d, " % num), + * + * if i % 16 == 15: + * print "" + */ + +static const uint32_t macro_skip[] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 5, 8, 8, 6, 3, 8, 8, 8, 5, 8, 8, 8, 4, 8, 8, + 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, +}; diff --git a/test.sh b/test.sh index 74fc5ae0e..c8520b45f 100755 --- a/test.sh +++ b/test.sh @@ -866,45 +866,6 @@ EOF checkstat 'cache miss' 1 checkfile stderr-mf.txt "`cat stderr-orig.txt`" - ################################################################## - # Check that changes in comments are ignored when hashing. - testname="changes in comments" - $CCACHE -C >/dev/null - $CCACHE -z >/dev/null - cat <comments.h -/* - * /* foo comment - */ -EOF - backdate comments.h - cat <<'EOF' >comments.c -#include "comments.h" -char test[] = "\ -/* apple */ // banana"; // foo comment -EOF - - $CCACHE $COMPILER -c comments.c - checkstat 'cache hit (direct)' 0 - checkstat 'cache hit (preprocessed)' 0 - checkstat 'cache miss' 1 - - sed_in_place 's/foo/ignored/' comments.h comments.c - backdate comments.h - - $CCACHE $COMPILER -c comments.c - checkstat 'cache hit (direct)' 1 - checkstat 'cache hit (preprocessed)' 0 - checkstat 'cache miss' 1 - - # Check that comment-like string contents are hashed. - sed_in_place 's/apple/orange/' comments.c - backdate comments.h - - $CCACHE $COMPILER -c comments.c - checkstat 'cache hit (direct)' 1 - checkstat 'cache hit (preprocessed)' 0 - checkstat 'cache miss' 2 - ################################################################## # Check that it is possible to compile and cache an empty source code file. testname="empty source file" -- 2.47.3