From 947a72ce3712a6901e7ddc43a50c5df40739113e Mon Sep 17 00:00:00 2001
From: Joel Rosdahl <joel@rosdahl.net>
Date: Sun, 29 Dec 2019 19:21:39 +0100
Subject: [PATCH] Remove the unify mode
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The unify mode has not received enough attention and has at least these
bugs:

1. The direct mode doesnât work if the unify mode is enabled. This is
   because the unify mode doesnât call into process_preprocessed_file
   which stores the paths and hashes of included files needed by the
   direct mode.
2. The .incbin directive detection has no effect when using the unify
   mode. This is again because the unify mode doesnât use
   process_preprocessed_file which is where the .incbin detection takes
   place.
3. The unifierâs tokenizer doesnât understand C++11 raw string literals.
4. The unifier ignores comments, but comments may have semantic meaning
   to modern compilers, e.g. âfall throughâ comments.

Bugs 3 and 4 are fixable by improving the unifierâs tokenization
algorithm, but since itâs a bit tricky it likely wonât be worth the
effort, especially not as a bug fix.

Bugs 1 and 2 are also fixable by unifying the two code paths, but thatâs
a non-trivial effort.

In addition to the bugs, I believe that the usefullness of the unify
mode is low:

* Itâs only applicable when not using -g.
* It won't be enabled for C++ unless somebody fixes bug 3.
* It can make line numbers in warning messages and __LINE__ expansions
  incorrect.
* Since comments should not be ignored, the unify mode can only make a
  difference for some types of whitespace changes, like adding or
  removing blank lines or changing a+b to a + b. (a + b is already
  normalized to a + b by the preprocessor.)

Therefore Iâll just remove the unify mode to fix the bugs.

Fixes #497.
---
 Makefile.in              |   1 -
 dev.mk.in                |   1 -
 doc/MANUAL.adoc          |  13 --
 src/ccache.c             |  34 +----
 src/conf.c               |   2 -
 src/conf.h               |   1 -
 src/confitems.gperf      |   1 -
 src/envtoconfitems.gperf |   1 -
 src/unify.c              | 262 ---------------------------------------
 src/unify.h              |  24 ----
 test/suites/base.bash    |  16 ---
 unittest/test_conf.c     |   9 +-
 12 files changed, 8 insertions(+), 357 deletions(-)
 delete mode 100644 src/unify.c
 delete mode 100644 src/unify.h

diff --git a/Makefile.in b/Makefile.in
index b5a3ea5dd..d10f9459b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -46,7 +46,6 @@ non_3pp_sources = \
     src/manifest.c \
     src/mdfour.c \
     src/stats.c \
-    src/unify.c \
     src/util.c
 generated_sources = \
     src/confitems_lookup.c \
diff --git a/dev.mk.in b/dev.mk.in
index 7b0839fc2..8890b3334 100644
--- a/dev.mk.in
+++ b/dev.mk.in
@@ -53,7 +53,6 @@ headers = \
     src/minitrace.h \
     src/murmurhashneutral2.h \
     src/system.h \
-    src/unify.h \
     unittest/framework.h \
     unittest/util.h
 generated_headers = \
diff --git a/doc/MANUAL.adoc b/doc/MANUAL.adoc
index 0964152c2..1717c18e2 100644
--- a/doc/MANUAL.adoc
+++ b/doc/MANUAL.adoc
@@ -610,18 +610,6 @@ NOTE: In previous versions of ccache, *CCACHE_TEMPDIR* had to be on the same
     with other users. Note that this also affects the file permissions set on
     the object files created from your compilations.
 
-*unify* (*CCACHE_UNIFY* or *CCACHE_NOUNIFY*, see <<_boolean_values,Boolean values>> above)::
-
-    If true, ccache will use a C/C++ unifier when hashing the preprocessor
-    output if the *-g* option is not used. The unifier is slower than a normal
-    hash, so setting this environment variable loses a little bit of speed, but
-    it means that ccache can take advantage of not recompiling when the changes
-    to the source code consist of reformatting only. Note that enabling the
-    unifier changes the hash, so cached compilations produced when the unifier
-    is enabled cannot be reused when the unifier is disabled, and vice versa.
-    Enabling the unifier may result in incorrect line number information in
-    compiler warning messages and expansions of the `__LINE__` macro.
-
 
 Cache size management
 ---------------------
@@ -935,7 +923,6 @@ The depend mode will be disabled if any of the following holds:
 
 * the configuration setting *depend_mode* is false
 * the configuration setting *run_second_cpp* is false
-* the configuration setting *unify* is true
 * the compiler is not generating dependencies using *-MD* or *-MMD*
 
 
diff --git a/src/ccache.c b/src/ccache.c
index f06312dd9..03aa97a96 100644
--- a/src/ccache.c
+++ b/src/ccache.c
@@ -30,7 +30,6 @@
 #include "hashutil.h"
 #include "language.h"
 #include "manifest.h"
-#include "unify.h"
 
 #define STRINGIFY(x) #x
 #define TO_STRING(x) STRINGIFY(x)
@@ -1726,27 +1725,11 @@ get_object_name_from_cpp(struct args *args, struct hash *hash)
 		failed();
 	}
 
-	if (conf->unify) {
-		// When we are doing the unifying tricks we need to include the input file
-		// name in the hash to get the warnings right.
-		hash_delimiter(hash, "unifyfilename");
-		hash_string(hash, input_file);
-
-		hash_delimiter(hash, "unifycpp");
-
-		bool debug_unify = getenv("CCACHE_DEBUG_UNIFY");
-		if (unify_hash(hash, path_stdout, debug_unify) != 0) {
-			stats_update(STATS_ERROR);
-			cc_log("Failed to unify %s", path_stdout);
-			failed();
-		}
-	} else {
-		hash_delimiter(hash, "cpp");
-		if (!process_preprocessed_file(hash, path_stdout,
-		                               guessed_compiler == GUESSED_PUMP)) {
-			stats_update(STATS_ERROR);
-			failed();
-		}
+	hash_delimiter(hash, "cpp");
+	if (!process_preprocessed_file(hash, path_stdout,
+	                               guessed_compiler == GUESSED_PUMP)) {
+		stats_update(STATS_ERROR);
+		failed();
 	}
 
 	hash_delimiter(hash, "cppstderr");
@@ -3303,11 +3286,6 @@ cc_process_args(struct args *args,
 		}
 	} // for
 
-	if (generating_debuginfo && conf->unify) {
-		cc_log("Generating debug info; disabling unify mode");
-		conf->unify = false;
-	}
-
 	if (generating_debuginfo_level_3 && !conf->run_second_cpp) {
 		cc_log("Generating debug info level 3; not compiling preprocessed code");
 		conf->run_second_cpp = true;
@@ -3991,7 +3969,7 @@ ccache(int argc, char *argv[])
 
 	if (conf->depend_mode
 	    && (!generating_dependencies || str_eq(output_dep, "/dev/null")
-	        || !conf->run_second_cpp || conf->unify)) {
+	        || !conf->run_second_cpp)) {
 		cc_log("Disabling depend mode");
 		conf->depend_mode = false;
 	}
diff --git a/src/conf.c b/src/conf.c
index 9ddd14c02..92fe34bd6 100644
--- a/src/conf.c
+++ b/src/conf.c
@@ -160,7 +160,6 @@ conf_create(void)
 	conf->stats = true;
 	conf->temporary_dir = x_strdup("");
 	conf->umask = UINT_MAX; // Default: don't set umask.
-	conf->unify = false;
 	conf->item_origins = x_malloc(confitems_count() * sizeof(char *));
 	for (size_t i = 0; i < confitems_count(); ++i) {
 		conf->item_origins[i] = "default";
@@ -429,6 +428,5 @@ conf_print_items(struct conf *conf,
 	ok &= print_item(conf, "stats", printer, context);
 	ok &= print_item(conf, "temporary_dir", printer, context);
 	ok &= print_item(conf, "umask", printer, context);
-	ok &= print_item(conf, "unify", printer, context);
 	return ok;
 }
diff --git a/src/conf.h b/src/conf.h
index 52a48c8e7..6fb0844d2 100644
--- a/src/conf.h
+++ b/src/conf.h
@@ -37,7 +37,6 @@ struct conf {
 	bool stats;
 	char *temporary_dir;
 	unsigned umask;
-	bool unify;
 
 	const char **item_origins;
 };
diff --git a/src/confitems.gperf b/src/confitems.gperf
index 828e62eb3..60e48ca8f 100644
--- a/src/confitems.gperf
+++ b/src/confitems.gperf
@@ -53,4 +53,3 @@ sloppiness,                 ITEM(sloppiness, sloppiness)
 stats,                      ITEM(stats, bool)
 temporary_dir,              ITEM(temporary_dir, env_string)
 umask,                      ITEM(umask, umask)
-unify,                      ITEM(unify, bool)
diff --git a/src/envtoconfitems.gperf b/src/envtoconfitems.gperf
index 374d19c5d..ae5f37f29 100644
--- a/src/envtoconfitems.gperf
+++ b/src/envtoconfitems.gperf
@@ -45,4 +45,3 @@ SLOPPINESS, "sloppiness"
 STATS, "stats"
 TEMPDIR, "temporary_dir"
 UMASK, "umask"
-UNIFY, "unify"
diff --git a/src/unify.c b/src/unify.c
deleted file mode 100644
index 4b727842f..000000000
--- a/src/unify.c
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright (C) 2002 Andrew Tridgell
-// Copyright (C) 2009-2019 Joel Rosdahl
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 3 of the License, or (at your option)
-// any later version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-// more details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program; if not, write to the Free Software Foundation, Inc., 51
-// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-// C/C++ unifier
-//
-// The idea is that changes that don't affect the resulting C code should not
-// change the hash. This is achieved by folding white-space and other
-// non-semantic fluff in the input into a single unified format.
-//
-// This unifier was design to match the output of the unifier in compilercache,
-// which is flex based. The major difference is that this unifier is much
-// faster (about 2x) and more forgiving of syntactic errors. Continuing on
-// syntactic errors is important to cope with C/C++ extensions in the local
-// compiler (for example, inline assembly systems).
-
-#include "ccache.h"
-#include "hash.h"
-#include "unify.h"
-
-static bool print_unified = true;
-
-static const char *const s_tokens[] = {
-	"...", ">>=", "<<=", "+=", "-=", "*=", "/=", "%=", "&=", "^=",
-	"|=",  ">>",  "<<",  "++", "--", "->", "&&", "||", "<=", ">=",
-	"==",  "!=",  ";",   "{",  "<%", "}",  "%>", ",",  ":",  "=",
-	"(",   ")",   "[",   "<:", "]",  ":>", ".",  "&",  "!",  "~",
-	"-",   "+",   "*",   "/",  "%",  "<",  ">",  "^",  "|",  "?",
-	0
-};
-
-#define C_ALPHA 1
-#define C_SPACE 2
-#define C_TOKEN 4
-#define C_QUOTE 8
-#define C_DIGIT 16
-#define C_HEX   32
-#define C_FLOAT 64
-#define C_SIGN  128
-
-static struct {
-	unsigned char type;
-	unsigned char num_toks;
-	const char *toks[7];
-} tokens[256];
-
-// Build up the table used by the unifier.
-static void
-build_table(void)
-{
-	static bool done;
-	if (done) {
-		return;
-	}
-	done = true;
-
-	memset(tokens, 0, sizeof(tokens));
-	for (unsigned char c = 0; c < 128; c++) {
-		if (isalpha(c) || c == '_') {
-			tokens[c].type |= C_ALPHA;
-		}
-		if (isdigit(c)) {
-			tokens[c].type |= C_DIGIT;
-		}
-		if (isspace(c)) {
-			tokens[c].type |= C_SPACE;
-		}
-		if (isxdigit(c)) {
-			tokens[c].type |= C_HEX;
-		}
-	}
-	tokens['\''].type |= C_QUOTE;
-	tokens['"'].type |= C_QUOTE;
-	tokens['l'].type |= C_FLOAT;
-	tokens['L'].type |= C_FLOAT;
-	tokens['f'].type |= C_FLOAT;
-	tokens['F'].type |= C_FLOAT;
-	tokens['U'].type |= C_FLOAT;
-	tokens['u'].type |= C_FLOAT;
-
-	tokens['-'].type |= C_SIGN;
-	tokens['+'].type |= C_SIGN;
-
-	for (int i = 0; s_tokens[i]; i++) {
-		unsigned char c = s_tokens[i][0];
-		tokens[c].type |= C_TOKEN;
-		tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
-		tokens[c].num_toks++;
-	}
-}
-
-// Buffer up characters before hashing them.
-static void
-pushchar(struct hash *hash, unsigned char c)
-{
-	static unsigned char buf[64];
-	static size_t len;
-
-	if (c == 0) {
-		if (len > 0) {
-			hash_buffer(hash, (char *)buf, len);
-			if (print_unified) {
-				printf("%.*s", (int) len, buf);
-			}
-			len = 0;
-		}
-		return;
-	}
-
-	buf[len++] = c;
-	if (len == 64) {
-		hash_buffer(hash, (char *)buf, len);
-		if (print_unified) {
-			printf("%.*s", (int) len, buf);
-		}
-		len = 0;
-	}
-}
-
-// Hash some C/C++ code after unifying.
-static void
-unify(struct hash *hash, unsigned char *p, size_t size)
-{
-	build_table();
-
-	for (size_t ofs = 0; ofs < size;) {
-		if (p[ofs] == '#') {
-			if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
-				do {
-					ofs++;
-				} while (ofs < size && p[ofs] != '\n');
-				ofs++;
-			} else {
-				do {
-					pushchar(hash, p[ofs]);
-					ofs++;
-				} while (ofs < size && p[ofs] != '\n');
-				pushchar(hash, '\n');
-				ofs++;
-			}
-			continue;
-		}
-
-		if (tokens[p[ofs]].type & C_ALPHA) {
-			do {
-				pushchar(hash, p[ofs]);
-				ofs++;
-			} while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
-			pushchar(hash, '\n');
-			continue;
-		}
-
-		if (tokens[p[ofs]].type & C_DIGIT) {
-			do {
-				pushchar(hash, p[ofs]);
-				ofs++;
-			} while (ofs < size &&
-			         ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
-			if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
-				do {
-					pushchar(hash, p[ofs]);
-					ofs++;
-				} while (ofs < size && (tokens[p[ofs]].type & C_HEX));
-			}
-			if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
-				pushchar(hash, p[ofs]);
-				ofs++;
-				while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
-					pushchar(hash, p[ofs]);
-					ofs++;
-				}
-			}
-			while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
-				pushchar(hash, p[ofs]);
-				ofs++;
-			}
-			pushchar(hash, '\n');
-			continue;
-		}
-
-		if (tokens[p[ofs]].type & C_SPACE) {
-			do {
-				ofs++;
-			} while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
-			continue;
-		}
-
-		if (tokens[p[ofs]].type & C_QUOTE) {
-			unsigned char q = p[ofs];
-			pushchar(hash, p[ofs]);
-			do {
-				ofs++;
-				while (ofs < size-1 && p[ofs] == '\\') {
-					pushchar(hash, p[ofs]);
-					pushchar(hash, p[ofs+1]);
-					ofs += 2;
-				}
-				pushchar(hash, p[ofs]);
-			} while (ofs < size && p[ofs] != q);
-			pushchar(hash, '\n');
-			ofs++;
-			continue;
-		}
-
-		if (tokens[p[ofs]].type & C_TOKEN) {
-			unsigned char q = p[ofs];
-			int i;
-			for (i = 0; i < tokens[q].num_toks; i++) {
-				const unsigned char *s = (const unsigned char *)tokens[q].toks[i];
-				int len = strlen((const char *)s);
-				if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
-					int j;
-					for (j = 0; s[j]; j++) {
-						pushchar(hash, s[j]);
-						ofs++;
-					}
-					pushchar(hash, '\n');
-					break;
-				}
-			}
-			if (i < tokens[q].num_toks) {
-				continue;
-			}
-		}
-
-		pushchar(hash, p[ofs]);
-		pushchar(hash, '\n');
-		ofs++;
-	}
-	pushchar(hash, 0);
-}
-
-
-// Hash a file that consists of preprocessor output, but remove any line number
-// information from the hash.
-int
-unify_hash(struct hash *hash, const char *fname, bool debug)
-{
-	char *data;
-	size_t size;
-	if (!read_file(fname, 0, &data, &size)) {
-		stats_update(STATS_PREPROCESSOR);
-		return -1;
-	}
-	print_unified = debug;
-	unify(hash, (unsigned char *)data, size);
-	free(data);
-	return 0;
-}
diff --git a/src/unify.h b/src/unify.h
deleted file mode 100644
index eac167dc6..000000000
--- a/src/unify.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (C) 2018 Joel Rosdahl
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 3 of the License, or (at your option)
-// any later version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-// more details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program; if not, write to the Free Software Foundation, Inc., 51
-// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-#ifndef UNIFY_H
-#define UNIFY_H
-
-#include "hash.h"
-
-int unify_hash(struct hash *hash, const char *fname, bool print);
-
-#endif
diff --git a/test/suites/base.bash b/test/suites/base.bash
index fac44ea67..a8d8acdc4 100644
--- a/test/suites/base.bash
+++ b/test/suites/base.bash
@@ -434,22 +434,6 @@ base_tests() {
     expect_stat 'cache hit (preprocessed)' 2
     expect_stat 'cache miss' 1
 
-    # -------------------------------------------------------------------------
-    TEST "CCACHE_UNIFY"
-
-    echo '// a silly comment' >>test1.c
-    CCACHE_UNIFY=1 $CCACHE_COMPILE -c test1.c
-    expect_stat 'cache hit (preprocessed)' 0
-    expect_stat 'cache miss' 1
-
-    echo '// another silly comment' >>test1.c
-    CCACHE_UNIFY=1 $CCACHE_COMPILE -c test1.c
-    expect_stat 'cache hit (preprocessed)' 1
-    expect_stat 'cache miss' 1
-
-    $REAL_COMPILER -c -o reference_test1.o test1.c
-    expect_equal_object_files reference_test1.o test1.o
-
     # -------------------------------------------------------------------------
     TEST "CCACHE_NLEVELS"
 
diff --git a/unittest/test_conf.c b/unittest/test_conf.c
index bc458fce2..a203ec875 100644
--- a/unittest/test_conf.c
+++ b/unittest/test_conf.c
@@ -18,7 +18,7 @@
 #include "framework.h"
 #include "util.h"
 
-#define N_CONFIG_ITEMS 34
+#define N_CONFIG_ITEMS 33
 static struct {
 	char *descr;
 	char *origin;
@@ -83,7 +83,6 @@ TEST(conf_create)
 	CHECK(conf->stats);
 	CHECK_STR_EQ("", conf->temporary_dir);
 	CHECK_INT_EQ(UINT_MAX, conf->umask);
-	CHECK(!conf->unify);
 	conf_free(conf);
 }
 
@@ -135,8 +134,7 @@ TEST(conf_read_valid_config)
 		"sloppiness =     time_macros   ,include_file_mtime  include_file_ctime,file_stat_matches,file_stat_matches_ctime,pch_defines ,  no_system_headers,system_headers,clang_index_store\n"
 		"stats = false\n"
 		"temporary_dir = ${USER}_foo\n"
-		"umask = 777\n"
-		"unify = true"); // Note: no newline.
+		"umask = 777"); // Note: no newline.
 	CHECK(conf_read(conf, "ccache.conf", &errmsg));
 	CHECK(!errmsg);
 
@@ -185,7 +183,6 @@ TEST(conf_read_valid_config)
 	CHECK(!conf->stats);
 	CHECK_STR_EQ_FREE1(format("%s_foo", user), conf->temporary_dir);
 	CHECK_INT_EQ(0777, conf->umask);
-	CHECK(conf->unify);
 
 	conf_free(conf);
 }
@@ -499,7 +496,6 @@ TEST(conf_print_items)
 		false,
 		"td",
 		022,
-		true,
 		NULL
 	};
 	size_t n = 0;
@@ -553,7 +549,6 @@ TEST(conf_print_items)
 	CHECK_STR_EQ("stats = false", received_conf_items[n++].descr);
 	CHECK_STR_EQ("temporary_dir = td", received_conf_items[n++].descr);
 	CHECK_STR_EQ("umask = 022", received_conf_items[n++].descr);
-	CHECK_STR_EQ("unify = true", received_conf_items[n++].descr);
 
 	for (i = 0; i < N_CONFIG_ITEMS; ++i) {
 #ifndef __MINGW32__
-- 
2.47.2