From 47cb00f7623ae47deef0ab2fa54e2b73a04b08c9 Mon Sep 17 00:00:00 2001
From: Joel Rosdahl <joel@rosdahl.net>
Date: Tue, 22 Sep 2020 08:34:26 +0200
Subject: [PATCH] Encode hash digests as 4 base16 digits + 29 base32hex digits

Reducing file lengths should be beneficial since it reduces the number
of needed system calls when scanning many files in the cache. The effect
is very small but there is no real downside. See also
b16001a67f4389956ef6e7ccf7d8023684b57119.

Base32 is chosen since the encoding algorithm is very simple compared to
e.g. base36. Base64 cannot be used since the encoded digest string is
used in filenames and the cache directory may be located on a case
insensitive filesystem. The base32hex variant is chosen instead of the
other base32 variants since it feels more natural and there are no
visual ambiguity issues.

The first two bytes are encoded as base16 to maintain compatibility with
the cleanup algorithm in older ccache versions and to allow for up to
four uniform cache levels.
---
 src/Digest.hpp          |  9 ++++++++-
 src/Util.cpp            | 14 ++++++++++++++
 src/Util.hpp            |  4 ++++
 test/suites/base.bash   |  4 ++--
 test/suites/direct.bash |  6 +++---
 unittest/test_Hash.cpp  | 17 ++++++++---------
 unittest/test_Util.cpp  | 13 +++++++++++++
 7 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/src/Digest.hpp b/src/Digest.hpp
index f20684f16..6a7b53ecc 100644
--- a/src/Digest.hpp
+++ b/src/Digest.hpp
@@ -72,7 +72,14 @@ Digest::size()
 inline std::string
 Digest::to_string() const
 {
-  return Util::format_base16(m_bytes, size());
+  // The first two bytes are encoded as four lowercase base16 digits to maintain
+  // compatibility with the cleanup algorithm in older ccache versions and to
+  // allow for up to four uniform cache levels. The rest are encoded as
+  // lowercase base32hex digits without padding characters.
+  const size_t base16_bytes = 2;
+  return Util::format_base16(m_bytes, base16_bytes)
+         + Util::format_base32hex(m_bytes + base16_bytes,
+                                  size() - base16_bytes);
 }
 
 inline bool
diff --git a/src/Util.cpp b/src/Util.cpp
index 154a942d3..24e4b4f28 100644
--- a/src/Util.cpp
+++ b/src/Util.cpp
@@ -25,6 +25,10 @@
 #include "Logging.hpp"
 #include "TemporaryFile.hpp"
 
+extern "C" {
+#include "third_party/base32hex.h"
+}
+
 #include <algorithm>
 #include <fstream>
 
@@ -534,6 +538,16 @@ format_base16(const uint8_t* data, size_t size)
   return result;
 }
 
+std::string
+format_base32hex(const uint8_t* data, size_t size)
+{
+  const size_t bytes_to_reserve = size * 8 / 5 + 1;
+  std::string result(bytes_to_reserve, 0);
+  const size_t actual_size = base32hex(&result[0], data, size);
+  result.resize(actual_size);
+  return result;
+}
+
 std::string
 format_human_readable_size(uint64_t size)
 {
diff --git a/src/Util.hpp b/src/Util.hpp
index 2876b7012..d2c7bff8f 100644
--- a/src/Util.hpp
+++ b/src/Util.hpp
@@ -170,6 +170,10 @@ std::string format_argv_for_logging(const char* const* argv);
 // string will be `2 * size` long.
 std::string format_base16(const uint8_t* data, size_t size);
 
+// Format a lowercase base32hex string representing `size` bytes of `data`. No
+// padding characters will be added.
+std::string format_base32hex(const uint8_t* data, size_t size);
+
 // Format `size` as a human-readable string.
 std::string format_human_readable_size(uint64_t size);
 
diff --git a/test/suites/base.bash b/test/suites/base.bash
index d480413c4..d596d6a39 100644
--- a/test/suites/base.bash
+++ b/test/suites/base.bash
@@ -1272,8 +1272,8 @@ EOF
     $CCACHE --hash-file /dev/null > hash.out
     printf "a" | $CCACHE --hash-file - >> hash.out
 
-    hash_0='af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9'
-    hash_1='17762fddd969a453925d65717ac3eea21320b66b'
+    hash_0='af1396svbud1kqg40jfa6reciicrpcisi'
+    hash_1='17765vetiqd4ae95qpbhfb1ut8gj42r6m'
 
     if grep "$hash_0" hash.out >/dev/null 2>&1 && \
        grep "$hash_1" hash.out >/dev/null 2>&1; then
diff --git a/test/suites/direct.bash b/test/suites/direct.bash
index 45689f32c..82a3a5593 100644
--- a/test/suites/direct.bash
+++ b/test/suites/direct.bash
@@ -1026,9 +1026,9 @@ EOF
     manifest=`find $CCACHE_DIR -name '*M'`
     $CCACHE --dump-manifest $manifest >manifest.dump
 
-    checksum_test1_h='b7271c414e35d190304ccbb02cdce8aaa391497e'
-    checksum_test2_h='24f1184b3644bd65db35d8de74fbe468757a4200'
-    checksum_test3_h='56a66d1ef7bfe44154ecd084e395a1c8d55bb3a1'
+    checksum_test1_h='b7273h0ksdehi0o4pitg5jeehal3i54ns'
+    checksum_test2_h='24f1315jch5tcndjbm6uejtu8q3lf9100'
+    checksum_test3_h='56a6dkffffv485aepk44seaq3i6lbepq2'
 
     if grep "Hash: $checksum_test1_h" manifest.dump >/dev/null 2>&1 && \
        grep "Hash: $checksum_test2_h" manifest.dump >/dev/null 2>&1 && \
diff --git a/unittest/test_Hash.cpp b/unittest/test_Hash.cpp
index c1a864457..74351c6e7 100644
--- a/unittest/test_Hash.cpp
+++ b/unittest/test_Hash.cpp
@@ -26,26 +26,25 @@ TEST_CASE("known strings")
 {
   SUBCASE("initial state")
   {
-    CHECK(Hash().digest().to_string()
-          == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9");
+    CHECK(Hash().digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi");
   }
 
   SUBCASE("empty string")
   {
     CHECK(Hash().hash("").digest().to_string()
-          == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9");
+          == "af1396svbud1kqg40jfa6reciicrpcisi");
   }
 
   SUBCASE("a")
   {
     CHECK(Hash().hash("a").digest().to_string()
-          == "17762fddd969a453925d65717ac3eea21320b66b");
+          == "17765vetiqd4ae95qpbhfb1ut8gj42r6m");
   }
 
   SUBCASE("message digest")
   {
     CHECK(Hash().hash("message digest").digest().to_string()
-          == "7bc2a2eeb95ddbf9b7ecf6adcb76b453091c58dc");
+          == "7bc2kbnbinerv6ruptldpdrb8ko93hcdo");
   }
 
   SUBCASE("long string")
@@ -54,7 +53,7 @@ TEST_CASE("known strings")
       "123456789012345678901234567890123456789012345678901234567890"
       "12345678901234567890";
     CHECK(Hash().hash(long_string).digest().to_string()
-          == "f263acf51621980b9c8de5da4a17d314984e05ab");
+          == "f263ljqhc8co1ee8rpeq98bt654o9o2qm");
   }
 }
 
@@ -64,14 +63,14 @@ TEST_CASE("Hash::digest should not alter state")
   h.hash("message");
   h.digest();
   h.hash(" digest");
-  CHECK(h.digest().to_string() == "7bc2a2eeb95ddbf9b7ecf6adcb76b453091c58dc");
+  CHECK(h.digest().to_string() == "7bc2kbnbinerv6ruptldpdrb8ko93hcdo");
 }
 
 TEST_CASE("Hash::digest should be idempotent")
 {
   Hash h;
-  CHECK(h.digest().to_string() == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9");
-  CHECK(h.digest().to_string() == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9");
+  CHECK(h.digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi");
+  CHECK(h.digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi");
 }
 
 TEST_CASE("Digest::bytes")
diff --git a/unittest/test_Util.cpp b/unittest/test_Util.cpp
index cf3e56497..5633c0220 100644
--- a/unittest/test_Util.cpp
+++ b/unittest/test_Util.cpp
@@ -284,6 +284,19 @@ TEST_CASE("Util::format_base16")
   CHECK(Util::format_base16(data, sizeof(data)) == "00010203");
 }
 
+TEST_CASE("Util::format_base32hex")
+{
+  // Test vectors (without padding) from RFC 4648.
+  const uint8_t input[] = {'f', 'o', 'o', 'b', 'a', 'r'};
+  CHECK(Util::format_base32hex(input, 0) == "");
+  CHECK(Util::format_base32hex(input, 1) == "co");
+  CHECK(Util::format_base32hex(input, 2) == "cpng");
+  CHECK(Util::format_base32hex(input, 3) == "cpnmu");
+  CHECK(Util::format_base32hex(input, 4) == "cpnmuog");
+  CHECK(Util::format_base32hex(input, 5) == "cpnmuoj1");
+  CHECK(Util::format_base32hex(input, 6) == "cpnmuoj1e8");
+}
+
 TEST_CASE("Util::format_human_readable_size")
 {
   CHECK(Util::format_human_readable_size(0) == "0.0 kB");
-- 
2.47.3