From 47cb00f7623ae47deef0ab2fa54e2b73a04b08c9 Mon Sep 17 00:00:00 2001 From: Joel Rosdahl Date: Tue, 22 Sep 2020 08:34:26 +0200 Subject: [PATCH] Encode hash digests as 4 base16 digits + 29 base32hex digits Reducing file lengths should be beneficial since it reduces the number of needed system calls when scanning many files in the cache. The effect is very small but there is no real downside. See also b16001a67f4389956ef6e7ccf7d8023684b57119. Base32 is chosen since the encoding algorithm is very simple compared to e.g. base36. Base64 cannot be used since the encoded digest string is used in filenames and the cache directory may be located on a case insensitive filesystem. The base32hex variant is chosen instead of the other base32 variants since it feels more natural and there are no visual ambiguity issues. The first two bytes are encoded as base16 to maintain compatibility with the cleanup algorithm in older ccache versions and to allow for up to four uniform cache levels. --- src/Digest.hpp | 9 ++++++++- src/Util.cpp | 14 ++++++++++++++ src/Util.hpp | 4 ++++ test/suites/base.bash | 4 ++-- test/suites/direct.bash | 6 +++--- unittest/test_Hash.cpp | 17 ++++++++--------- unittest/test_Util.cpp | 13 +++++++++++++ 7 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/Digest.hpp b/src/Digest.hpp index f20684f16..6a7b53ecc 100644 --- a/src/Digest.hpp +++ b/src/Digest.hpp @@ -72,7 +72,14 @@ Digest::size() inline std::string Digest::to_string() const { - return Util::format_base16(m_bytes, size()); + // The first two bytes are encoded as four lowercase base16 digits to maintain + // compatibility with the cleanup algorithm in older ccache versions and to + // allow for up to four uniform cache levels. The rest are encoded as + // lowercase base32hex digits without padding characters. + const size_t base16_bytes = 2; + return Util::format_base16(m_bytes, base16_bytes) + + Util::format_base32hex(m_bytes + base16_bytes, + size() - base16_bytes); } inline bool diff --git a/src/Util.cpp b/src/Util.cpp index 154a942d3..24e4b4f28 100644 --- a/src/Util.cpp +++ b/src/Util.cpp @@ -25,6 +25,10 @@ #include "Logging.hpp" #include "TemporaryFile.hpp" +extern "C" { +#include "third_party/base32hex.h" +} + #include #include @@ -534,6 +538,16 @@ format_base16(const uint8_t* data, size_t size) return result; } +std::string +format_base32hex(const uint8_t* data, size_t size) +{ + const size_t bytes_to_reserve = size * 8 / 5 + 1; + std::string result(bytes_to_reserve, 0); + const size_t actual_size = base32hex(&result[0], data, size); + result.resize(actual_size); + return result; +} + std::string format_human_readable_size(uint64_t size) { diff --git a/src/Util.hpp b/src/Util.hpp index 2876b7012..d2c7bff8f 100644 --- a/src/Util.hpp +++ b/src/Util.hpp @@ -170,6 +170,10 @@ std::string format_argv_for_logging(const char* const* argv); // string will be `2 * size` long. std::string format_base16(const uint8_t* data, size_t size); +// Format a lowercase base32hex string representing `size` bytes of `data`. No +// padding characters will be added. +std::string format_base32hex(const uint8_t* data, size_t size); + // Format `size` as a human-readable string. std::string format_human_readable_size(uint64_t size); diff --git a/test/suites/base.bash b/test/suites/base.bash index d480413c4..d596d6a39 100644 --- a/test/suites/base.bash +++ b/test/suites/base.bash @@ -1272,8 +1272,8 @@ EOF $CCACHE --hash-file /dev/null > hash.out printf "a" | $CCACHE --hash-file - >> hash.out - hash_0='af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9' - hash_1='17762fddd969a453925d65717ac3eea21320b66b' + hash_0='af1396svbud1kqg40jfa6reciicrpcisi' + hash_1='17765vetiqd4ae95qpbhfb1ut8gj42r6m' if grep "$hash_0" hash.out >/dev/null 2>&1 && \ grep "$hash_1" hash.out >/dev/null 2>&1; then diff --git a/test/suites/direct.bash b/test/suites/direct.bash index 45689f32c..82a3a5593 100644 --- a/test/suites/direct.bash +++ b/test/suites/direct.bash @@ -1026,9 +1026,9 @@ EOF manifest=`find $CCACHE_DIR -name '*M'` $CCACHE --dump-manifest $manifest >manifest.dump - checksum_test1_h='b7271c414e35d190304ccbb02cdce8aaa391497e' - checksum_test2_h='24f1184b3644bd65db35d8de74fbe468757a4200' - checksum_test3_h='56a66d1ef7bfe44154ecd084e395a1c8d55bb3a1' + checksum_test1_h='b7273h0ksdehi0o4pitg5jeehal3i54ns' + checksum_test2_h='24f1315jch5tcndjbm6uejtu8q3lf9100' + checksum_test3_h='56a6dkffffv485aepk44seaq3i6lbepq2' if grep "Hash: $checksum_test1_h" manifest.dump >/dev/null 2>&1 && \ grep "Hash: $checksum_test2_h" manifest.dump >/dev/null 2>&1 && \ diff --git a/unittest/test_Hash.cpp b/unittest/test_Hash.cpp index c1a864457..74351c6e7 100644 --- a/unittest/test_Hash.cpp +++ b/unittest/test_Hash.cpp @@ -26,26 +26,25 @@ TEST_CASE("known strings") { SUBCASE("initial state") { - CHECK(Hash().digest().to_string() - == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9"); + CHECK(Hash().digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi"); } SUBCASE("empty string") { CHECK(Hash().hash("").digest().to_string() - == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9"); + == "af1396svbud1kqg40jfa6reciicrpcisi"); } SUBCASE("a") { CHECK(Hash().hash("a").digest().to_string() - == "17762fddd969a453925d65717ac3eea21320b66b"); + == "17765vetiqd4ae95qpbhfb1ut8gj42r6m"); } SUBCASE("message digest") { CHECK(Hash().hash("message digest").digest().to_string() - == "7bc2a2eeb95ddbf9b7ecf6adcb76b453091c58dc"); + == "7bc2kbnbinerv6ruptldpdrb8ko93hcdo"); } SUBCASE("long string") @@ -54,7 +53,7 @@ TEST_CASE("known strings") "123456789012345678901234567890123456789012345678901234567890" "12345678901234567890"; CHECK(Hash().hash(long_string).digest().to_string() - == "f263acf51621980b9c8de5da4a17d314984e05ab"); + == "f263ljqhc8co1ee8rpeq98bt654o9o2qm"); } } @@ -64,14 +63,14 @@ TEST_CASE("Hash::digest should not alter state") h.hash("message"); h.digest(); h.hash(" digest"); - CHECK(h.digest().to_string() == "7bc2a2eeb95ddbf9b7ecf6adcb76b453091c58dc"); + CHECK(h.digest().to_string() == "7bc2kbnbinerv6ruptldpdrb8ko93hcdo"); } TEST_CASE("Hash::digest should be idempotent") { Hash h; - CHECK(h.digest().to_string() == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9"); - CHECK(h.digest().to_string() == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9"); + CHECK(h.digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi"); + CHECK(h.digest().to_string() == "af1396svbud1kqg40jfa6reciicrpcisi"); } TEST_CASE("Digest::bytes") diff --git a/unittest/test_Util.cpp b/unittest/test_Util.cpp index cf3e56497..5633c0220 100644 --- a/unittest/test_Util.cpp +++ b/unittest/test_Util.cpp @@ -284,6 +284,19 @@ TEST_CASE("Util::format_base16") CHECK(Util::format_base16(data, sizeof(data)) == "00010203"); } +TEST_CASE("Util::format_base32hex") +{ + // Test vectors (without padding) from RFC 4648. + const uint8_t input[] = {'f', 'o', 'o', 'b', 'a', 'r'}; + CHECK(Util::format_base32hex(input, 0) == ""); + CHECK(Util::format_base32hex(input, 1) == "co"); + CHECK(Util::format_base32hex(input, 2) == "cpng"); + CHECK(Util::format_base32hex(input, 3) == "cpnmu"); + CHECK(Util::format_base32hex(input, 4) == "cpnmuog"); + CHECK(Util::format_base32hex(input, 5) == "cpnmuoj1"); + CHECK(Util::format_base32hex(input, 6) == "cpnmuoj1e8"); +} + TEST_CASE("Util::format_human_readable_size") { CHECK(Util::format_human_readable_size(0) == "0.0 kB"); -- 2.47.3