From: Joel Rosdahl Date: Wed, 29 Dec 2021 19:50:29 +0000 (+0100) Subject: refactor: Decouple Manifest data structure from how it’s stored X-Git-Tag: v4.6~52 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9882ac2d27985880fee525301cba831c16ad70bf;p=thirdparty%2Fccache.git refactor: Decouple Manifest data structure from how it’s stored Manifest is now a standalone data structure that knows how to serialize and deserialize itself but not how it’s embedded in a cache entry. --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1fb096957..1dc6bdbdd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,6 @@ set( Hash.cpp Lockfile.cpp Logging.cpp - Manifest.cpp ProgressBar.cpp Result.cpp ResultDumper.cpp diff --git a/src/Manifest.cpp b/src/Manifest.cpp deleted file mode 100644 index 341e66dbf..000000000 --- a/src/Manifest.cpp +++ /dev/null @@ -1,608 +0,0 @@ -// Copyright (C) 2009-2021 Joel Rosdahl and other contributors -// -// See doc/AUTHORS.adoc for a complete list of contributors. -// -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3 of the License, or (at your option) -// any later version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. 
-// -// You should have received a copy of the GNU General Public License along with -// this program; if not, write to the Free Software Foundation, Inc., 51 -// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -#include "Manifest.hpp" - -#include "AtomicFile.hpp" -#include "Config.hpp" -#include "Context.hpp" -#include "Digest.hpp" -#include "File.hpp" -#include "Hash.hpp" -#include "Logging.hpp" -#include "fmtmacros.hpp" -#include "hashutil.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include - -// Manifest data format -// ==================== -// -// Integers are big-endian. -// -// <payload> ::= <format_ver> <paths> <includes> <results> -// <format_ver> ::= uint8_t -// <paths> ::= <n_paths> <path_entry>* -// <n_paths> ::= uint32_t -// <path_entry> ::= <path_len> <path> -// <path_len> ::= uint16_t -// <path> ::= path_len bytes -// <includes> ::= <n_includes> <include_entry>* -// <n_includes> ::= uint32_t -// <include_entry> ::= <path_index> <digest> <fsize> <mtime> <ctime> -// <path_index> ::= uint32_t -// <digest> ::= Digest::size() bytes -// <fsize> ::= uint64_t ; file size -// <mtime> ::= int64_t ; modification time -// <ctime> ::= int64_t ; status change time -// <results> ::= <n_results> <result>* -// <n_results> ::= uint32_t -// <result> ::= <n_indexes> <include_index>* <key> -// <n_indexes> ::= uint32_t -// <include_index> ::= uint32_t -// <key> ::= Digest::size() bytes - -using nonstd::nullopt; -using nonstd::optional; - -const uint8_t k_manifest_format_version = 0; -const uint32_t k_max_manifest_entries = 100; -const uint32_t k_max_manifest_file_info_entries = 10000; - -namespace { - -struct FileInfo -{ - // Index to n_files. - uint32_t index; - // Digest of referenced file. - Digest digest; - // Size of referenced file. - uint64_t fsize; - // mtime of referenced file. - int64_t mtime; - // ctime of referenced file. - int64_t ctime; -}; - -bool -operator==(const FileInfo& lhs, const FileInfo& rhs) -{ - return lhs.index == rhs.index && lhs.digest == rhs.digest - && lhs.fsize == rhs.fsize && lhs.mtime == rhs.mtime - && lhs.ctime == rhs.ctime; -} - -} // namespace - -namespace std { - -template<> struct hash -{ - size_t - operator()(const FileInfo& file_info) const - { - static_assert(sizeof(FileInfo) == 48, "unexpected size"); // No padding. 
- util::XXH3_64 hash; - hash.update(&file_info, sizeof(file_info)); - return hash.digest(); - } -}; - -} // namespace std - -namespace { - -struct ResultEntry -{ - // Indexes to file_infos. - std::vector file_info_indexes; - - // Key of the result. - Digest key; -}; - -bool -operator==(const ResultEntry& lhs, const ResultEntry& rhs) -{ - return lhs.file_info_indexes == rhs.file_info_indexes && lhs.key == rhs.key; -} - -struct ManifestData -{ - // Referenced include files. - std::vector files; - - // Information about referenced include files. - std::vector file_infos; - - // Result keys plus references to include file infos. - std::vector results; - - bool - add_result_entry( - const Digest& result_key, - const std::unordered_map& included_files, - time_t time_of_compilation, - bool save_timestamp) - { - std::unordered_map mf_files; - for (uint32_t i = 0; i < files.size(); ++i) { - mf_files.emplace(files[i], i); - } - - std::unordered_map mf_file_infos; - for (uint32_t i = 0; i < file_infos.size(); ++i) { - mf_file_infos.emplace(file_infos[i], i); - } - - std::vector file_info_indexes; - file_info_indexes.reserve(included_files.size()); - - for (const auto& item : included_files) { - file_info_indexes.push_back(get_file_info_index(item.first, - item.second, - mf_files, - mf_file_infos, - time_of_compilation, - save_timestamp)); - } - - ResultEntry entry{std::move(file_info_indexes), result_key}; - if (std::find(results.begin(), results.end(), entry) == results.end()) { - results.push_back(std::move(entry)); - return true; - } else { - return false; - } - } - -private: - uint32_t - get_file_info_index( - const std::string& path, - const Digest& digest, - const std::unordered_map& mf_files, - const std::unordered_map& mf_file_infos, - time_t time_of_compilation, - bool save_timestamp) - { - struct FileInfo fi; - - auto f_it = mf_files.find(path); - if (f_it != mf_files.end()) { - fi.index = f_it->second; - } else { - files.push_back(path); - fi.index = files.size() - 
1; - } - - fi.digest = digest; - - // file_stat.{m,c}time() have a resolution of 1 second, so we can cache the - // file's mtime and ctime only if they're at least one second older than - // time_of_compilation. - // - // file_stat.ctime() may be 0, so we have to check time_of_compilation - // against MAX(mtime, ctime). - // - // ccache only reads mtime/ctime if file_stat_match sloppiness is enabled, - // so mtimes/ctimes are stored as a dummy value (-1) if not enabled. This - // reduces the number of file_info entries for the common case. - - auto file_stat = Stat::stat(path, Stat::OnError::log); - if (file_stat) { - if (save_timestamp - && time_of_compilation - > std::max(file_stat.mtime(), file_stat.ctime())) { - fi.mtime = file_stat.mtime(); - fi.ctime = file_stat.ctime(); - } else { - fi.mtime = -1; - fi.ctime = -1; - } - fi.fsize = file_stat.size(); - } else { - fi.mtime = -1; - fi.ctime = -1; - fi.fsize = 0; - } - - auto fi_it = mf_file_infos.find(fi); - if (fi_it != mf_file_infos.end()) { - return fi_it->second; - } else { - file_infos.push_back(fi); - return file_infos.size() - 1; - } - } -}; - -struct FileStats -{ - uint64_t size; - int64_t mtime; - int64_t ctime; -}; - -std::unique_ptr -read_manifest(const std::string& path, FILE* dump_stream = nullptr) -{ - FILE* file_stream; - File file; - if (path == "-") { - file_stream = stdin; - } else { - file = File(path, "rb"); - if (!file) { - return {}; - } - file_stream = file.get(); - } - - core::FileReader file_reader(file_stream); - core::CacheEntryReader reader(file_reader); - - if (dump_stream) { - reader.header().dump(dump_stream); - } - - const auto format_ver = reader.read_int(); - if (format_ver != k_manifest_format_version) { - throw core::Error("Unknown manifest format version: {}", format_ver); - } - - if (dump_stream) { - PRINT(dump_stream, "Manifest format version: {}\n", format_ver); - } - - auto mf = std::make_unique(); - - const auto file_count = reader.read_int(); - for (uint32_t i = 0; i < 
file_count; ++i) { - mf->files.push_back(reader.read_str(reader.read_int())); - } - - const auto file_info_count = reader.read_int(); - for (uint32_t i = 0; i < file_info_count; ++i) { - mf->file_infos.emplace_back(); - auto& entry = mf->file_infos.back(); - - reader.read_int(entry.index); - reader.read(entry.digest.bytes(), Digest::size()); - reader.read_int(entry.fsize); - reader.read_int(entry.mtime); - reader.read_int(entry.ctime); - } - - const auto result_count = reader.read_int(); - for (uint32_t i = 0; i < result_count; ++i) { - mf->results.emplace_back(); - auto& entry = mf->results.back(); - - const auto file_info_index_count = reader.read_int(); - for (uint32_t j = 0; j < file_info_index_count; ++j) { - entry.file_info_indexes.push_back(reader.read_int()); - } - reader.read(entry.key.bytes(), Digest::size()); - } - - reader.finalize(); - return mf; -} - -bool -write_manifest(const Config& config, - const std::string& path, - const ManifestData& mf) -{ - uint64_t payload_size = 0; - payload_size += 1; // format_ver - payload_size += 4; // n_files - for (const auto& file : mf.files) { - payload_size += 2 + file.length(); - } - payload_size += 4; // n_file_infos - payload_size += mf.file_infos.size() * (4 + Digest::size() + 8 + 8 + 8); - payload_size += 4; // n_results - for (const auto& result : mf.results) { - payload_size += 4; // n_file_info_indexes - payload_size += result.file_info_indexes.size() * 4; - payload_size += Digest::size(); - } - - AtomicFile atomic_manifest_file(path, AtomicFile::Mode::binary); - core::FileWriter file_writer(atomic_manifest_file.stream()); - core::CacheEntryHeader header(core::CacheEntryType::manifest, - compression::type_from_config(config), - compression::level_from_config(config), - time(nullptr), - CCACHE_VERSION, - config.namespace_()); - header.set_entry_size_from_payload_size(payload_size); - - core::CacheEntryWriter writer(file_writer, header); - writer.write_int(k_manifest_format_version); - 
writer.write_int(mf.files.size()); - for (const auto& file : mf.files) { - writer.write_int(file.length()); - writer.write_str(file); - } - - writer.write_int(mf.file_infos.size()); - for (const auto& file_info : mf.file_infos) { - writer.write_int(file_info.index); - writer.write(file_info.digest.bytes(), Digest::size()); - writer.write_int(file_info.fsize); - writer.write_int(file_info.mtime); - writer.write_int(file_info.ctime); - } - - writer.write_int(mf.results.size()); - for (const auto& result : mf.results) { - writer.write_int(result.file_info_indexes.size()); - for (auto index : result.file_info_indexes) { - writer.write_int(index); - } - writer.write(result.key.bytes(), Digest::size()); - } - - writer.finalize(); - atomic_manifest_file.commit(); - return true; -} - -bool -verify_result(const Context& ctx, - const ManifestData& mf, - const ResultEntry& result, - std::unordered_map& stated_files, - std::unordered_map& hashed_files) -{ - for (uint32_t file_info_index : result.file_info_indexes) { - const auto& fi = mf.file_infos[file_info_index]; - const auto& path = mf.files[fi.index]; - - auto stated_files_iter = stated_files.find(path); - if (stated_files_iter == stated_files.end()) { - auto file_stat = Stat::stat(path, Stat::OnError::log); - if (!file_stat) { - return false; - } - FileStats st; - st.size = file_stat.size(); - st.mtime = file_stat.mtime(); - st.ctime = file_stat.ctime(); - stated_files_iter = stated_files.emplace(path, st).first; - } - const FileStats& fs = stated_files_iter->second; - - if (fi.fsize != fs.size) { - return false; - } - - // Clang stores the mtime of the included files in the precompiled header, - // and will error out if that header is later used without rebuilding. 
- if ((ctx.config.compiler_type() == CompilerType::clang - || ctx.config.compiler_type() == CompilerType::other) - && ctx.args_info.output_is_precompiled_header - && !ctx.args_info.fno_pch_timestamp && fi.mtime != fs.mtime) { - LOG("Precompiled header includes {}, which has a new mtime", path); - return false; - } - - if (ctx.config.sloppiness().is_enabled(core::Sloppy::file_stat_matches)) { - if (!(ctx.config.sloppiness().is_enabled( - core::Sloppy::file_stat_matches_ctime))) { - if (fi.mtime == fs.mtime && fi.ctime == fs.ctime) { - LOG("mtime/ctime hit for {}", path); - continue; - } else { - LOG("mtime/ctime miss for {}", path); - } - } else { - if (fi.mtime == fs.mtime) { - LOG("mtime hit for {}", path); - continue; - } else { - LOG("mtime miss for {}", path); - } - } - } - - auto hashed_files_iter = hashed_files.find(path); - if (hashed_files_iter == hashed_files.end()) { - Hash hash; - int ret = hash_source_code_file(ctx, hash, path, fs.size); - if (ret & HASH_SOURCE_CODE_ERROR) { - LOG("Failed hashing {}", path); - return false; - } - if (ret & HASH_SOURCE_CODE_FOUND_TIME) { - return false; - } - - Digest actual = hash.digest(); - hashed_files_iter = hashed_files.emplace(path, actual).first; - } - - if (fi.digest != hashed_files_iter->second) { - return false; - } - } - - return true; -} - -} // namespace - -namespace Manifest { - -const std::string k_file_suffix = "M"; -const uint8_t k_magic[4] = {'c', 'C', 'm', 'F'}; -const uint8_t k_version = 2; - -// Try to get the result key from a manifest file. Returns nullopt on failure. -optional -get(const Context& ctx, const std::string& path) -{ - std::unique_ptr mf; - try { - mf = read_manifest(path); - if (!mf) { - LOG_RAW("No such manifest file"); - return nullopt; - } - } catch (const core::Error& e) { - LOG("Error: {}", e.what()); - return nullopt; - } - - std::unordered_map stated_files; - std::unordered_map hashed_files; - - // Check newest result first since it's a bit more likely to match. 
- for (uint32_t i = mf->results.size(); i > 0; i--) { - if (verify_result( - ctx, *mf, mf->results[i - 1], stated_files, hashed_files)) { - return mf->results[i - 1].key; - } - } - - return nullopt; -} - -// Put the result key into a manifest file given a set of included files. -// Returns true on success, otherwise false. -bool -put(const Config& config, - const std::string& path, - const Digest& result_key, - const std::unordered_map& included_files, - - time_t time_of_compilation, - bool save_timestamp) -{ - // We don't bother to acquire a lock when writing the manifest to disk. A - // race between two processes will only result in one lost entry, which is - // not a big deal, and it's also very unlikely. - - std::unique_ptr mf; - try { - mf = read_manifest(path); - if (!mf) { - // Manifest file didn't exist. - mf = std::make_unique(); - } - } catch (const core::Error& e) { - LOG("Error: {}", e.what()); - // Manifest file was corrupt, ignore. - mf = std::make_unique(); - } - - if (mf->results.size() > k_max_manifest_entries) { - // Normally, there shouldn't be many result entries in the manifest since - // new entries are added only if an include file has changed but not the - // source file, and you typically change source files more often than - // header files. However, it's certainly possible to imagine cases where - // the manifest will grow large (for instance, a generated header file that - // changes for every build), and this must be taken care of since - // processing an ever growing manifest eventually will take too much time. - // A good way of solving this would be to maintain the result entries in - // LRU order and discarding the old ones. An easy way is to throw away all - // entries when there are too many. Let's do that for now. 
- LOG("More than {} entries in manifest file; discarding", - k_max_manifest_entries); - mf = std::make_unique(); - } else if (mf->file_infos.size() > k_max_manifest_file_info_entries) { - // Rarely, FileInfo entries can grow large in pathological cases where - // many included files change, but the main file does not. This also puts - // an upper bound on the number of FileInfo entries. - LOG("More than {} FileInfo entries in manifest file; discarding", - k_max_manifest_file_info_entries); - mf = std::make_unique(); - } - - bool added = mf->add_result_entry( - result_key, included_files, time_of_compilation, save_timestamp); - - if (added) { - try { - write_manifest(config, path, *mf); - return true; - } catch (const core::Error& e) { - LOG("Error: {}", e.what()); - } - } else { - LOG_RAW("The entry already exists in the manifest, not adding"); - } - return false; -} - -bool -dump(const std::string& path, FILE* stream) -{ - std::unique_ptr mf; - try { - mf = read_manifest(path, stream); - } catch (const core::Error& e) { - PRINT(stream, "Error: {}\n", e.what()); - return false; - } - - if (!mf) { - PRINT(stream, "Error: No such file: {}\n", path); - return false; - } - - PRINT(stream, "File paths ({}):\n", mf->files.size()); - for (size_t i = 0; i < mf->files.size(); ++i) { - PRINT(stream, " {}: {}\n", i, mf->files[i]); - } - PRINT(stream, "File infos ({}):\n", mf->file_infos.size()); - for (size_t i = 0; i < mf->file_infos.size(); ++i) { - PRINT(stream, " {}:\n", i); - PRINT(stream, " Path index: {}\n", mf->file_infos[i].index); - PRINT(stream, " Hash: {}\n", mf->file_infos[i].digest.to_string()); - PRINT(stream, " File size: {}\n", mf->file_infos[i].fsize); - PRINT(stream, " Mtime: {}\n", mf->file_infos[i].mtime); - PRINT(stream, " Ctime: {}\n", mf->file_infos[i].ctime); - } - PRINT(stream, "Results ({}):\n", mf->results.size()); - for (size_t i = 0; i < mf->results.size(); ++i) { - PRINT(stream, " {}:\n", i); - PRINT_RAW(stream, " File info indexes:"); - for 
(uint32_t file_info_index : mf->results[i].file_info_indexes) { - PRINT(stream, " {}", file_info_index); - } - PRINT_RAW(stream, "\n"); - PRINT(stream, " Key: {}\n", mf->results[i].key.to_string()); - } - - return true; -} - -} // namespace Manifest diff --git a/src/Manifest.hpp b/src/Manifest.hpp deleted file mode 100644 index 0230c707a..000000000 --- a/src/Manifest.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2009-2021 Joel Rosdahl and other contributors -// -// See doc/AUTHORS.adoc for a complete list of contributors. -// -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3 of the License, or (at your option) -// any later version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. 
-// -// You should have received a copy of the GNU General Public License along with -// this program; if not, write to the Free Software Foundation, Inc., 51 -// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -#pragma once - -#include "third_party/nonstd/optional.hpp" - -#include -#include -#include -#include -#include - -class Config; -class Context; -class Digest; - -namespace Manifest { - -extern const std::string k_file_suffix; -extern const uint8_t k_magic[4]; -extern const uint8_t k_version; - -nonstd::optional get(const Context& ctx, const std::string& path); -bool put(const Config& config, - const std::string& path, - const Digest& result_key, - const std::unordered_map& included_files, - time_t time_of_compilation, - bool save_timestamp); -bool dump(const std::string& path, FILE* stream); - -} // namespace Manifest diff --git a/src/ccache.cpp b/src/ccache.cpp index 9e58317de..422d97695 100644 --- a/src/ccache.cpp +++ b/src/ccache.cpp @@ -30,7 +30,6 @@ #include "Hash.hpp" #include "Lockfile.hpp" #include "Logging.hpp" -#include "Manifest.hpp" #include "MiniTrace.hpp" #include "Result.hpp" #include "ResultRetriever.hpp" @@ -46,7 +45,13 @@ #include "hashutil.hpp" #include "language.hpp" +#include #include +#include +#include +#include +#include +#include #include #include #include @@ -710,6 +715,45 @@ do_execute(Context& ctx, return status; } +static core::Manifest +read_manifest(const std::string& path) +{ + core::Manifest manifest; + File file(path, "rb"); + if (file) { + try { + core::FileReader file_reader(*file); + core::CacheEntryReader reader(file_reader); + manifest.read(reader); + reader.finalize(); + } catch (const core::Error& e) { + LOG("Error reading {}: {}", path, e.what()); + } + } + return manifest; +} + +static void +save_manifest(const Config& config, + const core::Manifest& manifest, + const std::string& path) +{ + AtomicFile atomic_manifest_file(path, AtomicFile::Mode::binary); + core::FileWriter 
file_writer(atomic_manifest_file.stream()); + core::CacheEntryHeader header(core::CacheEntryType::manifest, + compression::type_from_config(config), + compression::level_from_config(config), + time(nullptr), + CCACHE_VERSION, + config.namespace_()); + header.set_entry_size_from_payload_size(manifest.serialized_size()); + + core::CacheEntryWriter writer(file_writer, header); + manifest.write(writer); + writer.finalize(); + atomic_manifest_file.commit(); +} + // Create or update the manifest file. static void update_manifest_file(Context& ctx, @@ -724,8 +768,8 @@ update_manifest_file(Context& ctx, MTR_SCOPE("manifest", "manifest_put"); - // See comment in get_file_hash_index for why saving of timestamps is forced - // for precompiled headers. + // See comment in core::Manifest::get_file_info_index for why saving of + // timestamps is forced for precompiled headers. const bool save_timestamp = (ctx.config.sloppiness().is_enabled(core::Sloppy::file_stat_matches)) || ctx.args_info.output_is_precompiled_header; @@ -733,12 +777,20 @@ update_manifest_file(Context& ctx, ctx.storage.put( manifest_key, core::CacheEntryType::manifest, [&](const auto& path) { LOG("Adding result key to {}", path); - return Manifest::put(ctx.config, - path, - result_key, - ctx.included_files, - ctx.time_of_compilation, - save_timestamp); + try { + auto manifest = read_manifest(path); + const bool added = manifest.add_result(result_key, + ctx.included_files, + ctx.time_of_compilation, + save_timestamp); + if (added) { + save_manifest(ctx.config, manifest, path); + } + return added; + } catch (const core::Error& e) { + LOG("Failed to add result key to {}: {}", path, e.what()); + return false; + } }); } @@ -1438,7 +1490,7 @@ calculate_result_and_manifest_key(Context& ctx, if (direct_mode) { hash.hash_delimiter("manifest version"); - hash.hash(Manifest::k_version); + hash.hash(core::Manifest::k_format_version); } // clang will emit warnings for unused linker flags, so we shouldn't skip @@ -1708,7 
+1760,12 @@ calculate_result_and_manifest_key(Context& ctx, if (manifest_path) { LOG("Looking for result key in {}", *manifest_path); MTR_BEGIN("manifest", "manifest_get"); - result_key = Manifest::get(ctx, *manifest_path); + try { + const auto manifest = read_manifest(*manifest_path); + result_key = manifest.look_up_result_digest(ctx); + } catch (const core::Error& e) { + LOG("Failed to look up result key in {}: {}", *manifest_path, e.what()); + } MTR_END("manifest", "manifest_get"); if (result_key) { LOG_RAW("Got result key from manifest"); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index b3779bd37..c4f393b46 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -3,6 +3,7 @@ set( ${CMAKE_CURRENT_SOURCE_DIR}/CacheEntryHeader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CacheEntryReader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CacheEntryWriter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Manifest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Statistics.cpp ${CMAKE_CURRENT_SOURCE_DIR}/StatisticsCounters.cpp ${CMAKE_CURRENT_SOURCE_DIR}/StatsLog.cpp diff --git a/src/core/Manifest.cpp b/src/core/Manifest.cpp new file mode 100644 index 000000000..d162fe2c6 --- /dev/null +++ b/src/core/Manifest.cpp @@ -0,0 +1,446 @@ +// Copyright (C) 2009-2021 Joel Rosdahl and other contributors +// +// See doc/AUTHORS.adoc for a complete list of contributors. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3 of the License, or (at your option) +// any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 51 +// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +#include "Manifest.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Manifest data format +// ==================== +// +// Integers are big-endian. +// +// <payload> ::= <format_ver> <paths> <includes> <results> +// <format_ver> ::= uint8_t +// <paths> ::= <n_paths> <path_entry>* +// <n_paths> ::= uint32_t +// <path_entry> ::= <path_len> <path> +// <path_len> ::= uint16_t +// <path> ::= path_len bytes +// <includes> ::= <n_includes> <include_entry>* +// <n_includes> ::= uint32_t +// <include_entry> ::= <path_index> <digest> <fsize> <mtime> <ctime> +// <path_index> ::= uint32_t +// <digest> ::= Digest::size() bytes +// <fsize> ::= uint64_t ; file size +// <mtime> ::= int64_t ; modification time +// <ctime> ::= int64_t ; status change time +// <results> ::= <n_results> <result>* +// <n_results> ::= uint32_t +// <result> ::= <n_indexes> <include_index>* <key> +// <n_indexes> ::= uint32_t +// <include_index> ::= uint32_t +// <key> ::= Digest::size() bytes + +const uint32_t k_max_manifest_entries = 100; +const uint32_t k_max_manifest_file_info_entries = 10000; + +namespace std { + +template<> struct hash +{ + size_t + operator()(const core::Manifest::FileInfo& file_info) const + { + static_assert(sizeof(file_info) == 48, "unexpected size"); // No padding. 
+ util::XXH3_64 hash; + hash.update(&file_info, sizeof(file_info)); + return hash.digest(); + } +}; + +} // namespace std + +namespace core { + +const uint8_t Manifest::k_format_version = 0; + +void +Manifest::read(Reader& reader) +{ + clear(); + + const auto format_version = reader.read_int(); + if (format_version != k_format_version) { + throw core::Error( + "Unknown format version: {} != {}", format_version, k_format_version); + } + + const auto file_count = reader.read_int(); + for (uint32_t i = 0; i < file_count; ++i) { + m_files.push_back(reader.read_str(reader.read_int())); + } + + const auto file_info_count = reader.read_int(); + for (uint32_t i = 0; i < file_info_count; ++i) { + m_file_infos.emplace_back(); + auto& entry = m_file_infos.back(); + + reader.read_int(entry.index); + reader.read(entry.digest.bytes(), Digest::size()); + reader.read_int(entry.fsize); + reader.read_int(entry.mtime); + reader.read_int(entry.ctime); + } + + const auto result_count = reader.read_int(); + for (uint32_t i = 0; i < result_count; ++i) { + m_results.emplace_back(); + auto& entry = m_results.back(); + + const auto file_info_index_count = reader.read_int(); + for (uint32_t j = 0; j < file_info_index_count; ++j) { + entry.file_info_indexes.push_back(reader.read_int()); + } + reader.read(entry.key.bytes(), Digest::size()); + } +} + +nonstd::optional +Manifest::look_up_result_digest(const Context& ctx) const +{ + std::unordered_map stated_files; + std::unordered_map hashed_files; + + // Check newest result first since it's more likely to match. 
+ for (size_t i = m_results.size(); i > 0; i--) { + const auto& result = m_results[i - 1]; + if (result_matches(ctx, result, stated_files, hashed_files)) { + return result.key; + } + } + + return nonstd::nullopt; +} + +bool +Manifest::add_result(const Digest& result_key, + std::unordered_map& included_files, + const time_t time_of_compilation, + const bool save_timestamp) +{ + if (m_results.size() > k_max_manifest_entries) { + // Normally, there shouldn't be many result entries in the manifest since + // new entries are added only if an include file has changed but not the + // source file, and you typically change source files more often than header + // files. However, it's certainly possible to imagine cases where the + // manifest will grow large (for instance, a generated header file that + // changes for every build), and this must be taken care of since processing + // an ever growing manifest eventually will take too much time. A good way + // of solving this would be to maintain the result entries in LRU order and + // discarding the old ones. An easy way is to throw away all entries when + // there are too many. Let's do that for now. + LOG("More than {} entries in manifest file; discarding", + k_max_manifest_entries); + clear(); + } else if (m_file_infos.size() > k_max_manifest_file_info_entries) { + // Rarely, FileInfo entries can grow large in pathological cases where many + // included files change, but the main file does not. This also puts an + // upper bound on the number of FileInfo entries. 
+ LOG("More than {} FileInfo entries in manifest file; discarding", + k_max_manifest_file_info_entries); + clear(); + } + + std::unordered_map mf_files; + for (uint32_t i = 0; i < m_files.size(); ++i) { + mf_files.emplace(m_files[i], i); + } + + std::unordered_map mf_file_infos; + for (uint32_t i = 0; i < m_file_infos.size(); ++i) { + mf_file_infos.emplace(m_file_infos[i], i); + } + + std::vector file_info_indexes; + file_info_indexes.reserve(included_files.size()); + + for (const auto& item : included_files) { + file_info_indexes.push_back(get_file_info_index(item.first, + item.second, + mf_files, + mf_file_infos, + time_of_compilation, + save_timestamp)); + } + + ResultEntry entry{std::move(file_info_indexes), result_key}; + if (std::find(m_results.begin(), m_results.end(), entry) == m_results.end()) { + m_results.push_back(std::move(entry)); + return true; + } else { + return false; + } +} + +size_t +Manifest::serialized_size() const +{ + uint64_t size = 0; + + size += 1; // format_ver + size += 4; // n_files + for (const auto& file : m_files) { + size += 2 + file.length(); + } + size += 4; // n_file_infos + size += m_file_infos.size() * (4 + Digest::size() + 8 + 8 + 8); + size += 4; // n_results + for (const auto& result : m_results) { + size += 4; // n_file_info_indexes + size += result.file_info_indexes.size() * 4; + size += Digest::size(); + } + + return size; +} + +void +Manifest::write(Writer& writer) const +{ + writer.write_int(k_format_version); + writer.write_int(m_files.size()); + for (const auto& file : m_files) { + writer.write_int(file.length()); + writer.write_str(file); + } + + writer.write_int(m_file_infos.size()); + for (const auto& file_info : m_file_infos) { + writer.write_int(file_info.index); + writer.write(file_info.digest.bytes(), Digest::size()); + writer.write_int(file_info.fsize); + writer.write_int(file_info.mtime); + writer.write_int(file_info.ctime); + } + + writer.write_int(m_results.size()); + for (const auto& result : m_results) { 
+ writer.write_int(result.file_info_indexes.size()); + for (auto index : result.file_info_indexes) { + writer.write_int(index); + } + writer.write(result.key.bytes(), Digest::size()); + } + + writer.finalize(); +} + +bool +Manifest::FileInfo::operator==(const FileInfo& other) const +{ + return index == other.index && digest == other.digest && fsize == other.fsize + && mtime == other.mtime && ctime == other.ctime; +} + +bool +Manifest::ResultEntry::operator==(const ResultEntry& other) const +{ + return file_info_indexes == other.file_info_indexes && key == other.key; +} + +void +Manifest::clear() +{ + m_files.clear(); + m_file_infos.clear(); + m_results.clear(); +} + +uint32_t +Manifest::get_file_info_index( + const std::string& path, + const Digest& digest, + const std::unordered_map& mf_files, + const std::unordered_map& mf_file_infos, + const time_t time_of_compilation, + const bool save_timestamp) +{ + FileInfo fi; + + const auto f_it = mf_files.find(path); + if (f_it != mf_files.end()) { + fi.index = f_it->second; + } else { + m_files.push_back(path); + fi.index = m_files.size() - 1; + } + + fi.digest = digest; + + // file_stat.{m,c}time() have a resolution of 1 second, so we can cache the + // file's mtime and ctime only if they're at least one second older than + // time_of_compilation. + // + // file_stat.ctime() may be 0, so we have to check time_of_compilation against + // MAX(mtime, ctime). + // + // ccache only reads mtime/ctime if file_stat_match sloppiness is enabled, so + // mtimes/ctimes are stored as a dummy value (-1) if not enabled. This reduces + // the number of file_info entries for the common case. 
+ + const auto file_stat = Stat::stat(path, Stat::OnError::log); + if (file_stat) { + if (save_timestamp + && time_of_compilation + > std::max(file_stat.mtime(), file_stat.ctime())) { + fi.mtime = file_stat.mtime(); + fi.ctime = file_stat.ctime(); + } else { + fi.mtime = -1; + fi.ctime = -1; + } + fi.fsize = file_stat.size(); + } else { + fi.mtime = -1; + fi.ctime = -1; + fi.fsize = 0; + } + + const auto fi_it = mf_file_infos.find(fi); + if (fi_it != mf_file_infos.end()) { + return fi_it->second; + } else { + m_file_infos.push_back(fi); + return m_file_infos.size() - 1; + } +} + +bool +Manifest::result_matches( + const Context& ctx, + const ResultEntry& result, + std::unordered_map& stated_files, + std::unordered_map& hashed_files) const +{ + for (uint32_t file_info_index : result.file_info_indexes) { + const auto& fi = m_file_infos[file_info_index]; + const auto& path = m_files[fi.index]; + + auto stated_files_iter = stated_files.find(path); + if (stated_files_iter == stated_files.end()) { + auto file_stat = Stat::stat(path, Stat::OnError::log); + if (!file_stat) { + return false; + } + FileStats st; + st.size = file_stat.size(); + st.mtime = file_stat.mtime(); + st.ctime = file_stat.ctime(); + stated_files_iter = stated_files.emplace(path, st).first; + } + const FileStats& fs = stated_files_iter->second; + + if (fi.fsize != fs.size) { + return false; + } + + // Clang stores the mtime of the included files in the precompiled header, + // and will error out if that header is later used without rebuilding. 
+ if ((ctx.config.compiler_type() == CompilerType::clang + || ctx.config.compiler_type() == CompilerType::other) + && ctx.args_info.output_is_precompiled_header + && !ctx.args_info.fno_pch_timestamp && fi.mtime != fs.mtime) { + LOG("Precompiled header includes {}, which has a new mtime", path); + return false; + } + + if (ctx.config.sloppiness().is_enabled(core::Sloppy::file_stat_matches)) { + if (!(ctx.config.sloppiness().is_enabled( + core::Sloppy::file_stat_matches_ctime))) { + if (fi.mtime == fs.mtime && fi.ctime == fs.ctime) { + LOG("mtime/ctime hit for {}", path); + continue; + } else { + LOG("mtime/ctime miss for {}", path); + } + } else { + if (fi.mtime == fs.mtime) { + LOG("mtime hit for {}", path); + continue; + } else { + LOG("mtime miss for {}", path); + } + } + } + + auto hashed_files_iter = hashed_files.find(path); + if (hashed_files_iter == hashed_files.end()) { + Hash hash; + int ret = hash_source_code_file(ctx, hash, path, fs.size); + if (ret & HASH_SOURCE_CODE_ERROR) { + LOG("Failed hashing {}", path); + return false; + } + if (ret & HASH_SOURCE_CODE_FOUND_TIME) { + return false; + } + + Digest actual = hash.digest(); + hashed_files_iter = hashed_files.emplace(path, actual).first; + } + + if (fi.digest != hashed_files_iter->second) { + return false; + } + } + + return true; +} + +void +Manifest::dump(FILE* const stream) const +{ + PRINT(stream, "Manifest format version: {}\n", k_format_version); + + PRINT(stream, "File paths ({}):\n", m_files.size()); + for (size_t i = 0; i < m_files.size(); ++i) { + PRINT(stream, " {}: {}\n", i, m_files[i]); + } + + PRINT(stream, "File infos ({}):\n", m_file_infos.size()); + for (size_t i = 0; i < m_file_infos.size(); ++i) { + PRINT(stream, " {}:\n", i); + PRINT(stream, " Path index: {}\n", m_file_infos[i].index); + PRINT(stream, " Hash: {}\n", m_file_infos[i].digest.to_string()); + PRINT(stream, " File size: {}\n", m_file_infos[i].fsize); + PRINT(stream, " Mtime: {}\n", m_file_infos[i].mtime); + PRINT(stream, " 
Ctime: {}\n", m_file_infos[i].ctime); + } + + PRINT(stream, "Results ({}):\n", m_results.size()); + for (size_t i = 0; i < m_results.size(); ++i) { + PRINT(stream, " {}:\n", i); + PRINT_RAW(stream, " File info indexes:"); + for (uint32_t file_info_index : m_results[i].file_info_indexes) { + PRINT(stream, " {}", file_info_index); + } + PRINT_RAW(stream, "\n"); + PRINT(stream, " Key: {}\n", m_results[i].key.to_string()); + } +} + +} // namespace core diff --git a/src/core/Manifest.hpp b/src/core/Manifest.hpp new file mode 100644 index 000000000..bb9e26a48 --- /dev/null +++ b/src/core/Manifest.hpp @@ -0,0 +1,104 @@ +// Copyright (C) 2009-2021 Joel Rosdahl and other contributors +// +// See doc/AUTHORS.adoc for a complete list of contributors. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3 of the License, or (at your option) +// any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 51 +// Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +#pragma once + +#include + +#include + +#include +#include +#include +#include + +class Context; + +namespace core { + +class Reader; +class Writer; + +class Manifest +{ +public: + static const uint8_t k_format_version; + + Manifest() = default; + + void read(Reader& reader); + nonstd::optional look_up_result_digest(const Context& ctx) const; + + bool add_result(const Digest& result_key, + std::unordered_map& included_files, + time_t time_of_compilation, + bool save_timestamp); + size_t serialized_size() const; + void write(Writer& writer) const; + + void dump(FILE* stream) const; + +private: + struct FileStats + { + uint64_t size; + int64_t mtime; + int64_t ctime; + }; + + struct FileInfo + { + uint32_t index; // Index to m_files. + Digest digest; // Digest of referenced file. + uint64_t fsize; // Size of referenced file. + int64_t mtime; // mtime of referenced file. + int64_t ctime; // ctime of referenced file. + + bool operator==(const FileInfo& other) const; + }; + + friend std::hash; + + struct ResultEntry + { + std::vector file_info_indexes; // Indexes to m_file_infos. + Digest key; // Key of the result. + + bool operator==(const ResultEntry& other) const; + }; + + std::vector m_files; // Names of referenced include files. + std::vector m_file_infos; // Info about referenced include files. 
+ std::vector m_results; + + void clear(); + uint32_t get_file_info_index( + const std::string& path, + const Digest& digest, + const std::unordered_map& mf_files, + const std::unordered_map& mf_file_infos, + time_t time_of_compilation, + bool save_timestamp); + bool + result_matches(const Context& ctx, + const ResultEntry& result, + std::unordered_map& stated_files, + std::unordered_map& hashed_files) const; +}; + +} // namespace core diff --git a/src/core/mainoptions.cpp b/src/core/mainoptions.cpp index ba1803834..868e9c6dc 100644 --- a/src/core/mainoptions.cpp +++ b/src/core/mainoptions.cpp @@ -20,13 +20,16 @@ #include #include +#include #include #include -#include #include #include #include #include +#include +#include +#include #include #include #include @@ -385,8 +388,19 @@ process_main_options(int argc, const char* const* argv) break; } - case DUMP_MANIFEST: - return Manifest::dump(arg, stdout) ? 0 : 1; + case DUMP_MANIFEST: { + File file(arg, "rb"); + if (!file) { + throw Fatal("No such file: {}", arg); + } + core::FileReader file_reader(*file); + core::CacheEntryReader reader(file_reader); + core::Manifest manifest; + manifest.read(reader); + reader.finalize(); + manifest.dump(stdout); + return 0; + } case DUMP_RESULT: { ResultDumper result_dumper(stdout); diff --git a/src/storage/primary/CacheFile.cpp b/src/storage/primary/CacheFile.cpp index 0068fcdd7..1a65e583a 100644 --- a/src/storage/primary/CacheFile.cpp +++ b/src/storage/primary/CacheFile.cpp @@ -18,8 +18,8 @@ #include "CacheFile.hpp" -#include #include +#include #include const Stat& @@ -35,7 +35,7 @@ CacheFile::lstat() const CacheFile::Type CacheFile::type() const { - if (util::ends_with(m_path, Manifest::k_file_suffix)) { + if (util::ends_with(m_path, "M")) { return Type::manifest; } else if (util::ends_with(m_path, Result::k_file_suffix)) { return Type::result; diff --git a/src/storage/primary/PrimaryStorage_compress.cpp b/src/storage/primary/PrimaryStorage_compress.cpp index 6b40567ec..4965f224c 
100644 --- a/src/storage/primary/PrimaryStorage_compress.cpp +++ b/src/storage/primary/PrimaryStorage_compress.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include #include