git.ipfire.org Git - thirdparty/ccache.git/commitdiff
Revise disk format for results
author Joel Rosdahl <joel@rosdahl.net>
Thu, 30 May 2019 18:37:12 +0000 (20:37 +0200)
committer Joel Rosdahl <joel@rosdahl.net>
Tue, 4 Jun 2019 20:18:07 +0000 (22:18 +0200)
* Removed unused hash_size and reserved fields. Since there are no
  hashes stored in the result metadata, hash size is superfluous. The
  reserved bits field is also unnecessary; if we need to change the
  format, we can just step RESULT_VERSION and be done with it.
* Instead of storing file count in the header, store an EOF marker after
  the file entries. The main reason for this is that files then can be
  appended to the result file without having to precalculate how many
  files the result will contain.
* Don’t include trailing NUL in suffix strings since the length is known.
* Instead of potentially compressing the whole file, the format now has
  an uncompressed header telling how/if the rest of the file is
  compressed (which algorithm and level). This makes it possible to
  recompress files more efficiently in a batch job, since checking
  whether a cached file should be repacked becomes reasonably cheap. The
  reason for not having compression info in each subfile
  header (supporting different compression algorithms/levels per
  subfile) is to make the repacking scenario simpler.
* Prepared for adding support for “reference entries”, which refer to
  other results. There are two potential use cases for reference
  entries: a) deduplication and b) storing partial results with a
  different compression algorithm/level. It’s probably only the
  deduplication use case that is interesting, though. It can be done
  either at cache miss time or later as a batch job. If we really want
  to, we can in the future add similar “raw reference entries” that
  refer to files stored verbatim in the storage, thus re-enabling hard
  link functionality.
* Changed to cCrS as the magic bytes for result files. This is analogous
  to the magic bytes used for manifest files.
* Added documentation of the format.

src/ccache.c
src/manifest.c
src/result.c

index 253268363f56ec623fa8d63ed68c04f669f1f2ac..b1fe0ff49e3e164880225e7b7d00a2461eab7da3 100644 (file)
@@ -1387,7 +1387,7 @@ to_cache(struct args *args, struct hash *depend_mode_hash)
        }
        struct filelist *filelist = create_empty_filelist();
        if (st.st_size > 0) {
-               add_file_to_filelist(filelist, tmp_stderr, ".stderr");
+               add_file_to_filelist(filelist, tmp_stderr, "stderr");
        }
        add_file_to_filelist(filelist, output_obj, ".o");
        if (generating_dependencies) {
@@ -1812,6 +1812,9 @@ calculate_object_hash(struct args *args, struct hash *hash, int direct_mode)
 {
        bool found_ccbin = false;
 
+       hash_delimiter(hash, "result version");
+       hash_int(hash, RESULT_VERSION);
+
        if (direct_mode) {
                hash_delimiter(hash, "manifest version");
                hash_int(hash, MANIFEST_VERSION);
@@ -2151,7 +2154,7 @@ from_cache(enum fromcache_call_mode mode, bool put_object_in_manifest)
                        add_file_to_filelist(filelist, output_dwo, ".dwo");
                }
        }
-       add_file_to_filelist(filelist, tmp_stderr, ".stderr");
+       add_file_to_filelist(filelist, tmp_stderr, "stderr");
        if (produce_dep_file) {
                add_file_to_filelist(filelist, output_dep, ".d");
        }
index 7b272b9c704bafabf0b9ec84940bc398ee11d93a..9d614f94b3f5b258fd02bde980641f60cbcd2283 100644 (file)
@@ -22,9 +22,9 @@
 
 #include <zlib.h>
 
-// Sketchy specification of the manifest disk format:
+// Sketchy specification of the manifest data format:
 //
-// <magic>         magic number                        (4 bytes)
+// <magic>         magic number                        (4 bytes: cCmF)
 // <version>       file format version                 (1 byte unsigned int)
 // <hash_size>     size of the hash fields (in bytes)  (1 byte unsigned int)
 // <reserved>      reserved for future use             (2 bytes)
@@ -64,7 +64,7 @@
 // <hash[n-1]>
 // <size[n-1]>
 
-static const uint32_t MAGIC = 0x63436d46U;
+static const uint32_t MAGIC = 0x63436d46U; // cCmF
 static const uint32_t MAX_MANIFEST_ENTRIES = 100;
 static const uint32_t MAX_MANIFEST_FILE_INFO_ENTRIES = 10000;
 
index 31dff3f0da0c18c5a3c298e89f4c1f72b6ab28ee..bc735208f3c9577c9407a826c436882c43a7f277 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2009-2018 Joel Rosdahl
+// Copyright (C) 2019 Joel Rosdahl
 //
 // This program is free software; you can redistribute it and/or modify it
 // under the terms of the GNU General Public License as published by the Free
 
 #include <zlib.h>
 
-static const uint32_t MAGIC = 0x63436343U;
+// Result data format:
+//
+// <result>      ::= <header> <body> ; <body> is potentially compressed
+// <header>      ::= <magic> <version> <compr_type> <compr_level>
+// <body>        ::= <entry>* <eof_marker>
+// <eof_marker>  ::= 0 (uint8_t)
+// <magic>       ::= uint32_t ; "cCrS"
+// <version>     ::= uint8_t
+// <compr_type>  ::= <compr_none> | <compr_gzip>
+// <compr_none>  ::= 0
+// <compr_gzip>  ::= 1
+// <compr_level> ::= uint8_t
+// <entry>       ::= <file_entry> | <ref_entry>
+// <file_entry>  ::= <file_marker> <suffix_len> <suffix> <data_len> <data>
+// <file_marker> ::= 1 (uint8_t)
+// <suffix_len>  ::= uint8_t
+// <suffix>      ::= suffix_len bytes
+// <data_len>    ::= uint64_t
+// <data>        ::= data_len bytes
+// <ref_entry>   ::= <ref_marker> <key_len> <key>
+// <ref_marker>  ::= 2 (uint8_t)
+// <key_len>     ::= uint8_t
+// <key>         ::= key_len bytes
+//
+// Sketch of concrete layout:
+//
+// <magic>         4 bytes
+// <version>       1 byte
+// <compr_type>    1 byte
+// <compr_level>   1 byte
+// --- [potentially compressed from here ] -----------------------------------
+// <file_marker>   1 byte
+// <suffix_len>    1 byte
+// <suffix>        suffix_len bytes
+// <data_len>      8 bytes
+// <data>          data_len bytes
+// ...
+// <ref_marker>    1 byte
+// <key_len>       1 byte
+// <key>           key_len bytes
+// ...
+// <eof_marker>    1 byte
+
+static const char MAGIC[4] = "cCrS";
+
+enum {
+       EOF_MARKER = 0,
+       FILE_MARKER = 1,
+       REF_MARKER = 2
+};
+
+enum {
+       COMPR_TYPE_NONE = 0,
+       COMPR_TYPE_GZIP = 1
+};
 
 struct file {
-       uint32_t suffix_len;
        char *suffix;
        uint32_t path_len;
        char *path;
@@ -54,7 +107,6 @@ add_file_to_filelist(struct filelist *l, const char *path, const char *suffix)
        struct file *f = &l->files[l->n_files];
        l->n_files++;
 
-       f->suffix_len = strlen(suffix);
        f->suffix = x_strdup(suffix);
        f->path_len = strlen(path);
        f->path = x_strdup(path);
@@ -100,24 +152,11 @@ free_filelist(struct filelist *l)
                (var) = u_; \
        } while (false)
 
-#define READ_STR(var) \
+#define READ_BYTES(length, buf) \
        do { \
-               char buf_[1024]; \
-               size_t i_; \
-               for (i_ = 0; i_ < sizeof(buf_); i_++) { \
-                       int ch_ = gzgetc(f); \
-                       if (ch_ == EOF) { \
-                               goto error; \
-                       } \
-                       buf_[i_] = ch_; \
-                       if (ch_ == '\0') { \
-                               break; \
-                       } \
-               } \
-               if (i_ == sizeof(buf_)) { \
+               if (gzread(f, buf, length) != length) { \
                        goto error; \
                } \
-               (var) = x_strdup(buf_); \
        } while (false)
 
 #define READ_FILE(size, path) \
@@ -135,71 +174,122 @@ free_filelist(struct filelist *l)
                fclose(f_); \
        } while (false)
 
-
-static struct filelist *
-read_cache(gzFile f, struct filelist *l, bool copy)
+static bool
+read_cache(const char *path, struct filelist *l, FILE *dump_stream)
 {
-       uint32_t magic;
-       READ_INT(4, magic);
-       if (magic != MAGIC) {
-               cc_log("Cache file has bad magic number %u", magic);
-               goto error;
+       int fd = open(path, O_RDONLY | O_BINARY);
+       if (fd == -1) {
+               // Cache miss.
+               cc_log("No such cache file");
+               return false;
+       }
+
+       char header[7];
+       if (read(fd, header, sizeof(header)) != (ssize_t)sizeof(header)) {
+               close(fd);
+               cc_log("Failed to read result file header");
+               return false;
+       }
+
+       if (memcmp(header, MAGIC, sizeof(MAGIC)) != 0) {
+               cc_log("Cache file has bad magic value 0x%x%x%x%x",
+                      header[0], header[1], header[2], header[3]);
+               // TODO: Return error message like read_manifest does.
+               return false;
        }
 
-       uint8_t version;
-       READ_BYTE(version);
-       (void)version;
+       // TODO: Verify version like read_manifest does.
+       const uint8_t version = header[4];
+       const uint8_t compr_type = header[5];
+       switch (compr_type) {
+       case COMPR_TYPE_NONE:
+       case COMPR_TYPE_GZIP:
+               break;
+
+       default:
+               cc_log("Unknown compression type: %u", compr_type);
+               return false;
+       }
 
-       uint8_t hash_size;
-       READ_INT(1, hash_size);
-       (void)hash_size;
+       if (dump_stream) {
+               const uint8_t compr_level = header[6];
+               fprintf(dump_stream, "Magic: %c%c%c%c\n",
+                       MAGIC[0], MAGIC[1], MAGIC[2], MAGIC[3]);
+               fprintf(dump_stream, "Version: %u\n", version);
+               fprintf(dump_stream, "Compression type: %s\n",
+                       compr_type == COMPR_TYPE_NONE ? "none" : "gzip");
+               fprintf(dump_stream, "Compression level: %u\n", compr_level);
+       }
 
-       uint16_t reserved;
-       READ_INT(2, reserved);
-       (void)reserved;
+       gzFile f = gzdopen(fd, "rb");
+       if (!f) {
+               close(fd);
+               cc_log("Failed to gzdopen result file");
+               return false;
+       }
 
-       uint32_t n_files;
-       READ_INT(4, n_files);
+       uint8_t marker;
+       for (uint32_t i = 0; ; i++) {
+               READ_BYTE(marker);
+               switch (marker) {
+               case EOF_MARKER:
+                       gzclose(f);
+                       return true;
+
+               case FILE_MARKER:
+                       break;
+
+               case REF_MARKER:
+                       // TODO: Implement.
+                       continue;
+
+               default:
+                       cc_log("Unknown entry type: %u", marker);
+                       goto error;
+               }
 
-       for (uint32_t i = 0; i < n_files; i++) {
-               uint32_t sufflen;
-               READ_INT(4, sufflen);
-               char *suffix;
-               READ_STR(suffix);
+               uint8_t suffix_len;
+               READ_BYTE(suffix_len);
 
-               uint32_t filelen;
-               READ_INT(4, filelen);
+               char suffix[256 + 1];
+               READ_BYTES(suffix_len, suffix);
+               suffix[suffix_len] = '\0';
 
-               cc_log("Reading file #%d: %s (%u)", i, suffix, filelen);
+               uint64_t filelen;
+               READ_INT(8, filelen);
+
+               cc_log("Reading entry #%u: %s (%lu)",
+                      i,
+                      str_eq(suffix, "stderr") ? "<stderr>" : suffix,
+                      (unsigned long)filelen);
 
                bool found = false;
-               if (copy) {
+               if (dump_stream) {
+                       fprintf(dump_stream,
+                               "Entry: %s (size: %" PRIu64 " bytes)\n",
+                               str_eq(suffix, "stderr") ? "<stderr>" : suffix,
+                               filelen);
+               } else {
                        for (uint32_t j = 0; j < l->n_files; j++) {
-                               if (sufflen == l->files[j].suffix_len &&
-                               str_eq(suffix, l->files[j].suffix)) {
+                               if (str_eq(suffix, l->files[j].suffix)) {
                                        found = true;
 
-                                       cc_log("Copying %s from cache", l->files[i].path);
+                                       cc_log("Copying file to %s", l->files[i].path);
 
                                        READ_FILE(filelen, l->files[j].path);
                                }
                        }
-               } else {
-                       add_file_to_filelist(l, "", suffix);
-                       l->sizes[l->n_files-1] = filelen;
                }
                if (!found) {
                        // Skip the data, if no match
                        gzseek(f, filelen, SEEK_CUR);
                }
-
-               free(suffix);
        }
-       return l;
 
 error:
+       gzclose(f);
        cc_log("Corrupt cache file");
-       return NULL;
+       return false;
 }
 
 #define WRITE_BYTE(var) \
@@ -222,9 +312,9 @@ error:
                } \
        } while (false)
 
-#define WRITE_STR(var) \
+#define WRITE_BYTES(length, buf) \
        do { \
-               if (gzputs(f, var) == EOF || gzputc(f, '\0') == EOF) { \
+               if (gzwrite(f, buf, length) != (long)length) { \
                        goto error; \
                } \
        } while (false)
@@ -247,31 +337,25 @@ error:
 static int
 write_cache(gzFile f, const struct filelist *l)
 {
-       WRITE_INT(4, MAGIC);
-
-       WRITE_BYTE(RESULT_VERSION);
-       WRITE_INT(1, 16);
-       WRITE_INT(2, 0);
-
-       WRITE_INT(4, l->n_files);
        for (uint32_t i = 0; i < l->n_files; i++) {
                struct stat st;
                if (x_stat(l->files[i].path, &st) != 0) {
                        return -1;
                }
 
-               cc_log("Writing file #%d: %s (%ld)", i, l->files[i].suffix,
-                      (long)st.st_size);
-
-               WRITE_INT(4, l->files[i].suffix_len);
-               WRITE_STR(l->files[i].suffix);
+               cc_log("Writing file #%u: %s (%lu)", i, l->files[i].suffix,
+                      (unsigned long)st.st_size);
 
-               cc_log("Copying %s to cache", l->files[i].path);
-
-               WRITE_INT(4, st.st_size);
+               WRITE_BYTE(FILE_MARKER);
+               size_t suffix_len = strlen(l->files[i].suffix);
+               WRITE_BYTE(suffix_len);
+               WRITE_BYTES(suffix_len, l->files[i].suffix);
+               WRITE_INT(8, st.st_size);
                WRITE_FILE(st.st_size, l->files[i].path);
        }
 
+       WRITE_BYTE(EOF_MARKER);
+
        return 1;
 
 error:
@@ -279,34 +363,10 @@ error:
        return 0;
 }
 
-bool cache_get(const char *cache_path, struct filelist *l)
+bool cache_get(const char *path, struct filelist *l)
 {
-       int ret = 0;
-       gzFile f = NULL;
-
-       int fd = open(cache_path, O_RDONLY | O_BINARY);
-       if (fd == -1) {
-               // Cache miss.
-               cc_log("No such cache file");
-               goto out;
-       }
-       f = gzdopen(fd, "rb");
-       if (!f) {
-               close(fd);
-               cc_log("Failed to gzdopen cache file");
-               goto out;
-       }
-       l = read_cache(f, l, true);
-       if (!l) {
-               cc_log("Error reading cache file");
-               goto out;
-       }
-       ret = 1;
-out:
-       if (f) {
-               gzclose(f);
-       }
-       return ret;
+       cc_log("Getting result %s from cache", path);
+       return read_cache(path, l, NULL);
 }
 
 bool cache_put(const char *cache_path, struct filelist *l, int compression_level)
@@ -314,21 +374,32 @@ bool cache_put(const char *cache_path, struct filelist *l, int compression_level
        int ret = 0;
        gzFile f2 = NULL;
        char *tmp_file = NULL;
-       char *mode;
 
        tmp_file = format("%s.tmp", cache_path);
        int fd = create_tmp_fd(&tmp_file);
+
+       char header[7];
+       memcpy(header, MAGIC, sizeof(MAGIC));
+       header[4] = RESULT_VERSION;
+       header[5] = compression_level == 0 ? COMPR_TYPE_NONE : COMPR_TYPE_GZIP;
+       header[6] = compression_level;
+       if (write(fd, header, sizeof(header)) != (ssize_t)sizeof(header)) {
+               cc_log("Failed to write to %s", tmp_file);
+               close(fd);
+       }
+
+       char *mode;
        if (compression_level > 0) {
                mode = format("wb%d", compression_level);
        } else {
                mode = x_strdup("wbT");
        }
        f2 = gzdopen(fd, mode);
+       free(mode);
        if (!f2) {
                cc_log("Failed to gzdopen %s", tmp_file);
                goto out;
        }
-       free(mode);
 
        if (write_cache(f2, l)) {
                gzclose(f2);
@@ -356,46 +427,5 @@ out:
 bool
 cache_dump(const char *cache_path, FILE *stream)
 {
-       struct filelist *l = create_empty_filelist();
-       gzFile f = NULL;
-       bool ret = false;
-
-       int fd = open(cache_path, O_RDONLY | O_BINARY);
-       if (fd == -1) {
-               fprintf(stderr, "No such cache file: %s\n", cache_path);
-               goto out;
-       }
-       f = gzdopen(fd, "rb");
-       if (!f) {
-               fprintf(stderr, "Failed to gzdopen cache file\n");
-               close(fd);
-               goto out;
-       }
-       l = read_cache(f, l, false);
-       if (!l) {
-               fprintf(stderr, "Error reading cache file\n");
-               goto out;
-       }
-
-       fprintf(stream, "Magic: %c%c%c%c\n",
-               (MAGIC >> 24) & 0xFF,
-               (MAGIC >> 16) & 0xFF,
-               (MAGIC >> 8) & 0xFF,
-               MAGIC & 0xFF);
-       fprintf(stream, "File paths (%u):\n", (unsigned)l->n_files);
-       for (unsigned i = 0; i < l->n_files; ++i) {
-               fprintf(stream, "  %u: %s (%s)\n", i, l->files[i].suffix,
-                               format_human_readable_size(l->sizes[i]));
-       }
-
-       ret = true;
-
-out:
-       if (l) {
-               free_filelist(l);
-       }
-       if (f) {
-               gzclose(f);
-       }
-       return ret;
+       return read_cache(cache_path, NULL, stream);
 }