git.ipfire.org Git - thirdparty/systemd.git/commitdiff
journal: use a different hash function for each journal file
author: Lennart Poettering <lennart@poettering.net>
Fri, 29 May 2020 22:00:50 +0000 (00:00 +0200)
committer: Lennart Poettering <lennart@poettering.net>
Thu, 25 Jun 2020 13:01:45 +0000 (15:01 +0200)
This adds a new (incompatible) feature to journal files: if enabled the
hash function used for the hash tables is no longer jenkins hash with a
zero key, but siphash keyed by the file uuid that is included in the
file header anyway. This should make our hash tables more robust against
collision attacks, as long as the attacker has no read access to the
journal files. We switch from jenkins to siphash simply because it's
better known and we are standardizing the rest of our codebase on it.

This is hardening in order to make collision attacks harder for clients
that can forge log messages but have no read access to the logs. It has
no effect on clients that have read access.

src/journal/journal-def.h
src/journal/journal-file.c
src/journal/journal-file.h
src/journal/journal-internal.h
src/journal/journal-verify.c
src/journal/sd-journal.c
src/journal/test-journal-stream.c

index 54260c97b02406a598ce696fde86d7d12c78c9fd..431f46bb593bbf2834984de27917d228610f19d8 100644 (file)
@@ -147,18 +147,22 @@ enum {
 enum {
         HEADER_INCOMPATIBLE_COMPRESSED_XZ   = 1 << 0,
         HEADER_INCOMPATIBLE_COMPRESSED_LZ4  = 1 << 1,
+        HEADER_INCOMPATIBLE_KEYED_HASH      = 1 << 2,
 };
 
-#define HEADER_INCOMPATIBLE_ANY (HEADER_INCOMPATIBLE_COMPRESSED_XZ|HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
+#define HEADER_INCOMPATIBLE_ANY              \
+        (HEADER_INCOMPATIBLE_COMPRESSED_XZ|  \
+         HEADER_INCOMPATIBLE_COMPRESSED_LZ4| \
+         HEADER_INCOMPATIBLE_KEYED_HASH)
 
 #if HAVE_XZ && HAVE_LZ4
 #  define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_ANY
 #elif HAVE_XZ
-#  define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_COMPRESSED_XZ
+#  define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_XZ|HEADER_INCOMPATIBLE_KEYED_HASH)
 #elif HAVE_LZ4
-#  define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_COMPRESSED_LZ4
+#  define HEADER_INCOMPATIBLE_SUPPORTED (HEADER_INCOMPATIBLE_COMPRESSED_LZ4|HEADER_INCOMPATIBLE_KEYED_HASH)
 #else
-#  define HEADER_INCOMPATIBLE_SUPPORTED 0
+#  define HEADER_INCOMPATIBLE_SUPPORTED HEADER_INCOMPATIBLE_KEYED_HASH
 #endif
 
 enum {
index c77a9436e6b7abb15ce34dd7e0479cdc6797585c..8ae966a6b2d9fac84af904a0db9522267b101387 100644 (file)
@@ -16,6 +16,7 @@
 #include "btrfs-util.h"
 #include "chattr-util.h"
 #include "compress.h"
+#include "env-util.h"
 #include "fd-util.h"
 #include "format-util.h"
 #include "fs-util.h"
@@ -419,7 +420,8 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 
         h.incompatible_flags |= htole32(
                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
-                f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
+                f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |
+                f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);
 
         h.compatible_flags = htole32(
                 f->seal * HEADER_COMPATIBLE_SEALED);
@@ -486,16 +488,21 @@ static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
                                   f->path, type, flags & ~any);
                 flags = (flags & any) & ~supported;
                 if (flags) {
-                        const char* strv[3];
+                        const char* strv[4];
                         unsigned n = 0;
                         _cleanup_free_ char *t = NULL;
 
-                        if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
-                                strv[n++] = "sealed";
-                        if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
-                                strv[n++] = "xz-compressed";
-                        if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
-                                strv[n++] = "lz4-compressed";
+                        if (compatible) {
+                                if (flags & HEADER_COMPATIBLE_SEALED)
+                                        strv[n++] = "sealed";
+                        } else {
+                                if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
+                                        strv[n++] = "xz-compressed";
+                                if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
+                                        strv[n++] = "lz4-compressed";
+                                if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
+                                        strv[n++] = "keyed-hash";
+                        }
                         strv[n] = NULL;
                         assert(n < ELEMENTSOF(strv));
 
@@ -595,6 +602,8 @@ static int journal_file_verify_header(JournalFile *f) {
 
         f->seal = JOURNAL_HEADER_SEALED(f->header);
 
+        f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);
+
         return 0;
 }
 
@@ -1334,21 +1343,35 @@ int journal_file_find_field_object_with_hash(
         return 0;
 }
 
+uint64_t journal_file_hash_data(
+                JournalFile *f,
+                const void *data,
+                size_t sz) {
+
+        assert(f);
+        assert(data || sz == 0);
+
+        /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
+         * function use siphash. Old journal files use the Jenkins hash. */
+
+        if (JOURNAL_HEADER_KEYED_HASH(f->header))
+                return siphash24(data, sz, f->header->file_id.bytes);
+
+        return jenkins_hash64(data, sz);
+}
+
 int journal_file_find_field_object(
                 JournalFile *f,
                 const void *field, uint64_t size,
                 Object **ret, uint64_t *ret_offset) {
 
-        uint64_t hash;
-
         assert(f);
         assert(field && size > 0);
 
-        hash = jenkins_hash64(field, size);
-
         return journal_file_find_field_object_with_hash(
                         f,
-                        field, size, hash,
+                        field, size,
+                        journal_file_hash_data(f, field, size),
                         ret, ret_offset);
 }
 
@@ -1446,16 +1469,13 @@ int journal_file_find_data_object(
                 const void *data, uint64_t size,
                 Object **ret, uint64_t *ret_offset) {
 
-        uint64_t hash;
-
         assert(f);
         assert(data || size == 0);
 
-        hash = jenkins_hash64(data, size);
-
         return journal_file_find_data_object_with_hash(
                         f,
-                        data, size, hash,
+                        data, size,
+                        journal_file_hash_data(f, data, size),
                         ret, ret_offset);
 }
 
@@ -1472,7 +1492,7 @@ static int journal_file_append_field(
         assert(f);
         assert(field && size > 0);
 
-        hash = jenkins_hash64(field, size);
+        hash = journal_file_hash_data(f, field, size);
 
         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
         if (r < 0)
@@ -1535,7 +1555,7 @@ static int journal_file_append_data(
         assert(f);
         assert(data || size == 0);
 
-        hash = jenkins_hash64(data, size);
+        hash = journal_file_hash_data(f, data, size);
 
         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
         if (r < 0)
@@ -2028,7 +2048,20 @@ int journal_file_append_entry(
                 if (r < 0)
                         return r;
 
-                xor_hash ^= le64toh(o->data.hash);
+                /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
+                 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
+                 * specific record, and give records with otherwise identical position (i.e. match in seqno,
+                 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
+                 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
+                 * hash here for that. This also has the benefit that cursors for old and new journal files
+                 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
+                 * files things are easier, we can just take the value from the stored record directly. */
+
+                if (JOURNAL_HEADER_KEYED_HASH(f->header))
+                        xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
+                else
+                        xor_hash ^= le64toh(o->data.hash);
+
                 items[i].object_offset = htole64(p);
                 items[i].hash = o->data.hash;
         }
@@ -3149,7 +3182,7 @@ void journal_file_print_header(JournalFile *f) {
                "Sequential number ID: %s\n"
                "State: %s\n"
                "Compatible flags:%s%s\n"
-               "Incompatible flags:%s%s%s\n"
+               "Incompatible flags:%s%s%s%s\n"
                "Header size: %"PRIu64"\n"
                "Arena size: %"PRIu64"\n"
                "Data hash table size: %"PRIu64"\n"
@@ -3174,6 +3207,7 @@ void journal_file_print_header(JournalFile *f) {
                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
+               JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
                le64toh(f->header->header_size),
                le64toh(f->header->arena_size),
@@ -3299,19 +3333,31 @@ int journal_file_open(
 #endif
         };
 
+        /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
+         * people really want that */
+        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring.");
+                f->keyed_hash = true;
+        } else
+                f->keyed_hash = r;
+
         if (DEBUG_LOGGING) {
-                static int last_seal = -1, last_compress = -1;
+                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                 static uint64_t last_bytes = UINT64_MAX;
                 char bytes[FORMAT_BYTES_MAX];
 
                 if (last_seal != f->seal ||
+                    last_keyed_hash != f->keyed_hash ||
                     last_compress != JOURNAL_FILE_COMPRESS(f) ||
                     last_bytes != f->compress_threshold_bytes) {
 
-                        log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
-                                  yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
+                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
+                                  yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
                                   format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
                         last_seal = f->seal;
+                        last_keyed_hash = f->keyed_hash;
                         last_compress = JOURNAL_FILE_COMPRESS(f);
                         last_bytes = f->compress_threshold_bytes;
                 }
@@ -3769,7 +3815,11 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6
                 if (r < 0)
                         return r;
 
-                xor_hash ^= le64toh(u->data.hash);
+                if (JOURNAL_HEADER_KEYED_HASH(to->header))
+                        xor_hash ^= jenkins_hash64(data, l);
+                else
+                        xor_hash ^= le64toh(u->data.hash);
+
                 items[i].object_offset = htole64(h);
                 items[i].hash = u->data.hash;
 
index 121e9153a6ba062407ed228c52c3e0fce0049acb..732c2f31cd263dc28dae7406668843bf03175244 100644 (file)
@@ -71,6 +71,7 @@ typedef struct JournalFile {
         bool defrag_on_close:1;
         bool close_fd:1;
         bool archive:1;
+        bool keyed_hash:1;
 
         direction_t last_direction;
         LocationType location_type;
@@ -195,6 +196,9 @@ static inline bool VALID_EPOCH(uint64_t u) {
 #define JOURNAL_HEADER_COMPRESSED_LZ4(h) \
         FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
 
+#define JOURNAL_HEADER_KEYED_HASH(h) \
+        FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_KEYED_HASH)
+
 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);
 
 uint64_t journal_file_entry_n_items(Object *o) _pure_;
@@ -262,3 +266,5 @@ static inline bool JOURNAL_FILE_COMPRESS(JournalFile *f) {
         assert(f);
         return f->compress_xz || f->compress_lz4;
 }
+
+uint64_t journal_file_hash_data(JournalFile *f, const void *data, size_t sz);
index 028f0d9055d470480e075c7087506302bb2f16c0..a649acf634e74d4409da33af209cce716f5a7eaa 100644 (file)
@@ -32,7 +32,7 @@ struct Match {
         /* For concrete matches */
         char *data;
         size_t size;
-        uint64_t hash;
+        uint64_t hash; /* old-style jenkins hash. New-style siphash is different per file, hence won't be cached here */
 
         /* For terms */
         LIST_HEAD(Match, matches);
index c70ab7aa24459ea2550feb79edf9ea29122401cf..fe9997bc14518c430dc2e691a8eb160da2d3ea50 100644 (file)
@@ -163,9 +163,9 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
                                 return r;
                         }
 
-                        h2 = jenkins_hash64(b, b_size);
+                        h2 = journal_file_hash_data(f, b, b_size);
                 } else
-                        h2 = jenkins_hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
+                        h2 = journal_file_hash_data(f, o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
 
                 if (h1 != h2) {
                         error(offset, "Invalid hash (%08"PRIx64" vs. %08"PRIx64, h1, h2);
index 5ddca5f93a8b8e370bfe52c96c3213e2626a5ecb..515bb82621f02d9585c737978bb68e0b9cec31de 100644 (file)
@@ -279,6 +279,8 @@ _public_ int sd_journal_add_match(sd_journal *j, const void *data, size_t size)
         assert(j->level1->type == MATCH_OR_TERM);
         assert(j->level2->type == MATCH_AND_TERM);
 
+        /* Old-style Jenkins (unkeyed) hashing only here. We do not cover new-style siphash (keyed) hashing
+         * here, since it's different for each file, and thus can't be pre-calculated in the Match object. */
         hash = jenkins_hash64(data, size);
 
         LIST_FOREACH(matches, l3, j->level2->matches) {
@@ -501,9 +503,16 @@ static int next_for_match(
         assert(f);
 
         if (m->type == MATCH_DISCRETE) {
-                uint64_t dp;
+                uint64_t dp, hash;
 
-                r = journal_file_find_data_object_with_hash(f, m->data, m->size, m->hash, NULL, &dp);
+                /* If the keyed hash logic is used, we need to calculate the hash fresh per file. Otherwise
+                 * we can use what we pre-calculated. */
+                if (JOURNAL_HEADER_KEYED_HASH(f->header))
+                        hash = journal_file_hash_data(f, m->data, m->size);
+                else
+                        hash = m->hash;
+
+                r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, NULL, &dp);
                 if (r <= 0)
                         return r;
 
@@ -590,9 +599,14 @@ static int find_location_for_match(
         assert(f);
 
         if (m->type == MATCH_DISCRETE) {
-                uint64_t dp;
+                uint64_t dp, hash;
+
+                if (JOURNAL_HEADER_KEYED_HASH(f->header))
+                        hash = journal_file_hash_data(f, m->data, m->size);
+                else
+                        hash = m->hash;
 
-                r = journal_file_find_data_object_with_hash(f, m->data, m->size, m->hash, NULL, &dp);
+                r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, NULL, &dp);
                 if (r <= 0)
                         return r;
 
index 6d97bc5ce8aa6f656c434efdc903d8a4f7bf14ff..50aab11c6a8b61182f4a3914477e7b7909aa79bb 100644 (file)
@@ -58,7 +58,7 @@ static void verify_contents(sd_journal *j, unsigned skip) {
                 assert_se(i == N_ENTRIES);
 }
 
-int main(int argc, char *argv[]) {
+static void run_test(void) {
         JournalFile *one, *two, *three;
         char t[] = "/var/tmp/journal-stream-XXXXXX";
         unsigned i;
@@ -68,12 +68,6 @@ int main(int argc, char *argv[]) {
         size_t l;
         dual_timestamp previous_ts = DUAL_TIMESTAMP_NULL;
 
-        /* journal_file_open requires a valid machine id */
-        if (access("/etc/machine-id", F_OK) != 0)
-                return log_tests_skipped("/etc/machine-id not found");
-
-        test_setup_logging(LOG_DEBUG);
-
         assert_se(mkdtemp(t));
         assert_se(chdir(t) >= 0);
         (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
@@ -177,6 +171,22 @@ int main(int argc, char *argv[]) {
                 printf("%.*s\n", (int) l, (const char*) data);
 
         assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+}
+
+int main(int argc, char *argv[]) {
+
+        /* journal_file_open requires a valid machine id */
+        if (access("/etc/machine-id", F_OK) != 0)
+                return log_tests_skipped("/etc/machine-id not found");
+
+        test_setup_logging(LOG_DEBUG);
+
+        /* Run this test twice. Once with old hashing and once with new hashing */
+        assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0);
+        run_test();
+
+        assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0);
+        run_test();
 
         return 0;
 }