]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
journal: Use 32-bit entry array offsets in compact mode
authorDaan De Meyer <daan.j.demeyer@gmail.com>
Sat, 23 Oct 2021 21:36:47 +0000 (22:36 +0100)
committerDaan De Meyer <daan.j.demeyer@gmail.com>
Fri, 7 Oct 2022 10:20:08 +0000 (12:20 +0200)
Before:

OBJECT TYPE      ENTRIES SIZE
Unused           0       0B
Data             3610336 595.7M
Field            5310    285.2K
Entry            3498326 1.2G
Data Hash Table  29  103.1M
Field Hash Table 29      151.3K
Entry Array      605991  1011.6M
Tag              0  0B
Total            7720021 2.9G

After:

OBJECT TYPE      ENTRIES SIZE
Unused           0  0B
Data             3562667 591.0M
Field            3971    213.6K
Entry            3498566 1.2G
Data Hash Table  20  71.1M
Field Hash Table 20  104.3K
Entry Array  582647  505.0M
Tag              0  0B
Total            7647891 2.4G

docs/JOURNAL_FILE_FORMAT.md
src/journal/managed-journal-file.c
src/libsystemd/sd-journal/journal-def.h
src/libsystemd/sd-journal/journal-file.c
src/libsystemd/sd-journal/journal-file.h
src/libsystemd/sd-journal/journal-verify.c

index d40688c440b75e70b13fcbf2d57e7ffdc73ea210..c4484693af52fe6d843399d1e142c34bfeaa6b99 100644 (file)
@@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054
 
 ## Basics
 
-* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format.
+* All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format.
 * Offsets are always relative to the beginning of the file.
 * The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one uses as lower 32bit part.
 * All structures are aligned to 64bit boundaries and padded to multiples of 64bit
@@ -552,7 +552,10 @@ creativity rather than runtime parameters.
 _packed_ struct EntryArrayObject {
         ObjectHeader object;
         le64_t next_entry_array_offset;
-        le64_t items[];
+        union {
+                le64_t regular[];
+                le32_t compact[];
+        } items;
 };
 ```
 
@@ -560,6 +563,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry
 arrays are strictly sorted by offsets on disk, and hence by their timestamps
 and sequence numbers (with some restrictions, see above).
 
+If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, offsets are stored as 32-bit
+integers instead of 64bit.
+
 Entry Arrays are chained up. If one entry array is full another one is
 allocated and the **next_entry_array_offset** field of the old one pointed to
 it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the
index c22aac32715e38036ed7d9d0e84ada026e8a406e..c8522126f3732556bb6a6c2ec1b10eb74e3da42b 100644 (file)
@@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
                 if (r < 0)
                         return r;
 
-                n_items += journal_file_entry_array_n_items(&o);
+                n_items += journal_file_entry_array_n_items(f, &o);
                 p = q;
         }
 
@@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
                 return 0;
 
         offset = p + offsetof(Object, entry_array.items) +
-                (journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t);
+                (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
         sz = p + le64toh(o.object.size) - offset;
 
         if (sz < MINIMUM_HOLE_SIZE)
index e0919a7faa7014edfc27a79de08e262c2847df5f..c35e438518a7f1fea0efb75642dca0d1ab5ba555 100644 (file)
@@ -117,7 +117,10 @@ struct HashTableObject {
 struct EntryArrayObject {
         ObjectHeader object;
         le64_t next_entry_array_offset;
-        le64_t items[];
+        union {
+                le64_t regular[0];
+                le32_t compact[0];
+        } items;
 } _packed_;
 
 #define TAG_LENGTH (256/8)
index edec27610f64c78c9e5b32057fb6cc12e94fe195..d9aa9a3806ad39cadb490ac3800cc1c7844908bf 100644 (file)
@@ -716,7 +716,7 @@ static int check_object_header(Object *o, ObjectType type, uint64_t offset) {
 
 /* Lightweight object checks. We want this to be fast, so that we won't
  * slowdown every journal_file_move_to_object() call too much. */
-static int check_object(Object *o, uint64_t offset) {
+static int check_object(JournalFile *f, Object *o, uint64_t offset) {
         assert(o);
 
         switch (o->object.type) {
@@ -827,8 +827,8 @@ static int check_object(Object *o, uint64_t offset) {
 
                 sz = le64toh(READ_NOW(o->object.size));
                 if (sz < offsetof(Object, entry_array.items) ||
-                    (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
-                    (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
+                    (sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
+                    (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0)
                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                                "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
                                                sz,
@@ -895,7 +895,7 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
         if (r < 0)
                 return r;
 
-        r = check_object(o, offset);
+        r = check_object(f, o, offset);
         if (r < 0)
                 return r;
 
@@ -944,7 +944,7 @@ int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t of
                                        "Short read while reading object: %" PRIu64,
                                        offset);
 
-        r = check_object(&o, offset);
+        r = check_object(f, &o, offset);
         if (r < 0)
                 return r;
 
@@ -1672,7 +1672,7 @@ uint64_t journal_file_entry_n_items(Object *o) {
         return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
 }
 
-uint64_t journal_file_entry_array_n_items(Object *o) {
+uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) {
         uint64_t sz;
 
         assert(o);
@@ -1684,7 +1684,7 @@ uint64_t journal_file_entry_array_n_items(Object *o) {
         if (sz < offsetof(Object, entry_array.items))
                 return 0;
 
-        return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
+        return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f);
 }
 
 uint64_t journal_file_hash_table_n_items(Object *o) {
@@ -1702,6 +1702,17 @@ uint64_t journal_file_hash_table_n_items(Object *o) {
         return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
 }
 
+static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) {
+        assert(f);
+        assert(o);
+
+        if (JOURNAL_HEADER_COMPACT(f->header)) {
+                assert(p <= UINT32_MAX);
+                o->entry_array.items.compact[i] = htole32(p);
+        } else
+                o->entry_array.items.regular[i] = htole64(p);
+}
+
 static int link_entry_into_array(JournalFile *f,
                                  le64_t *first,
                                  le64_t *idx,
@@ -1724,9 +1735,9 @@ static int link_entry_into_array(JournalFile *f,
                 if (r < 0)
                         return r;
 
-                n = journal_file_entry_array_n_items(o);
+                n = journal_file_entry_array_n_items(f, o);
                 if (i < n) {
-                        o->entry_array.items[i] = htole64(p);
+                        write_entry_array_item(f, o, i, p);
                         *idx = htole64(hidx + 1);
                         return 0;
                 }
@@ -1745,7 +1756,7 @@ static int link_entry_into_array(JournalFile *f,
                 n = 4;
 
         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
-                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
+                                       offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f),
                                        &o, &q);
         if (r < 0)
                 return r;
@@ -1756,7 +1767,7 @@ static int link_entry_into_array(JournalFile *f,
                 return r;
 #endif
 
-        o->entry_array.items[i] = htole64(p);
+        write_entry_array_item(f, o, i, p);
 
         if (ap == 0)
                 *first = htole64(q);
@@ -2277,7 +2288,7 @@ static int generic_array_get(
                 if (r < 0)
                         return r;
 
-                k = journal_file_entry_array_n_items(o);
+                k = journal_file_entry_array_n_items(f, o);
                 if (i < k)
                         break;
 
@@ -2297,7 +2308,7 @@ static int generic_array_get(
                         if (r < 0)
                                 return r;
 
-                        k = journal_file_entry_array_n_items(o);
+                        k = journal_file_entry_array_n_items(f, o);
                         if (k == 0)
                                 break;
 
@@ -2305,12 +2316,12 @@ static int generic_array_get(
                 }
 
                 do {
-                        p = le64toh(o->entry_array.items[i]);
+                        p = journal_file_entry_array_item(f, o, i);
 
                         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                         if (r >= 0) {
                                 /* Let's cache this item for the next invocation */
-                                chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
+                                chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i);
 
                                 if (ret_offset)
                                         *ret_offset = p;
@@ -2438,13 +2449,13 @@ static int generic_array_bisect(
                 if (r < 0)
                         return r;
 
-                k = journal_file_entry_array_n_items(array);
+                k = journal_file_entry_array_n_items(f, array);
                 right = MIN(k, n);
                 if (right <= 0)
                         return 0;
 
                 i = right - 1;
-                lp = p = le64toh(array->entry_array.items[i]);
+                lp = p = journal_file_entry_array_item(f, array, i);
                 if (p <= 0)
                         r = -EBADMSG;
                 else
@@ -2477,7 +2488,7 @@ static int generic_array_bisect(
                                 if (last_index > 0) {
                                         uint64_t x = last_index - 1;
 
-                                        p = le64toh(array->entry_array.items[x]);
+                                        p = journal_file_entry_array_item(f, array, x);
                                         if (p <= 0)
                                                 return -EBADMSG;
 
@@ -2497,7 +2508,7 @@ static int generic_array_bisect(
                                 if (last_index < right) {
                                         uint64_t y = last_index + 1;
 
-                                        p = le64toh(array->entry_array.items[y]);
+                                        p = journal_file_entry_array_item(f, array, y);
                                         if (p <= 0)
                                                 return -EBADMSG;
 
@@ -2527,7 +2538,7 @@ static int generic_array_bisect(
                                 assert(left < right);
                                 i = (left + right) / 2;
 
-                                p = le64toh(array->entry_array.items[i]);
+                                p = journal_file_entry_array_item(f, array, i);
                                 if (p <= 0)
                                         r = -EBADMSG;
                                 else
@@ -2575,14 +2586,14 @@ found:
                 return 0;
 
         /* Let's cache this item for the next invocation */
-        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
+        chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
 
         if (subtract_one && i == 0)
                 p = last_p;
         else if (subtract_one)
-                p = le64toh(array->entry_array.items[i-1]);
+                p = journal_file_entry_array_item(f, array, i - 1);
         else
-                p = le64toh(array->entry_array.items[i]);
+                p = journal_file_entry_array_item(f, array, i);
 
         if (ret) {
                 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
index 01942ec72bced957f3ea2b7e818b4477a987a197..9b5bd1ff3637d22ab77f5a6d726cba93f79128e9 100644 (file)
@@ -194,7 +194,20 @@ int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);
 int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);
 
 uint64_t journal_file_entry_n_items(Object *o) _pure_;
-uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
+uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;
+
+static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {
+        assert(f);
+        assert(o);
+        return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry_array.items.compact[i]) :
+                                                   le64toh(o->entry_array.items.regular[i]);
+}
+
+static inline size_t journal_file_entry_array_item_size(JournalFile *f) {
+        assert(f);
+        return JOURNAL_HEADER_COMPACT(f->header) ? sizeof(le32_t) : sizeof(le64_t);
+}
+
 uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
 
 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);
index b9f6a161edca6dbb80297f17864a7d7fb1ed44de..d0da9bf8064c098cc77f772cd1657e9fdfdfafaa 100644 (file)
@@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
                 break;
 
         case OBJECT_ENTRY_ARRAY:
-                if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
-                    (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) {
+                if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
+                    (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) {
                         error(offset,
                               "Invalid object entry array size: %"PRIu64,
                               le64toh(o->object.size));
@@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
                         return -EBADMSG;
                 }
 
-                for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++)
-                        if (le64toh(o->entry_array.items[i]) != 0 &&
-                            !VALID64(le64toh(o->entry_array.items[i]))) {
+                for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {
+                        uint64_t q = journal_file_entry_array_item(f, o, i);
+                        if (q != 0 && !VALID64(q)) {
                                 error(offset,
                                       "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,
-                                      i, journal_file_entry_array_n_items(o),
-                                      le64toh(o->entry_array.items[i]));
+                                      i, journal_file_entry_array_n_items(f, o), q);
                                 return -EBADMSG;
                         }
+                }
 
                 break;
 
@@ -490,10 +490,10 @@ static int verify_data(
                         return -EBADMSG;
                 }
 
-                m = journal_file_entry_array_n_items(o);
+                m = journal_file_entry_array_n_items(f, o);
                 for (j = 0; i < n && j < m; i++, j++) {
 
-                        q = le64toh(o->entry_array.items[j]);
+                        q = journal_file_entry_array_item(f, o, j);
                         if (q <= last) {
                                 error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);
                                 return -EBADMSG;
@@ -737,11 +737,11 @@ static int verify_entry_array(
                         return -EBADMSG;
                 }
 
-                m = journal_file_entry_array_n_items(o);
+                m = journal_file_entry_array_n_items(f, o);
                 for (j = 0; i < n && j < m; i++, j++) {
                         uint64_t p;
 
-                        p = le64toh(o->entry_array.items[j]);
+                        p = journal_file_entry_array_item(f, o, j);
                         if (p <= last) {
                                 error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);
                                 return -EBADMSG;