From 1435c4b20cb7d2151956689629c20f83f32162c4 Mon Sep 17 00:00:00 2001 From: Mostyn Bramley-Moore Date: Sat, 1 Nov 2025 22:42:30 +0100 Subject: [PATCH] Skip over LZ4/zstd skippable frames when detecting data format Since LZ4 and zstd share the same format for skippable frames, we need to skip over these frames when trying to detect the format of compressed data. Let's read up to something like 64kb of data when performing this scanning. Note that the LZ4 specification advises against starting with a skippable frame, but doesn't forbid it: > For the purpose of facilitating identification, it is discouraged to > start a flow of concatenated frames with a skippable frame. If there > is a need to start such a flow with some user data encapsulated into > a skippable frame, it's recommended to start with a zero-byte LZ4 > frame followed by a skippable frame. This will make it easier for > file type identifiers. Resolves #2692. --- libarchive/archive_read_support_filter_lz4.c | 82 ++++++++++++++++--- libarchive/archive_read_support_filter_zstd.c | 67 ++++++++++++--- 2 files changed, 124 insertions(+), 25 deletions(-) diff --git a/libarchive/archive_read_support_filter_lz4.c b/libarchive/archive_read_support_filter_lz4.c index 760e6d938..144572ef2 100644 --- a/libarchive/archive_read_support_filter_lz4.c +++ b/libarchive/archive_read_support_filter_lz4.c @@ -49,9 +49,12 @@ #include "archive_xxhash.h" #define LZ4_MAGICNUMBER 0x184d2204 -#define LZ4_SKIPPABLED 0x184d2a50 #define LZ4_LEGACY 0x184c2102 +// Note: LZ4 and zstd share the same skippable frame format with the same magic numbers. +#define LZ4_SKIPPABLE_START 0x184D2A50 +#define LZ4_SKIPPABLE_MASK 0xFFFFFFF0 + #if defined(HAVE_LIBLZ4) struct private_data { enum { SELECT_STREAM, @@ -141,19 +144,67 @@ lz4_reader_bid(struct archive_read_filter_bidder *self, { const unsigned char *buffer; ssize_t avail; - int bits_checked; - uint32_t number; + int bits_checked = 0; + ssize_t min_lz4_archive_size = 11; + + // LZ4 skippable frames contain a 4 byte magic number followed by + // a 4 byte frame data size, then that number of bytes of data. Regular + // frames contain a 4 byte magic number followed by a 2-14 byte frame + // header, some data, and a 3 byte end marker. + ssize_t min_lz4_frame_size = 8; + + ssize_t offset_in_buffer = 0; + ssize_t max_lookahead = 64 * 1024; - (void)self; /* UNUSED */ + (void)self; // UNUSED - /* Minimal lz4 archive is 11 bytes. */ - buffer = __archive_read_filter_ahead(filter, 11, &avail); + // Zstd and LZ4 skippable frame magic numbers are identical. To + // differentiate these two, we need to look for a non-skippable + // frame. + + // Minimal lz4 archive is 11 bytes. + buffer = __archive_read_filter_ahead(filter, min_lz4_archive_size, &avail); if (buffer == NULL) return (0); - /* First four bytes must be LZ4 magic numbers. */ - bits_checked = 0; - if ((number = archive_le32dec(buffer)) == LZ4_MAGICNUMBER) { + uint32_t magic_number = archive_le32dec(buffer); + + while ((magic_number & LZ4_SKIPPABLE_MASK) == LZ4_SKIPPABLE_START) { + + offset_in_buffer += 4; // Skip over the magic number + + // Ensure that we can read another 4 bytes. + if (offset_in_buffer + 4 > avail) { + buffer = __archive_read_filter_ahead(filter, offset_in_buffer + 4, &avail); + if (buffer == NULL) + return (0); + } + + uint32_t frame_data_size = archive_le32dec(buffer + offset_in_buffer); + + // Skip over the 4 frame data size bytes, plus the value stored there. + offset_in_buffer += 4 + frame_data_size; + + // There should be at least one more frame if this is LZ4 data. + if (offset_in_buffer + min_lz4_frame_size > avail) { // TODO: should this be >= ? + if (offset_in_buffer + min_lz4_frame_size > max_lookahead) + return (0); + + buffer = __archive_read_filter_ahead(filter, offset_in_buffer + min_lz4_frame_size, &avail); + if (buffer == NULL) + return (0); + } + + magic_number = archive_le32dec(buffer + offset_in_buffer); + } + + // We have skipped over any skippable frames. Either a regular LZ4 frame + // follows, or this isn't LZ4 data. + + bits_checked = offset_in_buffer; + buffer = buffer + offset_in_buffer; + + if (magic_number == LZ4_MAGICNUMBER) { unsigned char flag, BD; bits_checked += 32; @@ -175,11 +226,16 @@ lz4_reader_bid(struct archive_read_filter_bidder *self, if (BD & ~0x70) return (0); bits_checked += 8; - } else if (number == LZ4_LEGACY) { + + return (bits_checked); + } + + if (magic_number == LZ4_LEGACY) { bits_checked += 32; + return (bits_checked); } - - return (bits_checked); + + return (0); } #if !defined(HAVE_LIBLZ4) @@ -342,7 +398,7 @@ lz4_filter_read(struct archive_read_filter *self, const void **p) return lz4_filter_read_default_stream(self, p); else if (number == LZ4_LEGACY) return lz4_filter_read_legacy_stream(self, p); - else if ((number & ~0xF) == LZ4_SKIPPABLED) { + else if ((number & LZ4_SKIPPABLE_MASK) == LZ4_SKIPPABLE_START) { read_buf = __archive_read_filter_ahead( self->upstream, 4, NULL); if (read_buf == NULL) { diff --git a/libarchive/archive_read_support_filter_zstd.c b/libarchive/archive_read_support_filter_zstd.c index 9a1dd71ab..da7c540db 100644 --- a/libarchive/archive_read_support_filter_zstd.c +++ b/libarchive/archive_read_support_filter_zstd.c @@ -110,24 +110,67 @@ zstd_bidder_bid(struct archive_read_filter_bidder *self, { const unsigned char *buffer; ssize_t avail; - unsigned prefix; - /* Zstd frame magic values */ - unsigned zstd_magic = 0xFD2FB528U; - unsigned zstd_magic_skippable_start = 0x184D2A50U; - unsigned zstd_magic_skippable_mask = 0xFFFFFFF0; + // Zstandard skippable frames contain a 4 byte magic number followed by + // a 4 byte frame data size, then that number of bytes of data. Regular + // frames contain a 4 byte magic number followed by a 2-14 byte frame + // header, some data, and a 3 byte end marker. + ssize_t min_zstd_frame_size = 8; - (void) self; /* UNUSED */ + ssize_t offset_in_buffer = 0; + ssize_t max_lookahead = 64 * 1024; - buffer = __archive_read_filter_ahead(filter, 4, &avail); + // Zstd regular frame magic number. + uint32_t zstd_magic = 0xFD2FB528U; + + // Note: Zstd and LZ4 skippable frame magic numbers are identical. + // To differentiate these two, we need to look for a non-skippable + // frame. + uint32_t zstd_magic_skippable_start = 0x184D2A50; + uint32_t zstd_magic_skippable_mask = 0xFFFFFFF0; + + (void) self; // UNUSED + + buffer = __archive_read_filter_ahead(filter, min_zstd_frame_size, &avail); if (buffer == NULL) return (0); - prefix = archive_le32dec(buffer); - if (prefix == zstd_magic) - return (32); - if ((prefix & zstd_magic_skippable_mask) == zstd_magic_skippable_start) - return (32); + uint32_t magic_number = archive_le32dec(buffer); + + while ((magic_number & zstd_magic_skippable_mask) == zstd_magic_skippable_start) { + + offset_in_buffer += 4; // Skip over the magic number + + // Ensure that we can read another 4 bytes. + if (offset_in_buffer + 4 > avail) { + buffer = __archive_read_filter_ahead(filter, offset_in_buffer + 4, &avail); + if (buffer == NULL) + return (0); + } + + uint32_t frame_data_size = archive_le32dec(buffer + offset_in_buffer); + + // Skip over the 4 frame data size bytes, plus the value stored there. + offset_in_buffer += 4 + frame_data_size; + + // There should be at least one more frame if this is zstd data. + if (offset_in_buffer + min_zstd_frame_size > avail) { + if (offset_in_buffer + min_zstd_frame_size > max_lookahead) + return (0); + + buffer = __archive_read_filter_ahead(filter, offset_in_buffer + min_zstd_frame_size, &avail); + if (buffer == NULL) + return (0); + } + + magic_number = archive_le32dec(buffer + offset_in_buffer); + } + + // We have skipped over any skippable frames. Either a regular zstd frame + // follows, or this isn't zstd data. + + if (magic_number == zstd_magic) + return (offset_in_buffer + 4); return (0); } -- 2.47.3