fixed decoder behavior when nbSeqs==0 is encoded using 2 bytes

author Yann Collet <cyan@fb.com>

Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)

committer Yann Collet <cyan@fb.com>

Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)
author Yann Collet <cyan@fb.com>
Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)
committer Yann Collet <cyan@fb.com>
Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)
diff --git a/doc/decompressor_errata.md b/doc/decompressor_errata.md

index e170a62c155c870836d6262ace0bff000e827446..83d4071cb4d0c6aaadf97ab9d4341f0e70e30c4e 100644 (file)
--- a/doc/decompressor_errata.md
+++ b/doc/decompressor_errata.md
@@ -12,6 +12,26 @@ Each entry will contain:
  The document is in reverse chronological order, with the bugs that affect the most recent zstd decompressor versions listed first.
  
  
+No sequence using the 2-bytes format
+------------------------------------------------
+
+**Last affected version**: v1.5.5
+
+**Affected decompressor component(s)**: Library & CLI
+
+**Produced by the reference compressor**: No
+
+**Example Frame**: see zstd/tests/golden-decompression/zeroSeq_2B.zst
+
+The zstd decoder incorrectly expects FSE tables when there are 0 sequences present in the block
+if the value 0 is encoded using the 2-bytes format.
+Instead, it should immediately end the sequence section, and move on to next block.
+
+This situation was never generated by the reference compressor,
+because representing 0 sequences with the 2-bytes format is inefficient
+(the 1-byte format is always used in this case).
+
+
  Compressed block with a size of exactly 128 KB
  ------------------------------------------------
  
@@ -32,6 +52,7 @@ These blocks used to be disallowed by the spec up until spec version 0.3.2 when
  
  > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block).
  
+
  Compressed block with 0 literals and 0 sequences
  ------------------------------------------------
  
@@ -51,6 +72,7 @@ Additionally, these blocks were disallowed by the spec up until spec version 0.3
  
  > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block).
  
+
  First block is RLE block
  ------------------------
  
@@ -72,6 +94,7 @@ block.
  
  https://github.com/facebook/zstd/blob/8814aa5bfa74f05a86e55e9d508da177a893ceeb/lib/compress/zstd_compress.c#L3527-L3535
  
+
  Tiny FSE Table & Block
  ----------------------
  
diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c

index 1da7c528d23995563aa5c039ea5d73eb74e85832..921c8f54cd35810b5f5a4ace085807d9113be8ae 100644 (file)
--- a/doc/educational_decoder/zstd_decompress.c
+++ b/doc/educational_decoder/zstd_decompress.c
@@ -1018,12 +1018,7 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
      // This is a variable size field using between 1 and 3 bytes. Let's call its
      // first byte byte0."
      u8 header = IO_read_bits(in, 8);
-    if (header == 0) {
-        // "There are no sequences. The sequence section stops there.
-        // Regenerated content is defined entirely by literals section."
-        *sequences = NULL;
-        return 0;
-    } else if (header < 128) {
+    if (header < 128) {
          // "Number_of_Sequences = byte0 . Uses 1 byte."
          num_sequences = header;
      } else if (header < 255) {
@@ -1034,6 +1029,12 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
          num_sequences = IO_read_bits(in, 16) + 0x7F00;
      }
  
+    if (num_sequences == 0) {
+        // "There are no sequences. The sequence section stops there."
+        *sequences = NULL;
+        return 0;
+    }
+
      *sequences = malloc(num_sequences * sizeof(sequence_command_t));
      if (!*sequences) {
          BAD_ALLOC();
diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md

index 2a69c4c30ae95364a6bfea815e8db862af99501e..cd7308de196e58099fff907b6a9c40689ff66136 100644 (file)
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
  
  ### Version
  
-0.3.9 (2023-03-08)
+0.4.0 (2023-06-05)
  
  
  Introduction
@@ -650,15 +650,16 @@ __`Number_of_Sequences`__
  
  This is a variable size field using between 1 and 3 bytes.
  Let's call its first byte `byte0`.
-- `if (byte0 == 0)` : there are no sequences.
-            The sequence section stops there.
-            Decompressed content is defined entirely as Literals Section content.
-            The FSE tables used in `Repeat_Mode` aren't updated.
  - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte.
  - `if (byte0 < 255)` : `Number_of_Sequences = ((byte0 - 0x80) << 8) + byte1`. Uses 2 bytes.
              Note that the 2 bytes format fully overlaps the 1 byte format.
  - `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00`. Uses 3 bytes.
  
+`if (Number_of_Sequences == 0)` : there are no sequences.
+            The sequence section stops immediately,
+            FSE tables used in `Repeat_Mode` aren't updated.
+            Block's decompressed content is defined solely by the Literals Section content.
+
  __Symbol compression modes__
  
  This is a single byte, defining the compression mode of each symbol type.
@@ -1698,6 +1699,7 @@ or at least provide a meaningful error code explaining for which reason it canno
  
  Version changes
  ---------------
+- 0.4.0 : fixed imprecise behavior for nbSeq==0, detected by Igor Pavlov
  - 0.3.9 : clarifications for Huffman-compressed literal sizes.
  - 0.3.8 : clarifications for Huffman Blocks and Huffman Tree descriptions.
  - 0.3.7 : clarifications for Repeat_Offsets, matching RFC8878
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h

index 1f942f27bf090c94b4a22bc2b814a86d599d5a65..f7c57a028bfe904c85175cc9012aa42392c850e2 100644 (file)
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -366,13 +366,13 @@ typedef struct {
  
  /*! ZSTD_getcBlockSize() :
   *  Provides the size of compressed block from block header `src` */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: decompress, fullbench */
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                            blockProperties_t* bpPtr);
  
  /*! ZSTD_decodeSeqHeaders() :
   *  decode sequence header from src */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: zstd_decompress_block, fullbench */
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                         const void* src, size_t srcSize);
  
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c

index 5028a52f10318fded1d93546e4c4bbadba2f9235..c63d06d6a9efea0329860f825a60ef56e220f101 100644 (file)
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -706,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  
      /* SeqHead */
      nbSeq = *ip++;
-    if (!nbSeq) {
-        *nbSeqPtr=0;
-        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
-        return 1;
-    }
      if (nbSeq > 0x7F) {
          if (nbSeq == 0xFF) {
              RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
@@ -723,6 +718,11 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
      }
      *nbSeqPtr = nbSeq;
  
+    if (nbSeq == 0) {
+        /* No sequence : section ends immediately */
+        return (size_t)(ip - istart);
+    }
+
      /* FSE table descriptors */
      RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
      {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
diff --git a/tests/golden-decompression/zeroSeq_2B.zst b/tests/golden-decompression/zeroSeq_2B.zst

new file mode 100644 (file)

index 0000000..f9f3520

Binary files /dev/null and b/tests/golden-decompression/zeroSeq_2B.zst differ
author	Yann Collet <cyan@fb.com>
	Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)
committer	Yann Collet <cyan@fb.com>
	Mon, 5 Jun 2023 23:03:00 +0000 (16:03 -0700)
doc/decompressor_errata.md		patch \| blob \| blame \| history
doc/educational_decoder/zstd_decompress.c		patch \| blob \| blame \| history
doc/zstd_compression_format.md		patch \| blob \| blame \| history
lib/common/zstd_internal.h		patch \| blob \| blame \| history
lib/decompress/zstd_decompress_block.c		patch \| blob \| blame \| history
tests/golden-decompression/zeroSeq_2B.zst	[new file with mode: 0644]	patch \| blob