update specification and comments

author Yann Collet <yann.collet.73@gmail.com>

Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)

committer Yann Collet <yann.collet.73@gmail.com>

Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)
author Yann Collet <yann.collet.73@gmail.com>
Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)
committer Yann Collet <yann.collet.73@gmail.com>
Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h

index 7989e6acaaa0583364b68ff526f0ae58d2d80659..43cbc9a3a3e0776522b3f955281bc485ea3653c9 100644 (file)
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -51,7 +51,7 @@
  /*-*************************************
  *  Common constants
  ***************************************/
-#define ZSTD_OPT_DEBUG 0     // 3 = compression stats;  5 = check encoded sequences;  9 = full logs
+#define ZSTD_OPT_DEBUG 0     /* 3 = compression stats;  5 = check encoded sequences;  9 = full logs */
  #include <stdio.h>
  #if defined(ZSTD_OPT_DEBUG) && ZSTD_OPT_DEBUG>=9
      #define ZSTD_LOG_PARSER(...) printf(__VA_ARGS__)
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c

index 91c8d5e5696ffdb4369d0b5d8b888e9e049c00bf..07d8802211c2ef03da97104ac0a26807b0b9075b 100644 (file)
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -427,21 +427,8 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
  */
  
  
-/* Frame descriptor
+/* Frame header :
  
-    // old
-   1 byte - Alloc :
-   bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
-   bit 4   : reserved for windowLog (must be zero)
-   bit 5   : reserved (must be zero)
-   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
-
-   1 byte - checker :
-   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
-   bit 2-7 : reserved (must be zero)
-
-
-    // new
     1 byte - FrameHeaderDescription :
     bit 0-1 : dictID (0, 1, 2 or 4 bytes)
     bit 2-4 : reserved (must be zero)
@@ -453,24 +440,24 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
     bit 0-2 : octal Fractional (1/8th)
     bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB)
  
+   Optional : content size (0, 1, 2, 4 or 8 bytes)
+   0 : unknown
+   1 : 0-255 bytes
+   2 : 256 - 65535+256
+   8 : up to 16 exa
+
     Optional : dictID (0, 1, 2 or 4 bytes)
     Automatic adaptation
     0 : no dictID
     1 : 1 - 255
     2 : 256 - 65535
     4 : all other values
-
-   Optional : content size (0, 1, 2, 4 or 8 bytes)
-   0 : unknown
-   1 : 0-255 bytes
-   2 : 256 - 65535+256
-   8 : up to 16 exa
  */
  
  
  /* Block format description
  
-   Block = Literal Section - Sequences Section
+   Block = Literals Section - Sequences Section
     Prerequisite : size of (compressed) block, maximum size of regenerated data
  
     1) Literal Section
@@ -478,7 +465,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
     1.1) Header : 1-5 bytes
          flags: 2 bits
              00 compressed by Huff0
-            01 unused
+            01 repeat
              10 is Raw (uncompressed)
              11 is Rle
              Note : using 01 => Huff0 with precomputed table ?
@@ -514,7 +501,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
              else           => 5 bytes (2-2-18-18)
              big endian convention
  
-        1- CTable available (stored into workspace ?)
+        1- CTable available (stored into workspace)
          2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
  
  
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c

index 84f64dc83e44b34df90dcf168574636c2ff3a531..001a19ae8a5db850fc2ccaa66ba1cf7f68263678 100644 (file)
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -207,20 +207,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
  */
  
  
-/* Frame descriptor
+/* Frame Header :
  
-    // old
-   1 byte - Alloc :
-   bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
-   bit 4   : reserved for windowLog (must be zero)
-   bit 5   : reserved (must be zero)
-   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
-
-   1 byte - checker :
-   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
-   bit 2-7 : reserved (must be zero)
-
-    // new
     1 byte - FrameHeaderDescription :
     bit 0-1 : dictID (0, 1, 2 or 4 bytes)
     bit 2   : checksumFlag
@@ -454,16 +442,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                            const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
  {
      const BYTE* const istart = (const BYTE*) src;
-    litBlockType_t lbt;
  
      if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
-    lbt = (litBlockType_t)(istart[0]>> 6);
  
-    switch(lbt)
+    switch((litBlockType_t)(istart[0]>> 6))
      {
      case lbt_huffman:
          {   size_t litSize, litCSize, singleStream=0;
-            U32 lhSize = ((istart[0]) >> 4) & 3;
+            U32 lhSize = (istart[0] >> 4) & 3;
              if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
              switch(lhSize)
              {
diff --git a/zstd_frame_format.md b/zstd_compression_format.md

similarity index 52%

rename from zstd_frame_format.md

rename to zstd_compression_format.md

index 61b5aef58ea6ede54cadb135286bb2e0021d8631..b203dd1013f7afc5c059c598aab5048019c72d86 100644 (file)
--- a/zstd_frame_format.md
+++ b/zstd_compression_format.md
@@ -1,5 +1,5 @@
-Zstandard Frame Format Description
-==================================
+Zstandard Compression Format Description
+========================================
  
  ### Notices
  
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
  
  ### Version
  
-0.1.0 (30/06/2016)
+0.1.0 (30/06/2016 - unfinished)
  
  
  Introduction
@@ -53,17 +53,32 @@ A compliant decompressor must be able to decompress
  at least one working set of parameters
  that conforms to the specifications presented here.
  It may also ignore informative fields, such as checksum.
-Whenever it does not support a specific parameter within the compressed stream,
-it must produce a non-ambiguous error code
-and associated error message explaining which parameter is unsupported.
+Whenever it does not support a parameter defined in the compressed stream,
+it must produce a non-ambiguous error code and associated error message
+explaining which parameter is unsupported.
+
+
+Definitions
+-----------
+A content compressed by Zstandard is transformed into a Zstandard __frame__.
+Multiple frames can be appended into a single file or stream.
+A frame is totally independent, has a defined beginning and end,
+and a set of parameters which tells the decoder how to decompress it.
+
+A frame encapsulates one or multiple __blocks__.
+Each block can be compressed or not,
+and has a guaranteed maximum content size, which depends on frame parameters.
+Unlike frames, each block depends on previous blocks for proper decoding.
+However, each block can be decompressed without waiting for its successor,
+allowing streaming operations.
  
  
  General Structure of Zstandard Frame format
  -------------------------------------------
  
-| MagicNb |  F. Header | Block | (...) | EndMark |
-|:-------:|:----------:| ----- | ----- | ------- |
-| 4 bytes | 2-14 bytes |       |       | 3 bytes |
+| MagicNb |  F. Header | Block | (More blocks) | EndMark |
+|:-------:|:----------:| ----- | ------------- | ------- |
+| 4 bytes | 2-14 bytes |       |               | 3 bytes |
  
  __Magic Number__
  
@@ -73,7 +88,6 @@ Value : 0xFD2FB527
  __Frame Header__
  
  2 to 14 Bytes, to be detailed in the next part.
-Most important part of the spec.
  
  __Data Blocks__
  
@@ -87,11 +101,11 @@ This last block header may optionally host a __Content Checksum__ .
  
  __Content Checksum__
  
-Content Checksum verify that the full content has been decoded correctly.
+Content Checksum verifies that frame content has been regenerated correctly.
  The content checksum is the result
  of [xxh64() hash function](https://www.xxHash.com)
  digesting the original (decoded) data as input, and a seed of zero.
-Bits from 11 to 32 (included) are extracted to form the 22 bits checksum
+Bits from 11 to 32 (included) are extracted to form a 22 bits checksum
  stored into the last block header.
  ```
  contentChecksum = (XXH64(content, size, 0) >> 11) & ((1<<22)-1);
@@ -114,52 +128,64 @@ The ability to decode multiple concatenated frames
  within a single stream or file is left outside of this specification.
  As an example, the reference `zstd` command line utility is able
  to decode all concatenated frames in their sequential order,
-presenting the final decompressed result as if it was a single frame.
+delivering the final decompressed result as if it was a single content.
  
  
  Frame Header
-----------------
+-------------
  
  | FHD     | (WD)      | (Content Size) | (dictID)  |
  | ------- | --------- |:--------------:| --------- |
  | 1 byte  | 0-1 byte  |  0 - 8 bytes   | 0-4 bytes |
  
-Frame header uses a minimum of 2 bytes,
+Frame header has a variable size, which uses a minimum of 2 bytes,
  and up to 14 bytes depending on optional parameters.
  
  __FHD byte__ (Frame Header Descriptor)
  
+The first Header's byte is called the Frame Header Descriptor.
+It tells which other fields are present.
+Decoding this byte is enough to get the full size of the Frame Header.
+
  |  BitNb  |   7-6  |    5    |   4    |    3     |    2     |    1-0   |
  | ------- | ------ | ------- | ------ | -------- | -------- | -------- |
  |FieldName| FCSize | Segment | Unused | Reserved | Checksum |  dictID  |
  
-In the table, bit 7 is highest bit, while bit 0 is lowest.
+In this table, bit 7 is highest bit, while bit 0 is lowest.
  
  __Frame Content Size flag__
  
  This is a 2-bits flag (`= FHD >> 6`),
-telling if original data size is provided within the header
+specifying if decompressed data size is provided within the header.
  
  |  Value  |  0  |  1  |  2  |  3  |
  | ------- | --- | --- | --- | --- |
  |FieldSize| 0-1 |  2  |  4  |  8  |
  
-Value 0 is special : it means `0` (data size not provided)
-_if_ the `WD` byte is present.
-Otherwise, it means `1` byte (data size <= 255 bytes).
+Value 0 has a double meaning :
+it either means `0` (size not provided) _if_ the `WD` byte is present,
+or it means `1` byte (size <= 255 bytes).
  
  __Single Segment__
  
  If this flag is set,
  data shall be regenerated within a single continuous memory segment.
-In which case, `WD` byte is not present,
+In which case, `WD` byte __is not present__,
  but `Frame Content Size` field necessarily is.
-The size of the memory segment must be at least `>= Frame Content Size`.
  
-In order to preserve decoder from unreasonable memory requirement,
+As a consequence, the decoder must allocate a memory segment
+of size `>= Frame Content Size`.
+
+In order to preserve the decoder from unreasonable memory requirement,
  a decoder can refuse a compressed frame
  which requests a memory size beyond decoder's authorized range.
  
+For broader compatibility, decoders are recommended to support
+memory sizes of 8 MB at least.
+However, this is merely a recommendation,
+and each decoder is free to support higher or lower limits,
+depending on local limitations.
+
  __Unused bit__
  
  The value of this bit is unimportant
@@ -170,7 +196,7 @@ to signal a property which is not required to properly decode the frame.
  __Reserved bit__
  
  This bit is reserved for some future feature.
-Its value must be zero.
+Its value _must be zero_.
  A decoder compliant with this specification version must ensure it is not set.
  This bit may be used in a future revision,
  to signal a feature that must be interpreted in order to decode the frame.
@@ -193,7 +219,7 @@ telling if a dictionary ID is provided within the header
  __WD byte__ (Window Descriptor)
  
  Provides guarantees on maximum back-reference distance
-that will be used within compressed data.
+that will be present within compressed data.
  This information is useful for decoders to allocate enough memory.
  
  |   BitNb   |    7-3   |    0-2   |
@@ -208,16 +234,25 @@ windowAdd = (windowBase / 8) * Mantissa;
  windowSize = windowBase + windowAdd;
  ```
  The minimum window size is 1 KB.
-The maximum value is (15*(2^38))-1 bytes, which is almost 1.875 TB.
+The maximum size is (15*(2^38))-1 bytes, which is almost 1.875 TB.
+
+To properly decode compressed data,
+a decoder will need to allocate a buffer of at least `windowSize` bytes.
  
-`WD` byte is optional. It's not present in `single segment` mode.
-In which case, the maximum back-reference distance is the content size itself, which can be any value from 1 to 2^64-1 bytes (16 EB).
+Note that `WD` byte is optional. It's not present in `single segment` mode.
+In which case, the maximum back-reference distance is the content size itself,
+which can be any value from 1 to 2^64-1 bytes (16 EB).
  
  In order to preserve decoder from unreasonable memory requirements,
  a decoder can refuse a compressed frame
  which requests a memory size beyond decoder's authorized range.
  
-For better interoperability, decoders are recommended to be compatible with window sizes up to 8 MB. Encoders are recommended to not request more than 8 MB. It's just a recommendation, decoders are free to accept or refuse larger or lower values.
+For better interoperability,
+decoders are recommended to be compatible with window sizes of 8 MB.
+Encoders are recommended to not request more than 8 MB.
+It's merely a recommendation though,
+decoders are free to support larger or lower limits,
+depending on local limitations.
  
  __Frame Content Size__
  
@@ -235,11 +270,11 @@ Format is Little endian.
  |     8      | 0 - 2^64-1 |
  
  When field size is 1, 4 or 8 bytes, the value is read directly.
-When field size is 2, an offset of 256 is added.
-It's possible to represent a small size of `18` using the 8-bytes variant.
-A size of `0` means `data size is unknown`.
-In which case, the `WD` byte will be the only hint
-to determine memory allocation.
+When field size is 2, _an offset of 256 is added_.
+It's allowed to represent a small size (ex: `18`) using the 8-bytes variant.
+A size of `0` means `content size is unknown`.
+In which case, the `WD` byte will necessarily be present,
+and becomes the only hint to determine memory allocation.
  
  In order to preserve decoder from unreasonable memory requirement,
  a decoder can refuse a compressed frame
@@ -257,7 +292,8 @@ Field size depends on __Dictionary ID flag__.
  2 bytes can represent an ID 0-65535.
  4 bytes can represent an ID 0-(2^32-1).
  
-It's possible to represent a small ID (for example `13`) with a large 4-bytes dictionary ID, losing some efficiency in the process.
+It's allowed to represent a small ID (for example `13`)
+with a large 4-bytes dictionary ID, losing some efficiency in the process.
  
  
  Data Blocks
@@ -270,10 +306,10 @@ Data Blocks
  
  __Block Header__
  
-This field uses 3-bytes, format is big-endian.
+This field uses 3-bytes, format is __big-endian__.
  
  The 2 highest bits represent the `block type`,
-while the remaining 22 bits represent the block size.
+while the remaining 22 bits represent the (compressed) block size.
  
  There are 4 block types :
  
@@ -281,24 +317,24 @@ There are 4 block types :
  | ---------- | ---------- | --- | --- | ------- |
  | Block Type | Compressed | Raw | RLE | EndMark |
  
-- Compressed : this is a compressed block,
-  following Zstandard's block format specification.
-  The "block size" is the compressed size.
+- Compressed : this is a Zstandard compressed block,
+  detailed in a later part of this specification.
+  "block size" is the compressed size.
    Decompressed size is unknown,
-  but its maximum possible value is guaranteed (see later)
+  but its maximum possible value is guaranteed (see below)
  - Raw : this is an uncompressed block.
    "block size" is the number of bytes to read and copy.
  - RLE : this is a single byte, repeated N times.
-  In which case, the size of the "compressed" block is always 1,
-  and the "block size" is the size to regenerate.
+  In which case, "block size" is the size to regenerate,
+  while the "compressed" block is just 1 byte (the byte to repeat).
  - EndMark : this is not a block. Signal the end of the frame.
    The rest of the field may be optionally filled by a checksum
    (see frame checksum).
  
-Block Size shall never be larger than Block Maximum Size.
-Block Maximum Size is the smallest of :
-- Max back-reference distance
-- 128 KB
+Block sizes must respect a few rules :
+- In compressed mode, compressed size is always strictly `< contentSize`.
+- Block decompressed size is necessarily <= maximum back-reference distance .
+- Block decompressed size is necessarily <= 128 KB
  
  
  __Data__
@@ -306,8 +342,8 @@ __Data__
  Where the actual data to decode stands.
  It might be compressed or not, depending on previous field indications.
  A data block is not necessarily "full" :
-an arbitrary “flush” may happen anytime. Any block can be “partially filled”.
-Therefore, data can have any size, up to Block Maximum Size.
+since an arbitrary “flush” may happen anytime,
+block content can be any size, up to Block Maximum Size.
  Block Maximum Size is the smallest of :
  - Max back-reference distance
  - 128 KB
@@ -329,25 +365,122 @@ over user-defined data and continue decoding.
  Skippable frames defined in this specification are compatible with LZ4 ones.
  
  
-__Magic Number__
+__Magic Number__ :
  
  4 Bytes, Little endian format.
  Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F.
  All 16 values are valid to identify a skippable frame.
  
-__Frame Size__
+__Frame Size__ :
  
  This is the size, in bytes, of the following User Data
  (without including the magic number nor the size field itself).
  4 Bytes, Little endian format, unsigned 32-bits.
  This means User Data can’t be bigger than (2^32-1) Bytes.
  
-__User Data__
+__User Data__ :
  
  User Data can be anything. Data will just be skipped by the decoder.
  
  
+Compressed block format
+-----------------------
+This specification details the content of a _compressed block_.
+A compressed block has a size, which must be known in order to decode it.
+It also has a guaranteed maximum regenerated size,
+in order to properly allocate destination buffer.
+See "Frame format" for more details.
+
+A compressed block consists of 2 sections :
+- Literals section
+- Sequences section
+
+### Compressed Literals
+
+Literals are compressed using order-0 huffman compression.
+During sequence phase, literals will be entangled with match copy operations.
+All literals are regrouped in the first part of the block.
+They can be decoded first, and then copied during sequence operations,
+or they can be decoded on the fly, as needed by sequences.
+
+| Header | (Tree Description) | Stream1 | (Stream2) | (Stream3) | (Stream4) |
+| ------ | ------------------ | ------- | --------- | --------- | --------- |
+
+Literals can be compressed, or uncompressed.
+When compressed, an optional tree description can be present,
+followed by 1 or 4 streams.
+
+#### Block Literal Header
+
+Header is in charge of describing precisely how literals are packed.
+It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes,
+using big-endian convention.
+
+| BlockType | sizes format | (compressed size) | regenerated size |
+| --------- | ------------ | ----------------- | ---------------- |
+|   2 bits  |  1 - 2 bits  |    0 - 18 bits    |    5 - 20 bits   |
+
+__Block Type__ :
+
+This is a 2-bits field, describing 4 different block types :
+
+|    Value   |      0     |    1   |  2  |    3    |
+| ---------- | ---------- | ------ | --- | ------- |
+| Block Type | Compressed | Repeat | Raw |   RLE   |
+
+- Compressed : This is a standard huffman-compressed block,
+               starting with a huffman tree description.
+               See details below.
+- Repeat Stats : This is a huffman-compressed block,
+               using huffman tree from previous huffman-compressed block.
+               Huffman tree description will be skipped.
+               Compressed stream is equivalent to "compressed" block type.
+- Raw : Literals are stored uncompressed.
+- RLE : Literals consist of a single byte value repeated N times.
+
+__Sizes format__ :
+
+Size formats are divided into 2 families :
+
+- For compressed blocks, both the compressed size and the decompressed size
+  must be decoded. The number of streams is decoded as well.
+- For Raw or RLE blocks, it's enough to decode the size to regenerate.
+
+For values spanning several bytes, convention is Big-endian.
+
+__Sizes format for Raw or RLE block__ :
+
+- Value : 0x : Regenerated size uses 5 bits (0-31).
+               Total literal header size is 1 byte.
+               `size = h[0] & 31;`
+- Value : 10 : Regenerated size uses 12 bits (0-4095).
+               Total literal header size is 2 bytes.
+               `size = ((h[0] & 15) << 8) + h[1];`
+- Value : 11 : Regenerated size uses 20 bits (0-1048575).
+               Total literal header size is 3 bytes.
+               `size = ((h[0] & 15) << 16) + (h[1]<<8) + h[2];`
+
+Note : it's allowed to represent a short value (ex : `13`)
+using a long format, accepting the reduced compactness.
+
+__Sizes format for Compressed Block__ :
+
+Note : also applicable to "repeat-stats" blocks.
+- Value : 00 : 4 streams
+               Compressed and regenerated sizes use 10 bits (0-1023)
+               Total literal header size is 3 bytes
+- Value : 01 : _Single stream_
+               Compressed and regenerated sizes use 10 bits (0-1023)
+               Total literal header size is 3 bytes
+- Value : 10 : 4 streams
+               Compressed and regenerated sizes use 14 bits (0-16383)
+               Total literal header size is 4 bytes
+- Value : 11 : 4 streams
+               Compressed and regenerated sizes use 18 bits (0-262143)
+               Total literal header size is 5 bytes
+
+
+
  Version changes
  ---------------
-
  0.1 : initial release
author	Yann Collet <yann.collet.73@gmail.com>
	Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)
committer	Yann Collet <yann.collet.73@gmail.com>
	Fri, 1 Jul 2016 18:55:28 +0000 (20:55 +0200)
lib/common/zstd_internal.h		patch \| blob \| blame \| history
lib/compress/zstd_compress.c		patch \| blob \| blame \| history
lib/decompress/zstd_decompress.c		patch \| blob \| blame \| history
zstd_compression_format.md	[moved from zstd_frame_format.md with 52% similarity]	patch \| blob \| blame \| history