updated specifications

author Yann Collet <yann.collet.73@gmail.com>

Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)

committer Yann Collet <yann.collet.73@gmail.com>

Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)
author Yann Collet <yann.collet.73@gmail.com>
Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)
committer Yann Collet <yann.collet.73@gmail.com>
Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c

index 001a19ae8a5db850fc2ccaa66ba1cf7f68263678..228205824a4c0e736c290335c9cea762ff47ff1f 100644 (file)
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -195,7 +195,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
  /* Frame format description
     Frame Header -  [ Block Header - Block ] - Frame End
     1) Frame Header
-      - 4 bytes - Magic Number : ZSTD_MAGICNUMBER (defined within zstd_static.h)
+      - 4 bytes - Magic Number : ZSTD_MAGICNUMBER (defined within zstd.h)
        - 1 byte  - Frame Descriptor
     2) Block Header
        - 3 bytes, starting with a 2-bits descriptor
@@ -629,7 +629,7 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeqPtr,
  
      /* FSE table descriptors */
      {   U32 const LLtype  = *ip >> 6;
-        U32 const Offtype = (*ip >> 4) & 3;
+        U32 const OFtype = (*ip >> 4) & 3;
          U32 const MLtype  = (*ip >> 2) & 3;
          ip++;
  
@@ -637,17 +637,17 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeqPtr,
          if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
  
          /* Build DTables */
-        {   size_t const bhSize = ZSTD_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
-            if (ZSTD_isError(bhSize)) return ERROR(corruption_detected);
-            ip += bhSize;
+        {   size_t const llhSize = ZSTD_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
+            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
+            ip += llhSize;
          }
-        {   size_t const bhSize = ZSTD_buildSeqTable(DTableOffb, Offtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
-            if (ZSTD_isError(bhSize)) return ERROR(corruption_detected);
-            ip += bhSize;
+        {   size_t const ofhSize = ZSTD_buildSeqTable(DTableOffb, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
+            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
+            ip += ofhSize;
          }
-        {   size_t const bhSize = ZSTD_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
-            if (ZSTD_isError(bhSize)) return ERROR(corruption_detected);
-            ip += bhSize;
+        {   size_t const mlhSize = ZSTD_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
+            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
+            ip += mlhSize;
      }   }
  
      return ip-istart;
diff --git a/zstd_compression_format.md b/zstd_compression_format.md

index a01305019c12e8f638970daf488aa20cf25e5ae8..dbadac75845739536991d67814502eb08888399b 100644 (file)
--- a/zstd_compression_format.md
+++ b/zstd_compression_format.md
@@ -134,9 +134,9 @@ delivering the final decompressed result as if it was a single content.
  Frame Header
  -------------
  
-| FHD     | (WD)      | (Content Size) | (dictID)  |
-| ------- | --------- |:--------------:| --------- |
-| 1 byte  | 0-1 byte  |  0 - 8 bytes   | 0-4 bytes |
+| FHD     | (WD)      | (dictID)  | (Content Size) |
+| ------- | --------- | --------- |:--------------:|
+| 1 byte  | 0-1 byte  | 0-4 bytes |  0 - 8 bytes   |
  
  Frame header has a variable size, which uses a minimum of 2 bytes,
  and up to 14 bytes depending on optional parameters.
@@ -145,11 +145,11 @@ __FHD byte__ (Frame Header Descriptor)
  
  The first Header's byte is called the Frame Header Descriptor.
  It tells which other fields are present.
-Decoding this byte is enough to get the full size of the Frame Header.
+Decoding this byte is enough to tell the size of Frame Header.
  
-|  BitNb  |   7-6  |    5    |   4    |    3     |    2     |    1-0   |
-| ------- | ------ | ------- | ------ | -------- | -------- | -------- |
-|FieldName| FCSize | Segment | Unused | Reserved | Checksum |  dictID  |
+|  BitNb  |   7-6  |    5    |   4    |    3     |    2     |  1-0   |
+| ------- | ------ | ------- | ------ | -------- | -------- | ------ |
+|FieldName| FCSize | Segment | Unused | Reserved | Checksum | dictID |
  
  In this table, bit 7 is highest bit, while bit 0 is lowest.
  
@@ -162,28 +162,28 @@ specifying if decompressed data size is provided within the header.
  | ------- | --- | --- | --- | --- |
  |FieldSize| 0-1 |  2  |  4  |  8  |
  
-Value 0 has a double meaning :
+Value 0 meaning depends on _single segment_ mode :
  it either means `0` (size not provided) _if_ the `WD` byte is present,
-or it means `1` byte (size <= 255 bytes).
+or `1` (frame content size <= 255 bytes) otherwise.
  
  __Single Segment__
  
  If this flag is set,
  data shall be regenerated within a single continuous memory segment.
+
  In which case, `WD` byte __is not present__,
  but `Frame Content Size` field necessarily is.
-
  As a consequence, the decoder must allocate a memory segment
  of size `>= Frame Content Size`.
  
  In order to preserve the decoder from unreasonable memory requirement,
-a decoder can refuse a compressed frame
+a decoder can reject a compressed frame
  which requests a memory size beyond decoder's authorized range.
  
  For broader compatibility, decoders are recommended to support
-memory sizes of 8 MB at least.
-However, this is merely a recommendation,
-and each decoder is free to support higher or lower limits,
+memory sizes of at least 8 MB.
+This is just a recommendation,
+as each decoder is free to support higher or lower limits,
  depending on local limitations.
  
  __Unused bit__
@@ -254,6 +254,21 @@ It's merely a recommendation though,
  decoders are free to support larger or lower limits,
  depending on local limitations.
  
+__Dictionary ID__
+
+This is a variable size field, which contains an ID.
+It checks if the correct dictionary is used for decoding.
+Note that this field is optional. If it's not present,
+it's up to the caller to make sure it uses the correct dictionary.
+
+Field size depends on __Dictionary ID flag__.
+1 byte can represent an ID 0-255.
+2 bytes can represent an ID 0-65535.
+4 bytes can represent an ID 0-(2^32-1).
+
+It's allowed to represent a small ID (for example `13`)
+with a large 4-bytes dictionary ID, losing some efficiency in the process.
+
  __Frame Content Size__
  
  This is the original (uncompressed) size.
@@ -274,27 +289,12 @@ When field size is 2, _an offset of 256 is added_.
  It's allowed to represent a small size (ex: `18`) using the 8-bytes variant.
  A size of `0` means `content size is unknown`.
  In which case, the `WD` byte will necessarily be present,
-and becomes the only hint to determine memory allocation.
+and becomes the only hint to help memory allocation.
  
  In order to preserve decoder from unreasonable memory requirement,
  a decoder can refuse a compressed frame
  which requests a memory size beyond decoder's authorized range.
  
-__Dictionary ID__
-
-This is a variable size field, which contains a single ID.
-It checks if the correct dictionary is used for decoding.
-Note that this field is optional. If it's not present,
-it's up to the caller to make sure it uses the correct dictionary.
-
-Field size depends on __Dictionary ID flag__.
-1 byte can represent an ID 0-255.
-2 bytes can represent an ID 0-65535.
-4 bytes can represent an ID 0-(2^32-1).
-
-It's allowed to represent a small ID (for example `13`)
-with a large 4-bytes dictionary ID, losing some efficiency in the process.
-
  
  Data Blocks
  -----------
@@ -364,7 +364,6 @@ over user-defined data and continue decoding.
  
  Skippable frames defined in this specification are compatible with LZ4 ones.
  
-
  __Magic Number__ :
  
  4 Bytes, Little endian format.
@@ -395,8 +394,8 @@ A compressed block consists of 2 sections :
  - Literals section
  - Sequences section
  
-### Prerequisite
-To decode a compressed block, it's required to access to following elements :
+### Prerequisites
+To decode a compressed block, the following elements are necessary :
  - Previous decoded blocks, up to a distance of `windowSize`,
    or all frame's previous blocks in "single segment" mode.
  - List of "recent offsets" from previous compressed block.
@@ -634,7 +633,6 @@ it gives the following distribution :
  | nb bits      |  0  |  4  |  4  |  3  |  2  |   1  |
  
  
-
  #### Literals bitstreams
  
  ##### Bitstreams sizes
@@ -711,12 +709,265 @@ which specifies a baseline and a number of additional bits.
  _Codes_ are FSE compressed,
  and interleaved with raw additional bits in the same bitstream.
  
-The Sequence section starts by a header,
-followed by an optional Probability table for each symbol type,
+The Sequences section starts by a header,
+followed by optional Probability tables for each symbol type,
  followed by the bitstream.
  
+To decode the Sequence section, it's required to know its size.
+This size is deduced from "blockSize - literalSectionSize".
+
+
  #### Sequences section header
  
+Consists of 2 items :
+- Nb of Sequences
+- Flags providing Symbol compression types
+
+__Nb of Sequences__
+
+This is a variable size field, `nbSeqs`, using between 1 and 3 bytes.
+Let's call its first byte `byte0`.
+- `if (byte0 == 0)` : there are no sequences.
+            The sequence section stops there.
+            Regenerated content is defined entirely by literals section.
+- `if (byte0 < 128)` : nbSeqs = byte0 . Uses 1 byte.
+- `if (byte0 < 255)` : nbSeqs = ((byte0-128) << 8) + byte1 . Uses 2 bytes.
+- `if (byte0 == 255)`: nbSeqs = byte1 + (byte2<<8) + 0x7F00 . Uses 3 bytes.
+
+__Symbol compression modes__
+
+This is a single byte, defining the compression mode of each symbol type.
+
+|  BitNb  |   7-6  |   5-4  |   3-2  |    1-0   |
+| ------- | ------ | ------ | ------ | -------- |
+|FieldName| LLtype | OFType | MLType | Reserved |
+
+The last field, `Reserved`, must be all-zeroes.
+
+`LLtype`, `OFType` and `MLType` define the compression mode of
+Literal Lengths, Offsets and Match Lengths respectively.
+
+They follow the same enumeration :
+
+|       Value      |    0   |  1  |    2   |  3  |
+| ---------------- | ------ | --- | ------ | --- |
+| Compression Mode | predef | RLE | Repeat | FSE |
+
+- "predef" : uses a pre-defined distribution table.
+- "RLE" : it's a single code, repeated `nbSeqs` times.
+- "Repeat" : re-use distribution table from previous compressed block.
+- "FSE" : standard FSE compression.
+          Symbol type requires a distribution table,
+          which will be described in next part.
+
+#### Symbols decoding
+
+##### Literal Lengths codes
+
+Literal lengths codes are values ranging from `0` to `35` included.
+They define lengths from 0 to 131071 bytes.
+
+|  Code  | 0-15 |
+| ------ | ---- |
+| nbBits |   0  |
+| value  | Code |
+
+|   Code   |  16  |  17  |  18  |  19  |  20  |  21  |  22  |  23  |
+| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| Baseline |  16  |  18  |  20  |  22  |  24  |  28  |  32  |  40  |
+| nb Bits  |   1  |   1  |   1  |   1  |   2  |   2  |   3  |   3  |
+
+|   Code   |  24  |  25  |  26  |  27  |  28  |  29  |  30  |  31  |
+| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| Baseline |  48  |  64  |  128 |  256 |  512 | 1024 | 2048 | 4096 |
+| nb Bits  |   4  |   6  |   7  |   8  |   9  |  10  |  11  |  12  |
+
+|   Code   |  32  |  33  |  34  |  35  |
+| -------- | ---- | ---- | ---- | ---- |
+| Baseline | 8192 |16384 |32768 |65536 |
+| nb Bits  |  13  |  14  |  15  |  16  |
+
+__Default distribution__
+
+When "compression mode" is defined as "default distribution",
+a pre-defined distribution is used for FSE compression.
+
+Here is its definition. It uses an accuracy of 6 bits (64 states).
+```
+short literalLengths_defaultDistribution[36] =
+        { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+          2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+         -1,-1,-1,-1 };
+```
+
+##### Match Lengths codes
+
+Match lengths codes are values ranging from `0` to `52` included.
+They define lengths from 3 to 131074 bytes.
+
+|  Code  |   0-31   |
+| ------ | -------- |
+| nbBits |     0    |
+| value  | Code + 3 |
+
+|   Code   |  32  |  33  |  34  |  35  |  36  |  37  |  38  |  39  |
+| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| Baseline |  35  |  37  |  39  |  41  |  43  |  47  |  51  |  59  |
+| nb Bits  |   1  |   1  |   1  |   1  |   2  |   2  |   3  |   3  |
+
+|   Code   |  40  |  41  |  42  |  43  |  44  |  45  |  46  |  47  |
+| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| Baseline |  67  |  83  |  99  |  131 |  259 |  515 | 1027 | 2051 |
+| nb Bits  |   4  |   4  |   5  |   7  |   8  |   9  |  10  |  11  |
+
+|   Code   |  48  |  49  |  50  |  51  |  52  |
+| -------- | ---- | ---- | ---- | ---- | ---- |
+| Baseline | 4099 | 8195 |16387 |32771 |65539 |
+| nb Bits  |  12  |  13  |  14  |  15  |  16  |
+
+__Default distribution__
+
+When "compression mode" is defined as "default distribution",
+a pre-defined distribution is used for FSE compression.
+
+Here is its definition. It uses an accuracy of 6 bits (64 states).
+```
+short matchLengths_defaultDistribution[53] =
+        { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+         -1,-1,-1,-1,-1 };
+```
+
+##### Offset codes
+
+Offset codes are values ranging from `0` to `N`,
+with `N` being limited by maximum backreference distance.
+
+A decoder is free to limit its maximum `N` supported,
+although the recommendation is to support at least up to `22`.
+For information, at the time of this writing,
+the reference decoder supports a maximum `N` value of `28` in 64-bits mode.
+
+An offset code is also the nb of additional bits to read,
+and can be translated into an `OFValue` using the following formulae :
+
+```
+OFValue = (1 << offsetCode) + readNBits(offsetCode);
+if (OFValue > 3) offset = OFValue - 3;
+```
+
+OFValue from 1 to 3 are special : they define "repeat codes",
+which means one of the previous offsets will be repeated.
+They are sorted in recency order, with 1 meaning the most recent one.
+
+__Default distribution__
+
+When "compression mode" is defined as "default distribution",
+a pre-defined distribution is used for FSE compression.
+
+Here is its definition. It uses an accuracy of 5 bits (32 states),
+and supports a maximum `N` of 28, allowing offset values up to 536,870,908 .
+
+If any sequence in the compressed block requires an offset larger than this,
+it's not possible to use the default distribution to represent it.
+
+```
+short offsetCodes_defaultDistribution[29] =
+        { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+```
+
+#### Distribution tables
+
+Following the header, up to 3 distribution tables can be described.
+They are, in order :
+- Literal lengths
+- Offsets
+- Match Lengths
+
+The content to decode depends on their respective compression mode :
+- Repeat mode : no content. Re-use distribution from previous compressed block.
+- Predef : no content. Use pre-defined distribution table.
+- RLE : 1 byte. This is the only code to use across the whole compressed block.
+- FSE : A distribution table is present.
+
+##### FSE distribution table : condensed format
+
+An FSE distribution table describes the probabilities of all symbols
+from `0` to the last present one (included)
+on a normalized scale of `2^AccuracyLog` .
+
+It's a bitstream which is read forward, in little-endian fashion.
+It's not necessary to know its exact size,
+since it will be discovered and reported by the decoding process.
+
+The bitstream starts by reporting on which scale it operates.
+`AccuracyLog = low4bits + 5;`
+In theory, it can define a scale from 5 to 20.
+In practice, decoders are allowed to limit the maximum supported `AccuracyLog`.
+Recommended maximums are `9` for literal and match lengths, and `8` for offsets.
+The reference decoder uses these limits.
+
+Then follows each symbol value, from `0` to last present one.
+The nb of bits used by each field is variable.
+It depends on :
+
+- Remaining probabilities + 1 :
+  __example__ :
+  Presuming an AccuracyLog of 8,
+  and presuming 100 probabilities points have already been distributed,
+  the decoder may discover value from `0` to `256 - 100 + 1 == 157` (included).
+  Therefore, it must read `log2sup(157) == 8` bits.
+
+- Value decoded : small values use 1 less bit :
+  __example__ :
+  Presuming values from 0 to 157 (included) are possible,
+  255-157 = 98 values are remaining in an 8-bits field.
+  They are used this way :
+  first 98 values (hence from 0 to 97) use only 7 bits,
+  values from 98 to 157 use 8 bits.
+  This is achieved through this scheme :
+
+  | Value read | Value decoded | nb Bits used |
+  | ---------- | ------------- | ------------ |
+  |   0 -  97  |   0 -  97     |  7           |
+  |  98 - 127  |  98 - 127     |  8           |
+  | 128 - 225  |   0 -  97     |  7           |
+  | 226 - 255  | 128 - 157     |  8           |
+
+Symbols probabilities are read one by one, in order.
+
+Probability is obtained from the decoded value using the following formula :
+`Proba = value - 1;`
+
+It means value `0` becomes negative probability `-1`.
+`-1` is a special probability, which means `less than 1`.
+Its effect on distribution table is described in a later paragraph.
+For the purpose of calculating cumulated distribution, it counts as one.
+
+When a symbol has a probability of `zero`,
+it is followed by a 2-bits repeat flag.
+This repeat flag tells how many probabilities of zeroes follow the current one.
+It provides a number ranging from 0 to 3.
+If it is a 3, another 2-bits repeat flag follows, and so on.
+
+When last symbol reaches cumulated total of `2^AccuracyLog`,
+decoding is complete.
+Then the decoder can tell how many bytes were used in this process,
+and how many symbols are present.
+
+The bitstream consumes a round number of bytes.
+Any remaining bit within the last byte is just unused.
+
+If the last symbol makes cumulated total go above `2^AccuracyLog`,
+distribution is considered corrupted.
+
+##### FSE decoding : from normalized distribution to decoding tables
+
+
+
+#### Bitstream
author	Yann Collet <yann.collet.73@gmail.com>
	Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)
committer	Yann Collet <yann.collet.73@gmail.com>
	Mon, 4 Jul 2016 14:13:11 +0000 (16:13 +0200)
lib/decompress/zstd_decompress.c		patch \| blob \| blame \| history
zstd_compression_format.md		patch \| blob \| blame \| history