]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Don't shrink window log when streaming with a dictionary
authorNick Terrell <terrelln@fb.com>
Mon, 4 Jan 2021 21:43:47 +0000 (13:43 -0800)
committerNick Terrell <terrelln@fb.com>
Mon, 4 Jan 2021 23:54:09 +0000 (15:54 -0800)
Fixes #2442.

1. When creating a dictionary keep the same behavior as before.
   Assume the source size is 513 bytes when adjusting parameters.
2. When calling ZSTD_getCParams() or ZSTD_adjustCParams() keep
   the same behavior as before.
3. When attaching a dictionary keep the same behavior of ignoring
   the dictionary size. When streaming this will select the
   largest parameters and not adjust them down. But, the CDict
   will use the correctly sized parameters, which seems like the
   right tradeoff.
4. When not attaching a dictionary (either forced not to, or
   using a prefix dictionary) we select parameters based on the
   dictionary size + source size, and assume the source size is
   small, which is the same behavior as before. But, now we don't
   adjust the window log (and hash and chain log) down when the
   source size is unknown.

When the source size is unknown all cdicts should attach, except
when the user disables attaching, or `forceWindow` is used. This
means that when streaming with a CDict we end up in the good case
where we get small CDict parameters, and large source parameters.

TODO: Add a streaming + dictionary regression test case.

lib/compress/zstd_compress.c
tests/regression/results.csv

index 3cebbe173369fec8baddacff2b3b6a502ff2b040..c0628d832356660873f43f5103b0d1e158a0baf4 100644 (file)
@@ -1188,15 +1188,30 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
     assert(ZSTD_checkCParams(cPar)==0);
 
-    if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
-        srcSize = minSrcSize;
-
     switch (mode) {
     case ZSTD_cpm_noAttachDict:
+        /* If we don't know the source size, don't make any
+         * assumptions about it. We will already have selected
+         * smaller parameters if a dictionary is in use.
+         */
+        break;
     case ZSTD_cpm_unknown:
+        /* Keep the legacy behavior of assuming small source
+         * sizes when the cparam mode is unkown.
+         */
+        /* fall-through */
     case ZSTD_cpm_createCDict:
+        /* Assume a small source size when creating a dictionary
+         * with an unkown source size.
+         */
+        if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+            srcSize = minSrcSize;
         break;
     case ZSTD_cpm_attachDict:
+        /* Dictionary has its own dedicated parameters which have
+         * already been selected. We are selecting parameters
+         * for only the source.
+         */
         dictSize = 0;
         break;
     default:
@@ -1213,7 +1228,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
                             ZSTD_highbit32(tSize-1) + 1;
         if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
     }
-    {   U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
+    if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+        U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
         U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
         if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1;
         if (cycleLog > dictAndWindowLog)
@@ -2955,9 +2971,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
 }
 
 /* ZSTD_writeSkippableFrame_advanced() :
- * Writes out a skippable frame with the specified magic number variant (16 are supported), 
+ * Writes out a skippable frame with the specified magic number variant (16 are supported),
  * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data.
- * 
+ *
  * Returns the total number of bytes written, or a ZSTD error code.
  */
 size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
index 587b6afbe452f588d1d284db2a55c710d47a7542..0f818149b9fc1b0193687f5cd3af9e6809cdd0d1 100644 (file)
@@ -216,7 +216,7 @@ github.tar,                         level 16 with dict,                 zstdcli,
 github.tar,                         level 19,                           zstdcli,                            32841
 github.tar,                         level 19 with dict,                 zstdcli,                            32899
 github.tar,                         no source size,                     zstdcli,                            38442
-github.tar,                         no source size with dict,           zstdcli,                            213380
+github.tar,                         no source size with dict,           zstdcli,                            38004
 github.tar,                         long distance mode,                 zstdcli,                            39680
 github.tar,                         multithreaded,                      zstdcli,                            38445
 github.tar,                         multithreaded long distance mode,   zstdcli,                            39680
@@ -618,7 +618,7 @@ github.tar,                         level 16 with dict,                 advanced
 github.tar,                         level 19,                           advanced streaming,                 32837
 github.tar,                         level 19 with dict,                 advanced streaming,                 32895
 github.tar,                         no source size,                     advanced streaming,                 38438
-github.tar,                         no source size with dict,           advanced streaming,                 214381
+github.tar,                         no source size with dict,           advanced streaming,                 38000
 github.tar,                         long distance mode,                 advanced streaming,                 39676
 github.tar,                         multithreaded,                      advanced streaming,                 38441
 github.tar,                         multithreaded long distance mode,   advanced streaming,                 39676
@@ -695,7 +695,7 @@ github,                             level 16 with dict,                 old stre
 github,                             level 19,                           old streaming,                      134064
 github,                             level 19 with dict,                 old streaming,                      37576
 github,                             no source size,                     old streaming,                      140632
-github,                             no source size with dict,           old streaming,                      40678
+github,                             no source size with dict,           old streaming,                      40654
 github,                             uncompressed literals,              old streaming,                      136335
 github,                             uncompressed literals optimal,      old streaming,                      134064
 github,                             huffman literals,                   old streaming,                      175568
@@ -728,7 +728,7 @@ github.tar,                         level 16 with dict,                 old stre
 github.tar,                         level 19,                           old streaming,                      32837
 github.tar,                         level 19 with dict,                 old streaming,                      32895
 github.tar,                         no source size,                     old streaming,                      38438
-github.tar,                         no source size with dict,           old streaming,                      214384
+github.tar,                         no source size with dict,           old streaming,                      38000
 github.tar,                         uncompressed literals,              old streaming,                      38441
 github.tar,                         uncompressed literals optimal,      old streaming,                      32837
 github.tar,                         huffman literals,                   old streaming,                      42465
@@ -813,7 +813,7 @@ github,                             level 16 with dict,                 old stre
 github,                             level 19,                           old streaming advanced,             134064
 github,                             level 19 with dict,                 old streaming advanced,             37576
 github,                             no source size,                     old streaming advanced,             140632
-github,                             no source size with dict,           old streaming advanced,             40643
+github,                             no source size with dict,           old streaming advanced,             40608
 github,                             long distance mode,                 old streaming advanced,             141104
 github,                             multithreaded,                      old streaming advanced,             141104
 github,                             multithreaded long distance mode,   old streaming advanced,             141104
@@ -854,7 +854,7 @@ github.tar,                         level 16 with dict,                 old stre
 github.tar,                         level 19,                           old streaming advanced,             32837
 github.tar,                         level 19 with dict,                 old streaming advanced,             32876
 github.tar,                         no source size,                     old streaming advanced,             38438
-github.tar,                         no source size with dict,           old streaming advanced,             214384
+github.tar,                         no source size with dict,           old streaming advanced,             38015
 github.tar,                         long distance mode,                 old streaming advanced,             38441
 github.tar,                         multithreaded,                      old streaming advanced,             38441
 github.tar,                         multithreaded long distance mode,   old streaming advanced,             38441
@@ -880,7 +880,7 @@ github,                             level 9 with dict,                  old stre
 github,                             level 13 with dict,                 old streaming cdcit,                39743
 github,                             level 16 with dict,                 old streaming cdcit,                37577
 github,                             level 19 with dict,                 old streaming cdcit,                37576
-github,                             no source size with dict,           old streaming cdcit,                40678
+github,                             no source size with dict,           old streaming cdcit,                40654
 github.tar,                         level -5 with dict,                 old streaming cdcit,                45018
 github.tar,                         level -3 with dict,                 old streaming cdcit,                41886
 github.tar,                         level -1 with dict,                 old streaming cdcit,                41636
@@ -895,7 +895,7 @@ github.tar,                         level 9 with dict,                  old stre
 github.tar,                         level 13 with dict,                 old streaming cdcit,                36372
 github.tar,                         level 16 with dict,                 old streaming cdcit,                39353
 github.tar,                         level 19 with dict,                 old streaming cdcit,                32676
-github.tar,                         no source size with dict,           old streaming cdcit,                214384
+github.tar,                         no source size with dict,           old streaming cdcit,                38000
 github,                             level -5 with dict,                 old streaming advanced cdict,       49562
 github,                             level -3 with dict,                 old streaming advanced cdict,       44956
 github,                             level -1 with dict,                 old streaming advanced cdict,       42383
@@ -910,7 +910,7 @@ github,                             level 9 with dict,                  old stre
 github,                             level 13 with dict,                 old streaming advanced cdict,       39731
 github,                             level 16 with dict,                 old streaming advanced cdict,       40789
 github,                             level 19 with dict,                 old streaming advanced cdict,       37576
-github,                             no source size with dict,           old streaming advanced cdict,       40643
+github,                             no source size with dict,           old streaming advanced cdict,       40608
 github.tar,                         level -5 with dict,                 old streaming advanced cdict,       44307
 github.tar,                         level -3 with dict,                 old streaming advanced cdict,       41359
 github.tar,                         level -1 with dict,                 old streaming advanced cdict,       41322
@@ -925,4 +925,4 @@ github.tar,                         level 9 with dict,                  old stre
 github.tar,                         level 13 with dict,                 old streaming advanced cdict,       36035
 github.tar,                         level 16 with dict,                 old streaming advanced cdict,       38736
 github.tar,                         level 19 with dict,                 old streaming advanced cdict,       32876
-github.tar,                         no source size with dict,           old streaming advanced cdict,       214384
+github.tar,                         no source size with dict,           old streaming advanced cdict,       38015