strengthened streaming bufferless compression

author Yann Collet <yann.collet.73@gmail.com>

Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)

committer Yann Collet <yann.collet.73@gmail.com>

Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)
author Yann Collet <yann.collet.73@gmail.com>
Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)
committer Yann Collet <yann.collet.73@gmail.com>
Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)
diff --git a/Makefile b/Makefile

index 4ee569e164c329a68a7739b25547fc42da67549c..9e7b70ee094fd3fa5bc3607719828bd5d1dc8f57 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -87,8 +87,8 @@ gpptest: clean
         $(MAKE) all CC=g++ CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror"
  
  armtest: clean
-       $(MAKE) -C $(ZSTDDIR) -e all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
-       $(MAKE) -C $(PRGDIR) -e CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
+       $(MAKE) -C $(ZSTDDIR) all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
+       $(MAKE) -C $(PRGDIR) CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static"
  
  usan: clean
         $(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=undefined"
diff --git a/NEWS b/NEWS

index 33b8dfc6aa65ab5d381720b86189ea6615eebb91..ff276ac5fc7a43e0d4f84817a42cd33b1bd038a2 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,6 @@
  v0.4.3 :
  new : zstd-frugal
+new : external dictionary API
  
  v0.4.2 :
  Generic minor improvements for small blocks
diff --git a/lib/zstd.h b/lib/zstd.h

index 15fc626339db3fdbf318a84114b36efaaa455747..e4d441497937c1e3c7c57cf0d2131be01b96ad41 100644 (file)
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -48,7 +48,7 @@ extern "C" {
  ***************************************/
  #define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
  #define ZSTD_VERSION_MINOR    4    /* for new (non-breaking) interface capabilities */
-#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_RELEASE  3    /* for tweaks, bug-fixes, or development */
  #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
  unsigned ZSTD_versionNumber (void);
  
diff --git a/lib/zstd_compress.c b/lib/zstd_compress.c

index 6304e2b6e8ddc7a87c5a26ad05c7652c8c54293a..6d2fd2cb5dfe48af32c6b06f06eab881c6f8977a 100644 (file)
--- a/lib/zstd_compress.c
+++ b/lib/zstd_compress.c
@@ -488,7 +488,7 @@ size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
              BYTE litLength = llTable[i];                                    /* (7)*/  /* (7)*/
              FSE_encodeSymbol(&blockStream, &stateMatchLength, matchLength); /* 17 */  /* 17 */
              if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
-            BIT_addBits(&blockStream, offset, nbBits);                      /* 32 */  /* 42 */
+            BIT_addBits(&blockStream, offset, nbBits);                      /* 31 */  /* 42 */   /* 24 bits max in 32-bits mode */
              if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
              FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
              FSE_encodeSymbol(&blockStream, &stateLitLength, litLength);     /* 26 */  /* 61 */
@@ -730,13 +730,30 @@ static size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
  *  Fast Scan
  ***************************************/
  
+#define FILLHASHSTEP 3   /* distance, in bytes, between two consecutive positions inserted by ZSTD_fillHashTable() */
+static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)   /* fills zc->hashTable with positions up to `end`; zc->nextToUpdate is read, not modified */
+{
+    U32* const hashTable = zc->hashTable;
+    const U32 hBits = zc->params.hashLog;
+    const BYTE* const base = zc->base;
+    const BYTE* ip = base + zc->nextToUpdate;   /* first position to insert */
+    const BYTE* const iend = (const BYTE*) end;
+    /* from zc->nextToUpdate up to `end` (included), one position every FILLHASHSTEP bytes : store its offset from base into the cell selected by ZSTD_hashPtr() */
+    while(ip <= iend)
+    {
+        hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);   /* a later position landing in the same cell overwrites the earlier one */
+        ip += FILLHASHSTEP;
+    }
+}
+
+
  FORCE_INLINE
  size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
                                         void* dst, size_t maxDstSize,
                                   const void* src, size_t srcSize,
                                   const U32 mls)
  {
-    U32* hashTable = zc->hashTable;
+    U32* const hashTable = zc->hashTable;
      const U32 hBits = zc->params.hashLog;
      seqStore_t* seqStorePtr = &(zc->seqStore);
      const BYTE* const base = zc->base;
@@ -1973,10 +1990,24 @@ size_t ZSTD_compressContinue (ZSTD_CCtx* zc,
                                   void* dst, size_t dstSize,
                             const void* src, size_t srcSize)
  {
+    U32 adressOverflow = 0;
      const BYTE* const ip = (const BYTE*) src;
  
+    /* Check if blocks follow each other */
+    if (src != zc->nextSrc)
+    {
+        /* not contiguous */
+        size_t delta = zc->nextSrc - ip;
+        zc->lowLimit = zc->dictLimit;
+        zc->dictLimit = (U32)(zc->nextSrc - zc->base);
+        zc->dictBase = zc->base;
+        if ((size_t)zc->base < delta) adressOverflow = zc->lowLimit;
+        zc->base -= delta;
+        zc->nextToUpdate = zc->dictLimit;
+    }
+
      /* preemptive overflow correction */
-    if ((zc->base > (const BYTE*)src) || (zc->lowLimit > (1<<30) ))
+    if (adressOverflow || (zc->lowLimit > (1<<30) ))
      {
          U32 correction = zc->lowLimit-1;
          ZSTD_reduceIndex(zc, correction);
@@ -1988,17 +2019,6 @@ size_t ZSTD_compressContinue (ZSTD_CCtx* zc,
          else zc->nextToUpdate -= correction;
      }
  
-    /* Check if blocks follow each other */
-    if (src != zc->nextSrc)
-    {
-        /* not contiguous */
-        zc->lowLimit = zc->dictLimit;
-        zc->dictLimit = (U32)(zc->nextSrc - zc->base);
-        zc->dictBase = zc->base;
-        zc->base += ip - zc->nextSrc;
-        zc->nextToUpdate = zc->dictLimit;
-    }
-
      /* input-dictionary overlap */
      if ((ip+srcSize > zc->dictBase + zc->lowLimit) && (ip < zc->dictBase + zc->dictLimit))
      {
@@ -2011,8 +2031,46 @@ size_t ZSTD_compressContinue (ZSTD_CCtx* zc,
      return ZSTD_compress_generic (zc, dst, dstSize, src, srcSize);
  }
  
+size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* src, size_t srcSize)   /* makes src the current prefix and indexes it; @return : 0, or ERROR(GENERIC) on unknown strategy */
+{
+    const BYTE* const ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+
+    /* input becomes current prefix */
+    zc->lowLimit = zc->dictLimit;
+    zc->dictLimit = (U32)(zc->nextSrc - zc->base);   /* index reached by the data seen so far */
+    zc->dictBase = zc->base;   /* previous data stays addressable through dictBase */
+    zc->base += ip - zc->nextSrc;   /* shift base so that src starts exactly at index dictLimit */
+    zc->nextToUpdate = zc->dictLimit;
+    /* src itself is not copied by this function, only pointers and indexes are updated */
+    zc->nextSrc = iend;
+    if (srcSize <= 8) return 0;   /* too small : nothing gets indexed (the calls below stop at iend-8) */
+    /* index the content of src, using the function selected by the strategy */
+    switch(zc->params.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable (zc, iend-8, zc->params.searchLength);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+        ZSTD_insertAndFindFirstIndex (zc, iend-8, zc->params.searchLength);   /* NOTE(review): presumably inserts positions into the hash chain - helper not visible here, confirm */
+        break;
+
+    case ZSTD_btlazy2:
+        ZSTD_updateTree(zc, iend-8, iend, 1 << zc->params.searchLog, zc->params.searchLength);   /* NOTE(review): presumably inserts positions into the binary tree - helper not visible here, confirm */
+        break;
+
+    default:
+        return ERROR(GENERIC);   /* strategy doesn't exist; impossible */
+    }
+
+    return 0;
+}
+
  
-/** ZSTD_compressBegin_advanced
+/*! ZSTD_compressBegin_advanced
  *   Write frame header, according to params
  *   @return : nb of bytes written */
  size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* ctx,
diff --git a/lib/zstd_decompress.c b/lib/zstd_decompress.c

index 8940969f6a586b93e6a181671e76ce6ff8376cfc..fe4ae972d3334139151884aeca0b42eb5066cb48 100644 (file)
--- a/lib/zstd_decompress.c
+++ b/lib/zstd_decompress.c
@@ -127,10 +127,10 @@ struct ZSTD_DCtx_s
      U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
      U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
      U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
-    void* previousDstEnd;
-    void* base;
-    void* vBase;
-    void* dictEnd;
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
      size_t expected;
      size_t headerSize;
      ZSTD_parameters params;
@@ -141,7 +141,7 @@ struct ZSTD_DCtx_s
      size_t litSize;
      BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
      BYTE headerBuffer[ZSTD_frameHeaderSize_max];
-};   /* typedef'd to ZSTD_Dctx within "zstd_static.h" */
+};  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
  
  size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
  {
@@ -505,7 +505,7 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
  FORCE_INLINE size_t ZSTD_execSequence(BYTE* op,
                                  BYTE* const oend, seq_t sequence,
                                  const BYTE** litPtr, const BYTE* const litLimit_8,
-                                BYTE* const base, BYTE* const vBase, BYTE* const dictEnd)
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
  {
      static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
      static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
@@ -607,9 +607,9 @@ static size_t ZSTD_decompressSequences(
      U32* DTableLL = dctx->LLTable;
      U32* DTableML = dctx->MLTable;
      U32* DTableOffb = dctx->OffTable;
-    BYTE* const base = (BYTE*) (dctx->base);
-    BYTE* const vBase = (BYTE*) (dctx->vBase);
-    BYTE* const dictEnd = (BYTE*) (dctx->dictEnd);
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
  
      /* Build Decoding Tables */
      errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
@@ -691,7 +691,7 @@ size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const v
  
  
      /* init */
-    ctx->base = ctx->vBase = ctx->dictEnd = dst;
+    ctx->vBase = ctx->base = ctx->dictEnd = dst;
  
      /* Frame Header */
      {
@@ -776,7 +776,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
          if ((dst > ctx->base) && (dst < ctx->previousDstEnd))   /* rolling buffer : new segment into dictionary */
              ctx->base = (char*)dst;   /* temporary affectation, for vBase calculation */
          ctx->dictEnd = ctx->previousDstEnd;
-        ctx->vBase = (char*)dst - ((char*)(ctx->previousDstEnd) - (char*)(ctx->base));
+        ctx->vBase = (const char*)dst - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
          ctx->base = dst;
          ctx->previousDstEnd = dst;
      }
@@ -827,10 +827,9 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
                  ctx->bType = bp.blockType;
                  ctx->stage = ZSTDds_decompressBlock;
              }
-
              return 0;
          }
-    case 3:
+    case ZSTDds_decompressBlock:
          {
              /* Decompress : block content */
              size_t rSize;
@@ -862,3 +861,10 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
  }
  
  
+void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize)   /* registers src as the current segment of ctx; src is not copied, only pointers are updated */
+{
+    ctx->dictEnd = ctx->previousDstEnd;   /* end of the segment known so far */
+    ctx->vBase = (const char*)src - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));   /* src, moved back by the size of the segment known so far */
+    ctx->base = src;   /* the dictionary becomes the current segment */
+    ctx->previousDstEnd = (const char*)src + srcSize;   /* one past the last byte of the dictionary */
+}
diff --git a/lib/zstd_static.h b/lib/zstd_static.h

index f698153ec9842e0cc5ed3012824e9333597fe572..5c315624879c4084b10c77ff5e6c0651332c0dd2 100644 (file)
--- a/lib/zstd_static.h
+++ b/lib/zstd_static.h
@@ -104,6 +104,8 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
  ****************************************/
  size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, int compressionLevel);
  size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, ZSTD_parameters params);
+size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+
  size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
  size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize);
  
@@ -118,6 +120,10 @@ size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize);
    Use ZSTD_compressBegin().
    You may also prefer the advanced derivative ZSTD_compressBegin_advanced(), for finer parameter control.
  
+  It's then possible to add a dictionary with ZSTD_compress_insertDictionary().
+  Note that dictionary presence is "hidden" information :
+  the decoder needs to be aware that it is required for proper decoding, or decoding will fail.
+
    Then, consume your input using ZSTD_compressContinue().
    The interface is synchronous, so all input will be consumed.
    You must ensure there is enough space in destination buffer to store compressed data under worst case scenario.
@@ -131,12 +137,15 @@ size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize);
  
  typedef struct ZSTD_DCtx_s ZSTD_DCtx;
  ZSTD_DCtx* ZSTD_createDCtx(void);
-size_t     ZSTD_resetDCtx(ZSTD_DCtx* dctx);
  size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
  
+size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
  size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
+void   ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
+
  size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
  size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+
  /**
    Streaming decompression, bufferless mode
  
@@ -146,15 +155,17 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, co
  
    First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
    This function doesn't consume its input. It needs enough input data to properly decode the frame header.
-  The objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
+  Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
    Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
             >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
             errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
  
+  Then, you can optionally insert a dictionary. This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+
    Then it's possible to start decompression.
    Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
    ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
-  ZSTD_decompressContinue() requires this exact amount of bytes, or just fails.
+  ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
    ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
    They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
  
diff --git a/programs/fuzzer.c b/programs/fuzzer.c

index 793af7f9cc3601055b8c07dada3c6e6a767bddf6..62f58f10abd154542eabfd38b0495b49f893a620 100644 (file)
--- a/programs/fuzzer.c
+++ b/programs/fuzzer.c
@@ -294,9 +294,10 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
      /* test loop */
      for ( ; testNb <= nbTests; testNb++ )
      {
-        size_t sampleSize, sampleStart;
-        size_t cSize, dSize, dSupSize;
-        U32 sampleSizeLog, buffNb, cLevelMod;
+        size_t sampleSize, sampleStart, maxTestSize, totalTestSize;
+        size_t cSize, dSize, dSupSize, errorCode;
+        U32 sampleSizeLog, buffNb, cLevelMod, nbChunks, n;
+        XXH64_state_t crc64;
          U64 crcOrig, crcDest;
          int cLevel;
          BYTE* sampleBuffer;
@@ -342,7 +343,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
          /* compression failure test : too small dest buffer */
          if (cSize > 3)
          {
-            size_t errorCode;
              const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1;   /* no problem, as cSize > 4 (frameHeaderSizer) */
              const size_t tooSmallSize = cSize - missing;
              static const U32 endMark = 0x4DC2B1A9;
@@ -365,7 +365,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
  
          /* truncated src decompression test */
          {
-            size_t errorCode;
              const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1;   /* no problem, as cSize > 4 (frameHeaderSizer) */
              const size_t tooSmallSize = cSize - missing;
              void* cBufferTooSmall = malloc(tooSmallSize);   /* valgrind will catch overflows */
@@ -379,7 +378,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
          /* too small dst decompression test */
          if (sampleSize > 3)
          {
-            size_t errorCode;
              const size_t missing = (FUZ_rand(&lseed) % (sampleSize-2)) + 1;   /* no problem, as cSize > 4 (frameHeaderSizer) */
              const size_t tooSmallSize = sampleSize - missing;
              static const BYTE token = 0xA9;
@@ -424,7 +422,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
                  U32 noiseSrc = FUZ_rand(&lseed) % 5;
                  const U32 endMark = 0xA9B1C3D6;
                  U32 endCheck;
-                size_t errorCode;
                  srcBuffer = cNoiseBuffer[noiseSrc];
                  memcpy(dstBuffer+sampleSize, &endMark, 4);
                  errorCode = ZSTD_decompress(dstBuffer, sampleSize, cBuffer, cSize);
@@ -435,6 +432,39 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
                  CHECK(endMark!=endCheck, "ZSTD_decompress on noisy src : dst buffer overflow");
              }
          }
+
+        /* Multi - segments compression test */
+        XXH64_reset(&crc64, 0);
+        nbChunks = (FUZ_rand(&lseed) & 127) + 2;
+        sampleSizeLog = FUZ_rand(&lseed) % maxSrcLog;
+        maxTestSize = (size_t)1 << sampleSizeLog;
+        maxTestSize += FUZ_rand(&lseed) & (maxTestSize-1);
+        totalTestSize = 0;
+        cSize = ZSTD_compressBegin(ctx, cBuffer, cBufferSize, (FUZ_rand(&lseed) % (20 - (sampleSizeLog/3))) + 1);
+        for (n=0; n<nbChunks; n++)
+        {
+            sampleSizeLog = FUZ_rand(&lseed) % maxSampleLog;
+            sampleSize = (size_t)1 << sampleSizeLog;
+            sampleSize += FUZ_rand(&lseed) & (sampleSize-1);
+            sampleStart = FUZ_rand(&lseed) % (srcBufferSize - sampleSize);
+
+            if (cBufferSize-cSize < ZSTD_compressBound(sampleSize))
+                /* avoid invalid dstBufferTooSmall */
+                break;
+
+            errorCode = ZSTD_compressContinue(ctx, cBuffer+cSize, cBufferSize-cSize, srcBuffer+sampleStart, sampleSize);
+            CHECK (ZSTD_isError(errorCode), "multi-segments compression error : %s", ZSTD_getErrorName(errorCode));
+            cSize += errorCode;
+
+            XXH64_update(&crc64, srcBuffer+sampleStart, sampleSize);
+            totalTestSize += sampleSize;
+
+            if (totalTestSize > maxTestSize) break;
+        }
+        errorCode = ZSTD_compressEnd(ctx, cBuffer+cSize, cBufferSize-cSize);
+        CHECK (ZSTD_isError(errorCode), "multi-segments epilogue error : %s", ZSTD_getErrorName(errorCode));
+        cSize += errorCode;
+        crcOrig = XXH64_digest(&crc64);
      }
      DISPLAY("\rAll fuzzer tests completed   \n");
author	Yann Collet <yann.collet.73@gmail.com>
	Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)
committer	Yann Collet <yann.collet.73@gmail.com>
	Fri, 4 Dec 2015 16:16:37 +0000 (17:16 +0100)
Makefile		patch \| blob \| blame \| history
NEWS		patch \| blob \| blame \| history
lib/zstd.h		patch \| blob \| blame \| history
lib/zstd_compress.c		patch \| blob \| blame \| history
lib/zstd_decompress.c		patch \| blob \| blame \| history
lib/zstd_static.h		patch \| blob \| blame \| history
programs/fuzzer.c		patch \| blob \| blame \| history