git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Fixed end of match boundary update issues
author senhuang42 <senhuang96@fb.com>
Tue, 29 Sep 2020 16:12:58 +0000 (12:12 -0400)
committer senhuang42 <senhuang96@fb.com>
Wed, 7 Oct 2020 17:56:25 +0000 (13:56 -0400)
lib/compress/zstd_compress.c
lib/compress/zstd_compress_internal.h
lib/compress/zstd_ldm.c
lib/compress/zstd_opt.c
lib/compress/zstdmt_compress.c

index 037b5360aac7ac9bd9bb1ff784408610a7988da1..3c80ab8602743634c21f5ac996faddba3f1b019b 100644 (file)
@@ -2336,7 +2336,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         if (curr > ms->nextToUpdate + 384)
             ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384));
     }
-    printf("--NEW BLOCK--\n");
+    //printf("--NEW BLOCK--\n");
     /* select and store sequences */
     {   ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
         size_t lastLLSize;
@@ -2354,7 +2354,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                        src, srcSize);
             assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
         } else if (zc->appliedParams.ldmParams.enableLdm) {
-            rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
+            rawSeqStore_t ldmSeqStore = {NULL, NULL, 0, 0, 0};
 
             ldmSeqStore.seq = zc->ldmSequences;
             ldmSeqStore.capacity = zc->maxNbLdmSequences;
index 524cbb02566d016861c2f2b4115058e3681920cf..53c20b12ccb13544bb7dd3f303c07ae5c164f898 100644 (file)
@@ -94,6 +94,7 @@ typedef struct {
 
 typedef struct {
   rawSeq* seq;     /* The start of the sequences */
+  BYTE const* base;
   size_t pos;      /* The position where reading stopped. <= size. */
   size_t size;     /* The number of sequences. <= capacity. */
   size_t capacity; /* The capacity starting from `seq` pointer */
index e31c6838d533910901c323bdb2b2623ed48e28d8..9fcb9fc298fcf4beceeb748e430e0b37036e6413 100644 (file)
@@ -563,9 +563,9 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
 }
 
 static void printSeqStore(rawSeqStore_t* rawSeqStore) {
-    printf("rawSeqStore: pos: %zu, bytesDiscarded: %zu\n", rawSeqStore->pos);
+    printf("rawSeqStore: pos: %zu\n", rawSeqStore->pos);
     for (int i = 0; i < rawSeqStore->size; ++i) {
-        printf("(of:%u ml:%u ll: %u)\n", rawSeqStore->seq[i].offset, rawSeqStore->seq[i].matchLength, rawSeqStore->seq[i].litLength);
+        printf("pos %d (of:%u ml:%u ll: %u)\n", i, rawSeqStore->seq[i].offset, rawSeqStore->seq[i].matchLength, rawSeqStore->seq[i].litLength);
     }
 }
 
@@ -582,20 +582,19 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
     BYTE const* const iend = istart + srcSize;
     /* Input positions */
     BYTE const* ip = istart;
-
+    //printSeqStore(rawSeqStore);
     if (cParams->strategy >= ZSTD_btopt) {
         size_t lastLLSize;
-        printSeqStore(rawSeqStore);
+        //printSeqStore(rawSeqStore);
         ms->ldmSeqStore = *rawSeqStore; /* copy current seqStore */
-        const BYTE* const prevBase = (BYTE const*)ms->window.base;
+        ms->ldmSeqStore.base = ms->window.base;
         lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize);
-        rawSeqStore->pos = ms->ldmSeqStore.pos;
-        ms->ldmSeqStore = *rawSeqStore;
-        if (prevBase != ms->window.base) {
+        *rawSeqStore = ms->ldmSeqStore;
+        /*if (prevBase != ms->window.base) {
             int baseDiff = (int)(prevBase - ms->window.base);
             printf("Bases were different, adjusting, diff = %d\n", baseDiff);
             rawSeqStore->seq[rawSeqStore->pos].litLength += baseDiff;
-        }
+        }*/
         return lastLLSize;
     }
 
index bc6436a1acba3e22ed4ae8261ef62856ef2602b4..05fa4cae7225967631cf3c78fec97b3737345666 100644 (file)
@@ -800,85 +800,85 @@ static void ldm_voidSequences(rawSeqStore_t* ldmSeqStore, U32 overshotBytes) {
 
 static void ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
     while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
-        printf("ldm_skipSequences(): %u remaining\n", srcSize);
+        ////printf("ldm_skipSequences(): %u remaining\n", srcSize);
         rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
-        printf("ldm_skipSequences(): before: (of: %u, ml: %u, ll: %u)\n", seq->offset, seq->matchLength, seq->litLength);
+        ////printf("ldm_skipSequences(): before: (of: %u, ml: %u, ll: %u)\n", seq->offset, seq->matchLength, seq->litLength);
         if (srcSize <= seq->litLength) {
             /* Skip past srcSize literals */
             seq->litLength -= (U32)srcSize;
-            printf("ldm_skipSequences(): final: (of: %u, ml: %u, ll: %u)\n", seq->offset, seq->matchLength, seq->litLength);
+            ////printf("ldm_skipSequences(): seqstore final: (of: %u, ml: %u, ll: %u) at %u\n", seq->offset, seq->matchLength, seq->litLength, rawSeqStore->pos);
             return;
         }
         srcSize -= seq->litLength;
         seq->litLength = 0;
         if (srcSize < seq->matchLength) {
-            printf("Splitting match: ml curr: %u ", seq->matchLength);
+            ////printf("Splitting match: ml curr: %u ", seq->matchLength);
             /* Skip past the first srcSize of the match */
             seq->matchLength -= (U32)srcSize;
-            printf("ml in store left: %u ", seq->matchLength);
-            if (seq->matchLength < minMatch) {
-                /* The match is too short, omit it */
+            ////printf("ml in store left: %u ", seq->matchLength);
+            /*if (seq->matchLength < minMatch) {
+                // The match is too short, omit it
                 if (rawSeqStore->pos + 1 < rawSeqStore->size) {
                     seq[1].litLength += seq[0].matchLength;
                 }
                 rawSeqStore->pos++;
-            }
-            printf("ldm_skipSequences(): final: (of: %u, ml: %u, ll: %u)\n", seq->offset, seq->matchLength, seq->litLength);
+            }*/
+            //printf("ldm_skipSequences(): seqstore final: (of: %u, ml: %u, ll: %u) at %u\n", seq->offset, seq->matchLength, seq->litLength, rawSeqStore->pos);
             return;
         }
         srcSize -= seq->matchLength;
         seq->matchLength = 0;
-        printf("ldm_skipSequences(): final: (of: %u, ml: %u, ll: %u)\n", seq->offset, seq->matchLength, seq->litLength);
+        //printf("ldm_skipSequences(): seqstore final: (of: %u, ml: %u, ll: %u) at %u\n", seq->offset, seq->matchLength, seq->litLength, rawSeqStore->pos);
         rawSeqStore->pos++;
     }
 }
 
 // The only function that can update pos (i think, for now)
-static rawSeq ldm_splitSequence(rawSeqStore_t* ldmSeqStore, U32 remainingBytes) {
+static rawSeq ldm_splitSequence(rawSeqStore_t* ldmSeqStore, U32* ldmSeqStoreBytesConsumed, U32 remainingBytes) {
     rawSeq currSeq = ldmSeqStore->seq[ldmSeqStore->pos];
-    printf("Current sequence: (of: %u, ml: %u, ll: %u)\n", currSeq.offset, currSeq.matchLength, currSeq.litLength);
-    printf("Split Sequence with remaining = %u : ", remainingBytes);
+    //printf("ldm_splitSequence(): Current sequence: (of: %u, ml: %u, ll: %u) - remaining: %u\n", currSeq.offset, currSeq.matchLength, currSeq.litLength, remainingBytes);
     /* No split */
     if (remainingBytes >= currSeq.litLength + currSeq.matchLength) {
-        printf("NO SPLIT\n");
+        ////printf("NO SPLIT\n");
+        *ldmSeqStoreBytesConsumed += currSeq.litLength + currSeq.matchLength;
         ldmSeqStore->pos++;
-        printf("pos is now: %u\n", ldmSeqStore->pos);
+        ////printf("pos is now: %u\n", ldmSeqStore->pos);
         return currSeq;
     }
     /* Need a split */
     if (remainingBytes <= currSeq.litLength) {
-        printf("SPLITTING: all remaining bytes were literals");
+        ////printf("SPLITTING: all remaining bytes were literals");
         currSeq.offset = 0;
     } else if (remainingBytes < currSeq.litLength + currSeq.matchLength) {
-        if (currSeq.matchLength < MINMATCH) {
-            printf("CurrSeq less than minmatch: all remaining bytes were literals");
+        /*if (currSeq.matchLength < MINMATCH) {
+            ////printf("CurrSeq less than minmatch: all remaining bytes were literals");
             currSeq.offset = 0;
-        }
+        }*/
         currSeq.matchLength = remainingBytes - currSeq.litLength;
     }
-    printf("\n");
-
+    ////printf("\n");
+    *ldmSeqStoreBytesConsumed += remainingBytes;
     ldm_skipSequences(ldmSeqStore, remainingBytes, MINMATCH);
-    printf("Sequence final: (of: %u, ml: %u, ll: %u)\n", currSeq.offset, currSeq.matchLength, currSeq.litLength);
+    //printf("ldm_splitSequence(): Sequence final: (of: %u, ml: %u, ll: %u)\n", currSeq.offset, currSeq.matchLength, currSeq.litLength);
     return currSeq;
 }
 
 /* Returns 1 if the rest of the block is just LDM literals */
-static int ldm_getNextMatch(rawSeqStore_t* ldmSeqStore,
+static int ldm_getNextMatch(rawSeqStore_t* ldmSeqStore, U32* ldmSeqStoreBytesConsumed,
                             U32* matchStartPosInBlock, U32* matchEndPosInBlock,
                             U32* matchOffset, U32 currPosInBlock,
                             U32 remainingBytes, U32 sbi) {
     if (ldmSeqStore->pos >= ldmSeqStore->size) {
         // Don't use the LDM for the rest of the block (there is none)
-        printf("No ldm left in the block, pos max reached\n");
+        //printf("No ldm left in the block, pos max reached\n");
         *matchStartPosInBlock = UINT32_MAX;
         *matchEndPosInBlock = UINT32_MAX;
         return 1;
     }
-    rawSeq seq = ldm_splitSequence(ldmSeqStore, remainingBytes);
+    rawSeq seq = ldm_splitSequence(ldmSeqStore, ldmSeqStoreBytesConsumed, remainingBytes);
     if (seq.offset == 0) {
         // Don't use the LDM for the rest of the block (there is none)
-        printf("No ldm left in the block, offset = 0\n");
+        ////printf("No ldm left in the block, offset = 0\n");
         *matchStartPosInBlock = UINT32_MAX;
         *matchEndPosInBlock = UINT32_MAX;
         return 1;
@@ -887,14 +887,14 @@ static int ldm_getNextMatch(rawSeqStore_t* ldmSeqStore,
     *matchStartPosInBlock = currPosInBlock + seq.litLength;
     *matchEndPosInBlock = *matchStartPosInBlock + seq.matchLength;
     *matchOffset = seq.offset;
-    printf("New match range in block is: (%u, %u) with of: %u where currPosInBlock: %u at adjusted absolute range: %u, %u\n", *matchStartPosInBlock, *matchEndPosInBlock, *matchOffset, currPosInBlock, *matchStartPosInBlock+sbi, *matchEndPosInBlock+sbi);
+    //printf("New match range in block is: (%u, %u) with of: %u where currPosInBlock: %u at adjusted absolute range: %u, %u\n", *matchStartPosInBlock, *matchEndPosInBlock, *matchOffset, currPosInBlock, *matchStartPosInBlock+sbi, *matchEndPosInBlock+sbi);
     return 0;
 }
 
 /* Adds an LDM if it's long enough */
 static void ldm_maybeAddLdm(ZSTD_match_t* matches, U32* nbMatches,
                             U32 matchStartPosInBlock, U32 matchEndPosInBlock,
-                            U32 matchOffset, U32 currPosInBlock, U32 curr, U32 sbi) {
+                            U32 matchOffset, U32 currPosInBlock, U32 curr, U32 sbi, rawSeqStore_t* rawSeqStore) {
     /* Check that current block position is not outside of the match */
     if (currPosInBlock < matchStartPosInBlock || currPosInBlock >= matchEndPosInBlock)
         return;
@@ -906,7 +906,8 @@ static void ldm_maybeAddLdm(ZSTD_match_t* matches, U32* nbMatches,
     U32 candidateOffCode = matchOffset + posDiff + ZSTD_REP_MOVE;
 
     if ((*nbMatches == 0 || candidateMatchLength >= matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM) {
-        printf("large enough: curr: %u currposinblock: %u (ofcode: %u, ml: %u)\n", curr, currPosInBlock, candidateOffCode, candidateMatchLength);
+        //printf("large enough: curr: %u currposinblock: %u (ofcode: %u, ml: %u) - range: (%u, %u) - (of: %u ml: %u ll: %u) @ pos: %u - \n",
+        //    curr, currPosInBlock, candidateOffCode, candidateMatchLength, matchStartPosInBlock, matchEndPosInBlock, rawSeqStore->seq[rawSeqStore->pos].offset, rawSeqStore->seq[rawSeqStore->pos].matchLength, rawSeqStore->seq[rawSeqStore->pos].litLength, rawSeqStore->pos);
 
         if (*nbMatches == 0) {
             matches[*nbMatches].len = candidateMatchLength;
@@ -917,15 +918,15 @@ static void ldm_maybeAddLdm(ZSTD_match_t* matches, U32* nbMatches,
                 U32 candidateMatchIdx = *nbMatches;
                 matches[*nbMatches].len = candidateMatchLength;
                 matches[*nbMatches].off = candidateOffCode;
-                printf("Sifting...: idx: %u, len: %u, off: %u\n", candidateMatchIdx, candidateMatchLength, candidateOffCode);
-                //printf("Current best is...: idx: %u, len: %u, off: %u\n", *nbMatches-1,  matches[*nbMatches-1].len, matches[*nbMatches-1].off);
+                ////printf("Sifting...: idx: %u, len: %u, off: %u\n", candidateMatchIdx, candidateMatchLength, candidateOffCode);
+                //////printf("Current best is...: idx: %u, len: %u, off: %u\n", *nbMatches-1,  matches[*nbMatches-1].len, matches[*nbMatches-1].off);
                 if (candidateOffCode != matches[*nbMatches].off)
-                    printf("DIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[*nbMatches-1].len, matches[*nbMatches-1].off);
-                //printf("Current best is...: idx: %u, len: %u, off: %u\n", *nbMatches-1,  matches[*nbMatches-1].len, matches[*nbMatches-1].off);
+                    ////printf("DIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[*nbMatches-1].len, matches[*nbMatches-1].off);
+                //////printf("Current best is...: idx: %u, len: %u, off: %u\n", *nbMatches-1,  matches[*nbMatches-1].len, matches[*nbMatches-1].off);
                 while (candidateMatchIdx > 0 &&
                        matches[candidateMatchIdx].off > matches[candidateMatchIdx - 1].off &&
                        matches[candidateMatchIdx].len == matches[candidateMatchIdx - 1].len) {
-                    //printf("Compared to: idx: %u, len: %u, off: %u", candidateMatchIdx - 1, matches[candidateMatchIdx - 1].len, matches[candidateMatchIdx - 1].off);
+                    //////printf("Compared to: idx: %u, len: %u, off: %u", candidateMatchIdx - 1, matches[candidateMatchIdx - 1].len, matches[candidateMatchIdx - 1].off);
                     ZSTD_match_t tmp = matches[candidateMatchIdx - 1];
                     matches[candidateMatchIdx - 1] = matches[candidateMatchIdx];
                     matches[candidateMatchIdx] = tmp;
@@ -933,7 +934,7 @@ static void ldm_maybeAddLdm(ZSTD_match_t* matches, U32* nbMatches,
                 }
                 (*nbMatches)++;
             } else {
-                printf("MATCHDIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[*nbMatches-1].len, matches[*nbMatches-1].off);
+                ////printf("MATCHDIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[*nbMatches-1].len, matches[*nbMatches-1].off);
                 matches[*nbMatches].len = candidateMatchLength;
                 matches[*nbMatches].off = candidateOffCode;
                 (*nbMatches)++;
@@ -943,21 +944,22 @@ static void ldm_maybeAddLdm(ZSTD_match_t* matches, U32* nbMatches,
 }
 
 /* Wrapper function to call ldm functions as needed */
-static void ldm_handleLdm(rawSeqStore_t* ldmSeqStore, ZSTD_match_t* matches, U32* nbMatches,
+static void ldm_handleLdm(rawSeqStore_t* ldmSeqStore, U32* ldmSeqStoreBytesConsumed, ZSTD_match_t* matches, U32* nbMatches,
                           U32* matchStartPosInBlock, U32* matchEndPosInBlock, U32* matchOffset,
                           U32 currPosInBlock, U32 remainingBytes, U32 curr, U32 sbi) {
+    ////printf("currPosInBlock: %u, curr: %u, called ldm_handleLdm()", currPosInBlock, curr);
     if (currPosInBlock >= *matchEndPosInBlock) {
-        printf("Went over match boundary: currPosInBlock: %u, %matchEndPosInBlock: %u\n", currPosInBlock, *matchEndPosInBlock);
+        //printf("Went over match boundary: currPosInBlock: %u, %matchEndPosInBlock: %u\n", currPosInBlock, *matchEndPosInBlock);
         if (currPosInBlock > *matchEndPosInBlock) {
             U32 posOvershoot = currPosInBlock - *matchEndPosInBlock;
-            printf("Overshot position by: %u\n", posOvershoot);
+            //printf("Overshot position by: %u\n", posOvershoot);
             ldm_skipSequences(ldmSeqStore, posOvershoot, MINMATCH);
         } 
-        int noMoreLdms = ldm_getNextMatch(ldmSeqStore, matchStartPosInBlock,
+        int noMoreLdms = ldm_getNextMatch(ldmSeqStore, ldmSeqStoreBytesConsumed, matchStartPosInBlock,
                                         matchEndPosInBlock, matchOffset,
                                         currPosInBlock, remainingBytes, sbi);
     }
-    ldm_maybeAddLdm(matches, nbMatches, *matchStartPosInBlock, *matchEndPosInBlock, *matchOffset, currPosInBlock, curr, sbi);
+    ldm_maybeAddLdm(matches, nbMatches, *matchStartPosInBlock, *matchEndPosInBlock, *matchOffset, currPosInBlock, curr, sbi, ldmSeqStore);
 }
 
 
@@ -1018,10 +1020,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     U32 ldmStartPosInBlock = 0;
     U32 ldmEndPosInBlock = 0;
     U32 ldmOffset = 0;
+    U32 ldmSeqStoreBytesConsumed = 0;
     
-    printf("SBI for this block: %u, base: %u\n", sbi, base);
+    ////printf("SBI for this block: %u, base: %u\n", sbi, base);
+    if (ms->ldmSeqStore.size > 0 && ms->ldmSeqStore.base != base) {
+        int baseDiff = (int)(ms->ldmSeqStore.base - base);
+        //printf("Bases were different, adjusting, diff = %d\n", baseDiff);
+        ms->ldmSeqStore.seq[ms->ldmSeqStore.pos].litLength += baseDiff;
+        ms->ldmSeqStore.base = ms->window.base;
+    }
     if (ms->ldmSeqStore.size != 0) {
-        ldm_getNextMatch(&ms->ldmSeqStore, &ldmStartPosInBlock,
+        ldm_getNextMatch(&ms->ldmSeqStore, &ldmSeqStoreBytesConsumed, &ldmStartPosInBlock,
                 &ldmEndPosInBlock, &ldmOffset, (U32)(ip-istart), (U32)(iend-ip), sbi);
     }
     /* init */
@@ -1040,7 +1049,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
             U32 const ll0 = !litlen;
             U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
             if (ms->ldmSeqStore.size != 0) {
-                ldm_handleLdm(&ms->ldmSeqStore, matches,
+                ldm_handleLdm(&ms->ldmSeqStore, &ldmSeqStoreBytesConsumed, matches,
                               &nbMatches, &ldmStartPosInBlock,
                               &ldmEndPosInBlock, &ldmOffset,
                               (U32)(ip-istart), (U32)(iend - ip), (U32)(ip-base), sbi);
@@ -1162,7 +1171,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
 
                 
                 if (ms->ldmSeqStore.size != 0) {
-                    ldm_handleLdm(&ms->ldmSeqStore, matches,
+                    ldm_handleLdm(&ms->ldmSeqStore, &ldmSeqStoreBytesConsumed, matches,
                             &nbMatches, &ldmStartPosInBlock,
                             &ldmEndPosInBlock, &ldmOffset,
                             (U32)(inr-istart), (U32)(iend-inr), (U32)(inr-base), sbi);
@@ -1281,6 +1290,13 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
         }
     }   /* while (ip < ilimit) */
 
+    int i = ms->ldmSeqStore.pos;
+    //printf("pos %d (of:%u ml:%u ll: %u)\n", i, ms->ldmSeqStore.seq[i].offset, ms->ldmSeqStore.seq[i].matchLength, ms->ldmSeqStore.seq[i].litLength);
+    //printf("matchend: %u bytesconsumed: %u\n", ldmEndPosInBlock, ldmSeqStoreBytesConsumed);
+    if (ldmEndPosInBlock < srcSize) {
+        //printf("Needs adjustment, endpos didn't reach end of block\n");
+        ldm_skipSequences(&ms->ldmSeqStore, srcSize - ldmEndPosInBlock, MINMATCH);
+    }
     /* Return the last literals size */
     return (size_t)(iend - anchor);
 }
index baf6ef4ca6da2f472191773d49f343ded5ee392f..57015f1561f50b1dd87d6c62471ba1b36ec3879f 100644 (file)
@@ -277,7 +277,7 @@ static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool)
 
 static rawSeqStore_t bufferToSeq(buffer_t buffer)
 {
-    rawSeqStore_t seq = {NULL, 0, 0, 0};
+    rawSeqStore_t seq = {NULL, NULL, 0, 0, 0};
     seq.seq = (rawSeq*)buffer.start;
     seq.capacity = buffer.capacity / sizeof(rawSeq);
     return seq;