From: Stella Lau Date: Thu, 20 Jul 2017 23:51:01 +0000 (-0700) Subject: Experiment with 64-bit hash insertion policy X-Git-Tag: v1.3.1^2~12^2~7^2~14 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0b8fb1703b39ae1f07aea1eaac46b97372d9238d;p=thirdparty%2Fzstd.git Experiment with 64-bit hash insertion policy --- diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 5119f464d..9dc33fae7 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated main-hf +all: main-circular-buffer main-integrated main-64 #main-basic : basic_table.c ldm.c main-ldm.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,7 +33,7 @@ all: main-circular-buffer main-integrated main-hf main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-hf: ldm_hf_test.c main-ldm.c +main-64: ldm_64_hash.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-integrated: ldm_with_table.c main-ldm.c @@ -41,6 +41,6 @@ main-integrated: ldm_with_table.c main-ldm.c clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-integrated main-hf + main-basic main-circular-buffer main-integrated main-64 @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 66107e069..fb6c19d2a 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -164,19 +164,19 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U32 *pForwardMatchLength, - U32 *pBackwardMatchLength) { + U64 *pForwardMatchLength, + U64 *pBackwardMatchLength) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; - U32 bestMatchLength = 0; + U64 bestMatchLength = 0; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + table->offsetBase; // Check checksum for faster check. if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { - U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); - U32 backwardMatchLength, totalMatchLength; + U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); + U64 backwardMatchLength, totalMatchLength; if (forwardMatchLength < table->minMatchLength) { continue; diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index ab2de7c1e..b018c4755 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -429,7 +429,7 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * matchLength contains the forward length of the match. */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *matchLength, U32 *backwardMatchLength) { + U64 *matchLength, U64 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -462,7 +462,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { int len = (int)literalLength - RUN_MASK; @@ -481,9 +481,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. */ @@ -495,7 +495,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, /* Encode the match length. */ if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; + U64 matchLengthRemaining = matchLength; *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; MEM_write32(cctx->op, 0xFFFFFFFF); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index c420e60c7..83cd36230 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -14,7 +14,7 @@ // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? #define LDM_MEMORY_USAGE 22 -#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 @@ -115,16 +115,16 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, * This is followed by literalLength bytes corresponding to the literals. */ void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength); + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength); /** * Write current block (literals, literal length, match offset, * match length). */ void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength); + const U64 matchLength); /** * Decompresses src into dst. diff --git a/contrib/long_distance_matching/ldm_hf_test.c b/contrib/long_distance_matching/ldm_64_hash.c similarity index 97% rename from contrib/long_distance_matching/ldm_hf_test.c rename to contrib/long_distance_matching/ldm_64_hash.c index 63be82d15..a72c283f2 100644 --- a/contrib/long_distance_matching/ldm_hf_test.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -241,9 +241,9 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, * * We count only bytes where pMatch > pBaes and pIn > pAnchor. */ -U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, +U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { - U32 matchLength = 0; + U64 matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -267,8 +267,8 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, const hash_t hash, const U32 checksum, - U32 *pForwardMatchLength, - U32 *pBackwardMatchLength) { + U64 *pForwardMatchLength, + U64 *pBackwardMatchLength) { LDM_hashTable *table = cctx->hashTable; LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; @@ -541,7 +541,7 @@ static U64 updateChecksum(U64 sum, U32 len, BYTE toRemove, BYTE toAdd) { // TODO: deduplicate. static const U64 prime8bytes = 11400714785074694791ULL; -// static const U64 prime8bytes = 5; + sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -696,11 +696,12 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, void LDM_outputConfiguration(void) { printf("=====================\n"); printf("Configuration\n"); - printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); - printf("Min match, hash length: %d, %d\n", + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); printf("LDM_LAG %d\n", LDM_LAG); printf("=====================\n"); } @@ -762,7 +763,7 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * forwardMatchLength contains the forward length of the match. */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *forwardMatchLength, U32 *backwardMatchLength) { + U64 *forwardMatchLength, U64 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -800,10 +801,10 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; + U64 len = (U64)literalLength - RUN_MASK; *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; @@ -819,9 +820,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. */ @@ -833,7 +834,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, /* Encode the match length. */ if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; + U64 matchLengthRemaining = matchLength; *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; MEM_write32(cctx->op, 0xFFFFFFFF); @@ -858,8 +859,8 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; - U32 forwardMatchLength = 0; - U32 backwardsMatchLength = 0; + U64 forwardMatchLength = 0; + U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); @@ -892,9 +893,9 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - const U32 literalLength = cctx.ip - cctx.anchor; + const U64 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + + const U64 matchLength = forwardMatchLength + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; @@ -934,7 +935,7 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Encode the last literals (no more matches). */ { - const U32 lastRun = cctx.iend - cctx.anchor; + const U64 lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } @@ -979,6 +980,7 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, size_t LDM_decompress(const void *src, size_t compressedSize, void *dst, size_t maxDecompressedSize) { + LDM_DCtx dctx; LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 9d5ba0e27..d59f401ec 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -25,8 +25,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U32 *matchLength, - U32 *backwardsMatchLength); + U64 *matchLength, + U64 *backwardsMatchLength); hash_t HASH_hashU32(U32 value); diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c index c727616af..babfdf3f2 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -719,10 +719,10 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; + U64 len = (U64)literalLength - RUN_MASK; *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; @@ -738,9 +738,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. */ @@ -811,9 +811,9 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - const U32 literalLength = cctx.ip - cctx.anchor; + const U64 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + + const U64 matchLength = forwardMatchLength + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 3582d5a21..b6788c67e 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -94,8 +94,8 @@ static int compress(const char *fname, const char *oname) { // Truncate file to compressedSize. ftruncate(fdout, compressedSize); - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, + printf("%25s : %10lu -> %10lu - %s (%.1f%%)\n", fname, + (size_t)statbuf.st_size, (size_t)compressedSize, oname, (double)compressedSize / (statbuf.st_size) * 100); timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + @@ -164,7 +164,7 @@ static int decompress(const char *fname, const char *oname) { src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressedSize); printf("Ret size out: %zu\n", outSize); - ftruncate(fdout, outSize); +// ftruncate(fdout, decompressedSize); close(fdin); close(fdout); @@ -231,7 +231,6 @@ int main(int argc, const char *argv[]) { printf("ldm = [%s]\n", ldmFilename); printf("dec = [%s]\n", decFilename); - /* Compress */ { if (compress(inpFilename, ldmFilename)) {