From: Stella Lau Date: Fri, 21 Jul 2017 17:44:39 +0000 (-0700) Subject: Fix overflow bug when calculating hash X-Git-Tag: v1.3.1^2~12^2~7^2~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1a188fe864c67673621b0b20a9911e3d23ef935a;p=thirdparty%2Fzstd.git Fix overflow bug when calculating hash --- diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 9dc33fae7..168442970 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated main-64 +all: main-circular-buffer main-integrated main-64 #main-basic : basic_table.c ldm.c main-ldm.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -41,6 +41,6 @@ main-integrated: ldm_with_table.c main-ldm.c clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-integrated main-64 + main-circular-buffer main-integrated main-64 @echo Cleaning completed diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b018c4755..bfaff1f5a 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -520,8 +520,8 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; - U32 forwardMatchLength = 0; - U32 backwardsMatchLength = 0; + U64 forwardMatchLength = 0; + U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 83cd36230..e2f786979 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -14,7 +14,7 @@ // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? #define LDM_MEMORY_USAGE 22 -#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 @@ -26,7 +26,8 @@ #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_LENGTH 64 - +#define TMP_EVICTION +#define TMP_TAG_INSERT typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -99,12 +100,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats); */ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); -/** - * Counts the number of bytes that match from pIn and pMatch, - * up to pInLimit. - */ -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit); /** * Encode the literal length followed by the literals. diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index a72c283f2..7a8135342 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -89,21 +89,16 @@ struct LDM_CCtx { const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ - U32 lastSum; + U64 lastSum; const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextPosHashed; U64 nextSum; -// hash_t nextHash; /* Hash corresponding to nextPosHashed */ -// U32 nextSum; - unsigned step; // ip step, should be 1. const BYTE *lagIp; U64 lagSum; -// hash_t lagHash; -// U32 lagSum; U64 numHashInserts; // DEBUG @@ -273,15 +268,15 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; - U32 bestMatchLength = 0; + U64 bestMatchLength = 0; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + cctx->ibase; // Check checksum for faster check. if (cur->checksum == checksum && cctx->ip - pMatch <= LDM_WINDOW_SIZE) { - U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); - U32 backwardMatchLength, totalMatchLength; + U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); + U64 backwardMatchLength, totalMatchLength; // For speed. if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { @@ -313,6 +308,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, return NULL; } +#ifdef TMP_EVICTION + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; @@ -320,6 +317,17 @@ void HASH_insert(LDM_hashTable *table, table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } +#else + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} +#endif // TMP_EVICTION + + U32 HASH_getSize(const LDM_hashTable *table) { return table->numBuckets; } @@ -349,7 +357,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { // TODO: This can be done more efficiently (but it is not that important as it // is only used for computing stats). -static int intLog2(U32 x) { +static int intLog2(U64 x) { int ret = 0; while (x >>= 1) { ret++; @@ -357,32 +365,6 @@ static int intLog2(U32 x) { return ret; } -// Maybe we would eventually prefer to have linear rather than -// exponential buckets. -/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -436,22 +418,6 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { return 1; } -#if 0 -hash_t HASH_hashU32(U32 value) { - return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); -} -#endif - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -#if 0 -static hash_t checksumToHash(U32 sum) { - return HASH_hashU32(sum); -} -#endif - // Upper LDM_HASH_LOG bits. static hash_t checksumToHash(U64 sum) { return sum >> (64 - LDM_HASHLOG); @@ -462,69 +428,19 @@ static U32 checksumFromHfHash(U64 hfHash) { return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -#if 0 -/** - * Computes a checksum based on rsync's checksum. - * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static U32 getChecksum(const BYTE *buf, U32 len) { - U32 i; - U32 s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} -#endif - static U64 getChecksum(const BYTE *buf, U32 len) { static const U64 prime8bytes = 11400714785074694791ULL; -// static const U64 prime8bytes = 5; U64 ret = 0; U32 i; for (i = 0; i < len; i++) { ret *= prime8bytes; ret += buf[i] + CHECKSUM_CHAR_OFFSET; -// printf("HERE %llu\n", ret); } return ret; } -#if 0 -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. - */ -static U32 updateChecksum(U32 sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} -#endif - static U64 ipow(U64 base, U64 exp) { U64 ret = 1; while (exp) { @@ -542,6 +458,8 @@ static U64 updateChecksum(U64 sum, U32 len, // TODO: deduplicate. static const U64 prime8bytes = 11400714785074694791ULL; + // TODO: relying on compiler optimization here. + // The exponential can be calculated explicitly. sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -558,7 +476,7 @@ static U64 updateChecksum(U64 sum, U32 len, */ static void setNextHash(LDM_CCtx *cctx) { #ifdef RUN_CHECKS - U32 check; + U64 check; if ((cctx->nextIp - cctx->ibase != 1) && (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, @@ -568,16 +486,11 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#if 0 - cctx->nextHash = checksumToHash(cctx->nextSum); -#endif - #if LDM_LAG // printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); @@ -586,9 +499,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lagSum, LDM_HASH_LENGTH, cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); cctx->lagIp++; -#if 0 - cctx->lagHash = checksumToHash(cctx->lagSum); -#endif } #endif @@ -596,7 +506,7 @@ static void setNextHash(LDM_CCtx *cctx) { check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); + printf("CHECK: setNextHash failed %llu %llu\n", check, cctx->nextSum); } if ((cctx->nextIp - cctx->lastPosHashed) != 1) { @@ -610,8 +520,6 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. - U32 hash = checksumToHash(hfHash); - U32 sum = checksumFromHfHash(hfHash); // printf("TMP %u %u %llu\n", hash, sum, hfHash); if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { @@ -619,22 +527,26 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { #if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; - HASH_insert(cctx->hashTable, cctx->lagHash, entry); + U32 hash = checksumToHash(cctx->lagSum); + U32 sum = checksumFromHfHash(cctx->lagSum); + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); } else { + U32 hash = checksumToHash(hfHash); + U32 sum = checksumFromHfHash(hfHash); + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); } #else + U32 hash = checksumToHash(hfHash); + U32 sum = checksumFromHfHash(hfHash); const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); #endif } cctx->lastPosHashed = cctx->ip; -#if 0 - cctx->lastHash = hash; -#endif cctx->lastSum = hfHash; } @@ -650,9 +562,6 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); } -#endif -#if 0 - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); #endif putHashOfCurrentPositionFromHash(cctx, cctx->nextSum); } @@ -661,10 +570,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { * Insert hash of the current position into the hash table. */ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); -#if 0 - hash_t hash = checksumToHash(sum); -#endif + U64 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); #ifdef RUN_CHECKS if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { @@ -672,13 +578,11 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { cctx->ip - cctx->ibase); } #endif -#if 0 - putHashOfCurrentPositionFromHash(cctx, hash, sum); -#endif + putHashOfCurrentPositionFromHash(cctx, sum); } -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, +U64 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { @@ -688,9 +592,9 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, pMatch++; continue; } - return (U32)(pIn - pStart); + return (U64)(pIn - pStart); } - return (U32)(pIn - pStart); + return (U64)(pIn - pStart); } void LDM_outputConfiguration(void) { @@ -773,10 +677,6 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U64 hash; U32 sum; setNextHash(cctx); -#if 0 - h = cctx->nextHash; - sum = cctx->nextSum; -#endif hash = cctx->nextSum; h = checksumToHash(hash); sum = checksumFromHfHash(hash); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index b6788c67e..9769f10e7 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -94,9 +94,10 @@ static int compress(const char *fname, const char *oname) { // Truncate file to compressedSize. ftruncate(fdout, compressedSize); - printf("%25s : %10lu -> %10lu - %s (%.1f%%)\n", fname, + printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, (size_t)statbuf.st_size, (size_t)compressedSize, oname, - (double)compressedSize / (statbuf.st_size) * 100); + (statbuf.st_size) / (double)compressedSize, + (double)compressedSize / (double)(statbuf.st_size) * 100.0); timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec),