From: Stella Lau Date: Tue, 18 Jul 2017 01:13:09 +0000 (-0700) Subject: Experiment with using a lag when hashing X-Git-Tag: v1.3.1^2~12^2~7^2~22 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fc41a8796493063c26103ca7ed32561dd2e649d0;p=thirdparty%2Fzstd.git Experiment with using a lag when hashing --- diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 47085022d..df4390157 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-basic main-circular-buffer +all: main-basic main-circular-buffer main-lag main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,9 +33,11 @@ main-basic : basic_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-lag: lag_table.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer + main-basic main-circular-buffer main-lag @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 859bf0618..893a4caf9 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -27,7 +27,6 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + hash; } - LDM_hashEntry *HASH_getEntryFromHash( const LDM_hashTable *table, const hash_t hash, const U32 checksum) { (void)checksum; @@ -43,13 +42,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, (void)checksum; if ((*isValid)(pIn, entry->offset + table->offsetBase)) { return entry; - } else { - return NULL; } + return NULL; } - - void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index f45f945ce..b578d2bf1 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -9,7 +9,7 @@ // refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. -#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) struct LDM_hashTable { @@ -19,6 +19,7 @@ struct LDM_hashTable { // Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; BYTE *bucketOffsets; // Pointer to current insert position. + // Last insert was at bucketOffsets - 1? }; @@ -35,15 +36,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } -/* -static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, - const hash_t hash) { - LDM_hashEntry *bucket = getBucket(table, hash); - BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1); - return bucket + offset; -} -*/ - LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, @@ -53,7 +45,12 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *cur = bucket; // TODO: in order of recency? for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - // CHeck checksum for faster check. + /* + if (cur->checksum == 0 && cur->offset == 0) { + return NULL; + } + */ + // Check checksum for faster check. if (cur->checksum == checksum && (*isValid)(pIn, cur->offset + table->offsetBase)) { return cur; @@ -62,7 +59,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } - LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index bf54842f6..dedbf79a9 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -5,7 +5,7 @@ #include // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 31 +#define HASH_ONLY_EVERY 15 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) @@ -18,6 +18,10 @@ #define COMPUTE_STATS #define CHECKSUM_CHAR_OFFSET 10 + +#define LAG 0 + +//#define HASH_CHECK //#define RUN_CHECKS //#define LDM_DEBUG @@ -79,6 +83,10 @@ struct LDM_CCtx { unsigned step; // ip step, should be 1. + const BYTE *lagIp; + hash_t lagHash; + U32 lagSum; + // DEBUG const BYTE *DEBUG_setNextHash; }; @@ -253,6 +261,17 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); +#if LAG + if (cctx->ip - cctx->ibase > LAG) { +// printf("LAG %zu\n", cctx->ip - cctx->lagIp); + cctx->lagSum = updateChecksum( + cctx->lagSum, LDM_HASH_LENGTH, + cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); + cctx->lagIp++; + cctx->lagHash = checksumToHash(cctx->lagSum); + } +#endif + #ifdef RUN_CHECKS check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); @@ -270,18 +289,6 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { - /* -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) { - U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; - cctx->stats.numHashInserts++; - if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - cctx->stats.numCollisions++; - } - } -#endif -*/ - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { @@ -289,8 +296,19 @@ static void putHashOfCurrentPositionFromHash( const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; */ +#if LAG + // TODO: off by 1, but whatever + if (cctx->lagIp - cctx->ibase > 0) { + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; + HASH_insert(cctx->hashTable, cctx->lagHash, entry); + } else { + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); + } +#else const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); +#endif } cctx->lastPosHashed = cctx->ip; @@ -331,15 +349,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } -/** - * Returns the position of the entry at hashTable[hash]. - */ -/* -static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) { - return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase; -} -*/ - U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; @@ -431,12 +440,20 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { if (cctx->ip > cctx->imatchLimit) { return 1; } - +#ifdef HASH_CHECK + entry = HASH_getEntryFromHash(cctx->hashTable, h, sum); +#else entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, &LDM_isValidMatch); +#endif if (entry != NULL) { *match = entry->offset + cctx->ibase; +#ifdef HASH_CHECK + if (!LDM_isValidMatch(cctx->ip, *match)) { + entry = NULL; + } +#endif } putHashOfCurrentPositionFromHash(cctx, h, sum); } @@ -508,6 +525,12 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); +#if LAG + cctx.lagIp = cctx.ip; + cctx.lagHash = cctx.lastHash; + cctx.lagSum = cctx.lastSum; +#endif + /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -575,7 +598,7 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Encode the last literals (no more matches). */ { - const size_t lastRun = cctx.iend - cctx.anchor; + const U32 lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6d97bd560..6d7c4af27 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -10,8 +10,8 @@ #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_OFFSET_SIZE 4 -// Defines the size of the hash table. -#define LDM_MEMORY_USAGE 20 +// Defines the size of the hash table (currently the number of elements). +#define LDM_MEMORY_USAGE 12 #define LDM_WINDOW_SIZE_LOG 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 88d19ae20..83a9ed27a 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -3,34 +3,54 @@ #include "mem.h" +// TODO: clean up comments + typedef U32 hash_t; typedef struct LDM_hashEntry { - U32 offset; + U32 offset; // TODO: Replace with pointer? U32 checksum; } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; -// TODO: rename functions -// TODO: comments - +/** + * Create a hash table with size hash buckets. + * LDM_hashEntry.offset is added to offsetBase to calculate pMatch in + * HASH_getValidEntry. + */ LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); -//TODO: unneeded? +/** + * Returns an LDM_hashEntry from the table that matches the checksum. + * Returns NULL if one does not exist. + */ LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum); +/** + * Gets a valid entry that matches the checksum. A valid entry is defined by + * *isValid. + * + * The function finds an entry matching the checksum, computes pMatch as + * offset + table.offsetBase, and calls isValid. + */ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); +/** + * Insert an LDM_hashEntry into the bucket corresponding to hash. + */ void HASH_insert(LDM_hashTable *table, const hash_t hash, - const LDM_hashEntry entry); + const LDM_hashEntry entry); +/** + * Return the number of distinct hash buckets. + */ U32 HASH_getSize(const LDM_hashTable *table); void HASH_destroyTable(LDM_hashTable *table); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index ea6375ba7..a379d3a6d 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -163,7 +163,6 @@ static int decompress(const char *fname, const char *oname) { outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressedSize); - printf("Ret size out: %zu\n", outSize); ftruncate(fdout, outSize);