From: Stella Lau Date: Sat, 15 Jul 2017 00:17:00 +0000 (-0700) Subject: Decouple hash table from compression function X-Git-Tag: v1.3.1^2~12^2~7^2~26 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ca300ce6e0004a447327d19a2ed879923e8c8baa;p=thirdparty%2Fzstd.git Decouple hash table from compression function --- diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index cff786442..0d4dea069 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,7 +27,7 @@ default: all all: main-ldm -main-ldm : ldm.h ldm.c main-ldm.c +main-ldm : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c new file mode 100644 index 000000000..007086fee --- /dev/null +++ b/contrib/long_distance_matching/basic_table.c @@ -0,0 +1,56 @@ +#include <stdlib.h> +#include <stdio.h> + +#include "ldm_hashtable.h" + +struct LDM_hashTable { + U32 size; + LDM_hashEntry *entries; +}; + +LDM_hashTable *HASH_createTable(U32 size) { + LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + table->size = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + return table; +} + +void HASH_initializeTable(LDM_hashTable *table, U32 size) { + table->size = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); +} + + +LDM_hashEntry *HASH_getEntryFromHash( + const LDM_hashTable *table, const hash_t hash) { + return &(table->entries[hash]); +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *HASH_getEntryFromHash(table, hash) = entry; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->size; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { + U32 i = 0; + U32 ctr = 0; + for (; i < 
HASH_getSize(hashTable); i++) { + if (HASH_getEntryFromHash(hashTable, i)->offset == 0) { + ctr++; + } + } + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + HASH_getSize(hashTable), ctr, + 100.0 * (double)(ctr) / (double)HASH_getSize(hashTable)); +} + + diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 08cb856c7..32da40f82 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,11 +4,13 @@ #include #include -#include "ldm.h" - // Insert every (HASH_ONLY_EVERY + 1) into the hash table. #define HASH_ONLY_EVERY 0 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) + #define ML_BITS 4 #define ML_MASK ((1U<entries[hash]); -} - -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - *HASH_getHash(table, hash) = entry; -} - +#include "ldm_hashtable.h" // TODO: Scanning speed // TODO: Memory usage @@ -54,6 +39,8 @@ struct LDM_compressStats { U32 numCollisions; U32 numHashInserts; +// U64 numInvalidHashes, numValidHashes; // tmp + U32 offsetHistogram[32]; }; @@ -80,9 +67,7 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashTable hashTable; - -// LDM_hashEntry *hashTable; + LDM_hashTable *hashTable; // LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; @@ -101,21 +86,6 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; - - -void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) { - U32 i = 0; - U32 ctr = 0; - for (; i < hashTable->size; i++) { - if (HASH_getHash(hashTable, i)->offset == 0) { - ctr++; - } - } - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - hashTable->size, ctr, - 100.0 * (double)(ctr) / (double)hashTable->size); -} - // TODO: This can be done more efficiently (but it is not that important as it // is only used for computing stats). 
static int intLog2(U32 x) { @@ -128,15 +98,15 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. -void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { +void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { U32 i = 0; int buckets[32] = { 0 }; printf("\n"); printf("Hash table histogram\n"); - for (; i < cctx->hashTable.size; i++) { + for (; i < HASH_getSize(cctx->hashTable); i++) { int offset = (cctx->ip - cctx->ibase) - - HASH_getHash(&cctx->hashTable, i)->offset; + HASH_getEntryFromHash(cctx->hashTable, i)->offset; buckets[intLog2(offset)]++; } @@ -145,7 +115,7 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { printf("2^%*d: %10u %6.3f%%\n", 2, i, buckets[i], 100.0 * (double) buckets[i] / - (double) cctx->hashTable.size); + (double) HASH_getSize(cctx->hashTable)); } printf("\n"); } @@ -181,7 +151,10 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { } printf("\n"); - + /* + printf("Num invalid hashes, num valid hashes, %llu %llu\n", + stats->numInvalidHashes, stats->numValidHashes); + */ printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? @@ -315,7 +288,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset; + U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -327,7 +300,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; - HASH_insert(&cctx->hashTable, hash, entry); + HASH_insert(cctx->hashTable, hash, entry); } cctx->lastPosHashed = cctx->ip; @@ -371,9 +344,11 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { /** * Returns the position of the entry at hashTable[hash]. */ -static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { - return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase; +/* +static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) { + return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase; } +*/ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { @@ -399,11 +374,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } -static void LDM_initializeHashTable(LDM_hashTable *table) { - table->size = LDM_HASHTABLESIZE_U32; - table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); -} - void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -423,8 +393,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); + + //HASH_initializeTable(cctx->hashTable, LDM_HASHTABLESIZE_U32); - LDM_initializeHashTable(&cctx->hashTable); // calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); // memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; @@ -442,7 +414,7 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, } void LDM_destroyCCtx(LDM_CCtx *cctx) { - free((cctx->hashTable).entries); + HASH_destroyTable(cctx->hashTable); } /** @@ -458,6 +430,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { do { hash_t h; U32 sum; + LDM_hashEntry *entry; setNextHash(cctx); h = cctx->nextHash; sum = 
cctx->nextSum; @@ -468,7 +441,9 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - *match = getPositionOnHash(cctx, h); + entry = HASH_getEntryFromHash(cctx->hashTable, h); + *match = entry->offset + cctx->ibase; + putHashOfCurrentPositionFromHash(cctx, h, sum); } while (cctx->ip - *match > LDM_WINDOW_SIZE || @@ -604,7 +579,7 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // LDM_outputHashTableOffsetHistogram(&cctx); + // HASH_outputTableOffsetHistogram(&cctx); /* Encode the last literals (no more matches). */ { @@ -615,7 +590,7 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashTableOccupancy(&cctx.hashTable); + HASH_outputTableOccupancy(cctx.hashTable); #endif { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 8c3aa4e61..18b64e378 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -12,9 +12,6 @@ // Defines the size of the hash table. #define LDM_MEMORY_USAGE 16 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_WINDOW_SIZE_LOG 25 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) @@ -23,10 +20,6 @@ #define LDM_MIN_MATCH_LENGTH 4 #define LDM_HASH_LENGTH 4 -typedef U32 offset_t; -typedef U32 hash_t; -typedef struct LDM_hashEntry LDM_hashEntry; -typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -75,12 +68,6 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, */ void LDM_destroyCCtx(LDM_CCtx *cctx); -/** - * Prints the percentage of the hash table occupied (where occupied is defined - * as the entry being non-zero). 
- */ -void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable); - /** * Prints the distribution of offsets in the hash table. * diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h new file mode 100644 index 000000000..690c47a15 --- /dev/null +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -0,0 +1,36 @@ +#ifndef LDM_HASHTABLE_H +#define LDM_HASHTABLE_H + +#include "mem.h" + +typedef U32 hash_t; + +typedef struct LDM_hashEntry { + U32 offset; +} LDM_hashEntry; + +typedef struct LDM_hashTable LDM_hashTable; + +// TODO: rename functions +// TODO: comments + +LDM_hashTable *HASH_createTable(U32 size); + +LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, + const hash_t hash); + +void HASH_insert(LDM_hashTable *table, const hash_t hash, + const LDM_hashEntry entry); + +U32 HASH_getSize(const LDM_hashTable *table); + +void HASH_destroyTable(LDM_hashTable *table); + +/** + * Prints the percentage of the hash table occupied (where occupied is defined + * as the entry being non-zero). + */ +void HASH_outputTableOccupancy(const LDM_hashTable *hashTable); + + +#endif /* LDM_HASHTABLE_H */