offset_t offset;
};
-typedef struct LDM_hashTable {
- U32 numEntries;
- U32 minimumTagMask; // TODO: what if tag == offset?
+// TODO: move to its own file.
+struct LDM_hashTable {  // Hash table: a slot count plus a pointer to the slot array.
+ U32 size;  // Number of slots; LDM_initializeHashTable sets this to LDM_HASHTABLESIZE_U32.
+ LDM_hashEntry *entries;  // Slot array; allocated with calloc in LDM_initializeHashTable.
+};
- // Maximum number of elements in the table.
- U32 limit;
+LDM_hashEntry *HASH_getHash(  // Returns a pointer to the entry stored at index `hash`.
+ const LDM_hashTable *table, const hash_t hash) {
+ return &(table->entries[hash]);  // Direct index; `hash` is not checked against table->size.
+}
+
+void HASH_insert(LDM_hashTable *table,  // Stores `entry` in the slot at index `hash`.
+ const hash_t hash, const LDM_hashEntry entry) {
+ *HASH_getHash(table, hash) = entry;  // Replaces whatever the slot held before.
+}
- LDM_hashEntry *entries;
-} LDM_hashTable;
-// TODO: Add offset histogram by powers of two
// TODO: Scanning speed
// TODO: Memory usage
struct LDM_compressStats {
LDM_compressStats stats; /* Compression statistics */
- LDM_hashEntry *hashTable;
+ LDM_hashTable hashTable;
+
+// LDM_hashEntry *hashTable;
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *DEBUG_setNextHash;
};
-void LDM_outputHashTableOccupancy(
- const LDM_hashEntry *hashTable, U32 hashTableSize) {
+
+
+void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) {
U32 i = 0;
U32 ctr = 0;
- for (; i < hashTableSize; i++) {
- if (hashTable[i].offset == 0) {
+ for (; i < hashTable->size; i++) {
+ if (HASH_getHash(hashTable, i)->offset == 0) {
ctr++;
}
}
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
- hashTableSize, ctr,
- 100.0 * (double)(ctr) / (double)hashTableSize);
+ hashTable->size, ctr,
+ 100.0 * (double)(ctr) / (double)hashTable->size);
}
// TODO: This can be done more efficiently (but it is not that important as it
// TODO: Maybe we would eventually prefer to have linear rather than
// exponential buckets.
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
- int i = 0;
+ U32 i = 0;
int buckets[32] = { 0 };
printf("\n");
printf("Hash table histogram\n");
- for (; i < LDM_HASHTABLESIZE_U32; i++) {
- int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset;
+ for (; i < cctx->hashTable.size; i++) {
+ int offset = (cctx->ip - cctx->ibase) -
+ HASH_getHash(&cctx->hashTable, i)->offset;
buckets[intLog2(offset)]++;
}
printf("2^%*d: %10u %6.3f%%\n", 2, i,
buckets[i],
100.0 * (double) buckets[i] /
- (double) LDM_HASHTABLESIZE_U32);
+ (double) cctx->hashTable.size);
}
printf("\n");
}
LDM_CCtx *cctx, hash_t hash, U32 sum) {
#ifdef COMPUTE_STATS
if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) {
- offset_t offset = cctx->hashTable[hash].offset;
+ offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset;
cctx->stats.numHashInserts++;
if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) {
cctx->stats.numCollisions++;
// Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
- cctx->hashTable[hash] = entry;
+ HASH_insert(&cctx->hashTable, hash, entry);
}
cctx->lastPosHashed = cctx->ip;
* Returns the position of the entry at hashTable[hash].
*/
static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) {
- return cctx->hashTable[hash].offset + cctx->ibase;
+ return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase;
}
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
// ip += sizeof(U64);
}
+static void LDM_initializeHashTable(LDM_hashTable *table) {  // Sets the slot count and allocates a zero-filled slot array.
+ table->size = LDM_HASHTABLESIZE_U32;
+ table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));  // NOTE(review): the calloc result is not checked for NULL here.
+}
+
void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
cctx->anchor = cctx->ibase;
memset(&(cctx->stats), 0, sizeof(cctx->stats));
- cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
+
+ LDM_initializeHashTable(&cctx->hashTable);
+// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
cctx->stats.minOffset = UINT_MAX;
cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
cctx->DEBUG_setNextHash = 0;
}
+void LDM_destroyCCtx(LDM_CCtx *cctx) {  // Releases the hash table slot array held by cctx.
+ free((cctx->hashTable).entries);  // The entries pointer is not reset after the free.
+}
+
/**
* Finds the "best" match.
*
#ifdef COMPUTE_STATS
LDM_printCompressStats(&cctx.stats);
- LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
+ LDM_outputHashTableOccupancy(&cctx.hashTable);
#endif
- return cctx.op - cctx.obase;
+ {
+ const size_t ret = cctx.op - cctx.obase;
+ LDM_destroyCCtx(&cctx);
+ return ret;
+ }
}
struct LDM_DCtx {
#define RUN_MASK ((1U<<RUN_BITS)-1)
#define COMPUTE_STATS
-#define CHECKSUM_CHAR_OFFSET 0
+#define CHECKSUM_CHAR_OFFSET 10
//#define RUN_CHECKS
//#define LDM_DEBUG
offset_t offset;
};
+typedef struct LDM_hashTable {
+ U32 numEntries;  // NOTE(review): no use of this field is visible here; presumably the entry count -- confirm.
+ U32 minimumTagMask; // TODO: what if tag == offset?
+
+ // Maximum number of elements in the table.
+ U32 limit;
+
+ LDM_hashEntry *entries;  // Pointer to the entry storage.
+} LDM_hashTable;
+
// TODO: Add offset histogram by powers of two
// TODO: Scanning speed
// TODO: Memory usage
struct LDM_compressStats {
+ U32 windowSizeLog, hashTableSizeLog;
U32 numMatches;
U64 totalMatchLength;
U64 totalLiteralLength;
U32 numCollisions;
U32 numHashInserts;
+
+ U32 offsetHistogram[32];
};
struct LDM_CCtx {
LDM_compressStats stats; /* Compression statistics */
- LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
+ LDM_hashEntry *hashTable;
+
+// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */
const BYTE *DEBUG_setNextHash;
};
-void LDM_outputHashtableOccupancy(
+void LDM_outputHashTableOccupancy(
const LDM_hashEntry *hashTable, U32 hashTableSize) {
U32 i = 0;
U32 ctr = 0;
100.0 * (double)(ctr) / (double)hashTableSize);
}
+// TODO: This can be done more efficiently (but it is not that important as it
+// is only used for computing stats).
+static int intLog2(U32 x) {  // Returns floor(log2(x)); returns 0 when x is 0 or 1.
+ int ret = 0;
+ while (x >>= 1) {  // Count how many times x can be halved before it reaches 0.
+ ret++;
+ }
+ return ret;
+}
+
+// TODO: Maybe we would eventually prefer to have linear rather than
+// exponential buckets.
+void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {  // Prints a power-of-two histogram of hash table entry offsets.
+ int i = 0;
+ int buckets[32] = { 0 };  // buckets[k] counts entries for which intLog2(offset) == k.
+
+ printf("\n");
+ printf("Hash table histogram\n");
+ for (; i < LDM_HASHTABLESIZE_U32; i++) {
+ int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset;  // Distance from the current input position back to the stored position.
+ buckets[intLog2(offset)]++;
+ }
+
+ i = 0;
+ for (; i < 32; i++) {  // One output row per bucket, with its percentage of all slots.
+ printf("2^%*d: %10u %6.3f%%\n", 2, i,
+ buckets[i],  // NOTE(review): buckets[i] is an int but is printed with %u.
+ 100.0 * (double) buckets[i] /
+ (double) LDM_HASHTABLESIZE_U32);
+ }
+ printf("\n");
+}
+
void LDM_printCompressStats(const LDM_compressStats *stats) {
+ int i = 0;
printf("=====================\n");
printf("Compression statistics\n");
//TODO: compute percentage matched?
+ printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
+ stats->windowSizeLog, stats->hashTableSizeLog);
printf("num matches, total match length: %u, %llu\n",
stats->numMatches,
stats->totalMatchLength);
printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) /
(double)stats->numMatches);
- printf("avg literal length: %.1f\n",
- ((double)stats->totalLiteralLength) / (double)stats->numMatches);
+ printf("avg literal length, total literalLength: %.1f, %llu\n",
+ ((double)stats->totalLiteralLength) / (double)stats->numMatches,
+ stats->totalLiteralLength);
printf("avg offset length: %.1f\n",
((double)stats->totalOffset) / (double)stats->numMatches);
- printf("min offset, max offset: %u %u\n",
+ printf("min offset, max offset: %u, %u\n",
stats->minOffset, stats->maxOffset);
+
+ printf("\n");
+ printf("offset histogram: offset, num matches, %% of matches\n");
+
+ for (; i <= intLog2(stats->maxOffset); i++) {
+ printf("2^%*d: %10u %6.3f%%\n", 2, i,
+ stats->offsetHistogram[i],
+ 100.0 * (double) stats->offsetHistogram[i] /
+ (double) stats->numMatches);
+ }
+ printf("\n");
+
+
printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n",
stats->numCollisions, stats->numHashInserts,
stats->numHashInserts == 0 ?
1.0 : (100.0 * (double)stats->numCollisions) /
(double)stats->numHashInserts);
+ printf("=====================\n");
+
}
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
* of the hash table.
*/
static hash_t checksumToHash(U32 sum) {
- return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG));
+ return ((sum * 2654435761U) >> (32 - LDM_HASHLOG));
}
/**
cctx->anchor = cctx->ibase;
memset(&(cctx->stats), 0, sizeof(cctx->stats));
- memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
+ cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
+// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
cctx->stats.minOffset = UINT_MAX;
+ cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
+ cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE;
+
cctx->lastPosHashed = NULL;
cctx->DEBUG_setNextHash = 0;
}
+void LDM_destroyCCtx(LDM_CCtx *cctx) {  // Releases the array that cctx->hashTable points to.
+ free(cctx->hashTable);  // The pointer is not reset after the free.
+}
+
/**
* Finds the "best" match.
*
*match = getPositionOnHash(cctx, h);
putHashOfCurrentPositionFromHash(cctx, h, sum);
- } while (cctx->ip - *match > WINDOW_SIZE ||
+ } while (cctx->ip - *match > LDM_WINDOW_SIZE ||
!LDM_isValidMatch(cctx->ip, *match));
setNextHash(cctx);
return 0;
size_t LDM_compress(const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
LDM_CCtx cctx;
+ const BYTE *match;
LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize);
/* Hash the first position and put it into the hash table. */
LDM_putHashOfCurrentPosition(&cctx);
- // TODO: loop condition is not accurate.
- while (1) {
- const BYTE *match;
-
- /**
- * Find a match.
- * If no more matches can be found (i.e. the length of the remaining input
- * is less than the minimum match length), then stop searching for matches
- * and encode the final literals.
- */
- if (LDM_findBestMatch(&cctx, &match) != 0) {
- goto _last_literals;
- }
+ /**
+ * Find a match.
+ * If no more matches can be found (i.e. the length of the remaining input
+ * is less than the minimum match length), then stop searching for matches
+ * and encode the final literals.
+ */
+ while (LDM_findBestMatch(&cctx, &match) == 0) {
#ifdef COMPUTE_STATS
cctx.stats.numMatches++;
#endif
cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH,
cctx.ihashLimit);
+ LDM_outputBlock(&cctx, literalLength, offset, matchLength);
+
#ifdef COMPUTE_STATS
cctx.stats.totalLiteralLength += literalLength;
cctx.stats.totalOffset += offset;
offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset;
cctx.stats.maxOffset =
offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
+ cctx.stats.offsetHistogram[(U32)intLog2(offset)]++;
#endif
- LDM_outputBlock(&cctx, literalLength, offset, matchLength);
// Move ip to end of block, inserting hashes at each position.
cctx.nextIp = cctx.ip + cctx.step;
cctx.anchor = cctx.ip;
LDM_updateLastHashFromNextHash(&cctx);
}
-_last_literals:
+
+ // LDM_outputHashTableOffsetHistogram(&cctx);
+
/* Encode the last literals (no more matches). */
{
- const size_t lastRun = (size_t)(cctx.iend - cctx.anchor);
+ const size_t lastRun = cctx.iend - cctx.anchor;
BYTE *pToken = cctx.op++;
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
}
#ifdef COMPUTE_STATS
LDM_printCompressStats(&cctx.stats);
- LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
+ LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
#endif
- return (cctx.op - (const BYTE *)cctx.obase);
+ {
+ const size_t ret = cctx.op - cctx.obase;
+ LDM_destroyCCtx(&cctx);
+ return ret;
+ }
}
struct LDM_DCtx {
// TODO: implement and test hash function
void LDM_test(void) {
-
}
/*
#define LDM_OFFSET_SIZE 4
// Defines the size of the hash table.
-#define LDM_MEMORY_USAGE 22
+#define LDM_MEMORY_USAGE 16
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
-#define WINDOW_SIZE (1 << 25)
+#define LDM_WINDOW_SIZE_LOG 25
+#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four.
-#define LDM_MIN_MATCH_LENGTH 8
-#define LDM_HASH_LENGTH 8
+#define LDM_MIN_MATCH_LENGTH 4
+#define LDM_HASH_LENGTH 4
typedef U32 offset_t;
typedef U32 hash_t;
/**
* Initialize the compression context.
+ *
+ * Allocates memory for the hash table.
*/
void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize,
void *dst, size_t maxDstSize);
+/**
+ * Frees the memory allocated in LDM_initializeCCtx.
+ */
+void LDM_destroyCCtx(LDM_CCtx *cctx);
+
/**
* Prints the percentage of the hash table occupied (where occupied is defined
* as the entry being non-zero).
*/
-void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable,
+void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable,
U32 hashTableSize);
+/**
+ * Prints the distribution of offsets in the hash table.
+ *
+ * The offsets are defined as the distance of the hash table entry from the
+ * current input position of the cctx.
+ */
+void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);
+
/**
* Outputs compression statistics to stdout.
*/