// TODO: Scanning speed
// TODO: Memory usage
struct LDM_compressStats {
+ U32 windowSizeLog, hashTableSizeLog;
U32 numMatches;
U64 totalMatchLength;
U64 totalLiteralLength;
LDM_compressStats stats; /* Compression statistics */
- LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
+ LDM_hashEntry *hashTable;
+
+// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */
const BYTE *DEBUG_setNextHash;
};
-void LDM_outputHashtableOccupancy(
+void LDM_outputHashTableOccupancy(
const LDM_hashEntry *hashTable, U32 hashTableSize) {
U32 i = 0;
U32 ctr = 0;
100.0 * (double)(ctr) / (double)hashTableSize);
}
-// TODO: This can be done more efficienctly but is not that important as it
-// is only used for computing stats.
-//
+// TODO: This can be done more efficiently (but it is not that important as it
+// is only used for computing stats).
static int intLog2(U32 x) {
int ret = 0;
while (x >>= 1) {
return ret;
}
+/**
+ * Prints a histogram of the offsets currently held in the hash table.
+ *
+ * For each of the LDM_HASHTABLESIZE_U32 entries, the offset is the distance
+ * between the current input position (cctx->ip - cctx->ibase) and the offset
+ * stored in the entry. Offsets are grouped into 32 exponential buckets; the
+ * bucket index is intLog2(offset). Every entry is counted, including entries
+ * whose stored offset is still zero.
+ *
+ * Output goes to stdout: one line per bucket giving the bucket exponent, the
+ * number of entries in it and the share of the table it represents.
+ *
+ * TODO: Maybe we would eventually prefer to have linear rather than
+ * exponential buckets.
+ */
+void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
+  /* Unsigned counters so the argument type matches the %u conversion below. */
+  unsigned buckets[32] = { 0 };
+  int i = 0;
+
+  printf("\n");
+  printf("Hash table histogram\n");
+  for (; i < LDM_HASHTABLESIZE_U32; i++) {
+    /* intLog2() takes a U32, so convert explicitly here instead of narrowing
+     * through an intermediate int.
+     * NOTE(review): assumes intLog2() never returns more than 31 for a 32-bit
+     * argument, which keeps the index inside buckets[] -- confirm. */
+    const unsigned offset =
+        (unsigned)((cctx->ip - cctx->ibase) - cctx->hashTable[i].offset);
+    buckets[intLog2(offset)]++;
+  }
+
+  for (i = 0; i < 32; i++) {
+    printf("2^%*d: %10u %6.3f%%\n", 2, i,
+           buckets[i],
+           100.0 * (double) buckets[i] /
+           (double) LDM_HASHTABLESIZE_U32);
+  }
+  printf("\n");
+}
+
void LDM_printCompressStats(const LDM_compressStats *stats) {
int i = 0;
printf("=====================\n");
printf("Compression statistics\n");
//TODO: compute percentage matched?
+ printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
+ stats->windowSizeLog, stats->hashTableSizeLog);
printf("num matches, total match length: %u, %llu\n",
stats->numMatches,
stats->totalMatchLength);
printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) /
(double)stats->numMatches);
- printf("avg literal length: %.1f\n",
- ((double)stats->totalLiteralLength) / (double)stats->numMatches);
+ printf("avg literal length, total literalLength: %.1f, %llu\n",
+ ((double)stats->totalLiteralLength) / (double)stats->numMatches,
+ stats->totalLiteralLength);
printf("avg offset length: %.1f\n",
((double)stats->totalOffset) / (double)stats->numMatches);
- printf("min offset, max offset: %u %u\n",
+ printf("min offset, max offset: %u, %u\n",
stats->minOffset, stats->maxOffset);
printf("\n");
- printf("offset histogram\n");
+ printf("offset histogram: offset, num matches, %% of matches\n");
+
for (; i <= intLog2(stats->maxOffset); i++) {
printf("2^%*d: %10u %6.3f%%\n", 2, i,
stats->offsetHistogram[i],
100.0 * (double) stats->offsetHistogram[i] /
- (double)stats->numMatches);
+ (double) stats->numMatches);
}
printf("\n");
cctx->anchor = cctx->ibase;
memset(&(cctx->stats), 0, sizeof(cctx->stats));
- memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
+ cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
+// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
cctx->stats.minOffset = UINT_MAX;
+ cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
+ cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE;
+
cctx->lastPosHashed = NULL;
*match = getPositionOnHash(cctx, h);
putHashOfCurrentPositionFromHash(cctx, h, sum);
- } while (cctx->ip - *match > WINDOW_SIZE ||
+ } while (cctx->ip - *match > LDM_WINDOW_SIZE ||
!LDM_isValidMatch(cctx->ip, *match));
setNextHash(cctx);
return 0;
LDM_updateLastHashFromNextHash(&cctx);
}
+ // LDM_outputHashTableOffsetHistogram(&cctx);
+
/* Encode the last literals (no more matches). */
{
const size_t lastRun = cctx.iend - cctx.anchor;
#ifdef COMPUTE_STATS
LDM_printCompressStats(&cctx.stats);
- LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
+ LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
#endif
return cctx.op - cctx.obase;
#define LDM_OFFSET_SIZE 4
// Defines the size of the hash table in bytes, as a power of two
// (the table occupies 2^LDM_MEMORY_USAGE bytes).
-#define LDM_MEMORY_USAGE 20
+#define LDM_MEMORY_USAGE 16
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
-#define WINDOW_SIZE (1 << 25)
+#define LDM_WINDOW_SIZE_LOG 25
+#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
// LDM_MIN_MATCH_LENGTH and LDM_HASH_LENGTH should be multiples of four.
-#define LDM_MIN_MATCH_LENGTH 1024
-#define LDM_HASH_LENGTH 1024
+#define LDM_MIN_MATCH_LENGTH 4
+#define LDM_HASH_LENGTH 4
typedef U32 offset_t;
typedef U32 hash_t;
* Prints the percentage of the hash table occupied (where occupied is defined
* as the entry being non-zero).
*/
-void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable,
+void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable,
U32 hashTableSize);
+/**
+ * Prints the distribution of offsets in the hash table.
+ *
+ * The offsets are defined as the distance of the hash table entry from the
+ * current input position of the cctx.
+ */
+void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);
+
/**
* Outputs compression statistics to stdout.
*/