all: main-circular-buffer main-integrated main-64
-#main-basic : basic_table.c ldm.c main-ldm.c
-# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
-
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-64: ldm_64_hash.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
-main-integrated: ldm_with_table.c main-ldm.c
+main-integrated: ldm_integrated.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \
- main-circular-buffer main-integrated main-64
+ main-circular-buffer main-64 main-integrated
@echo Cleaning completed
#include "ldm_hashtable.h"
#include "mem.h"
-// Number of elements per hash bucket.
-// HASH_BUCKET_SIZE_LOG defined in ldm.h
+// The number of elements per hash bucket.
+// HASH_BUCKET_SIZE_LOG is defined in ldm.h.
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
-#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
-
-
-// TODO: rename. Number of hash buckets.
-// TODO: Link to HASH_ENTRY_SIZE_LOG
+// The log of the number of hash buckets.
+#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
+// If ZSTD_SKIP is defined, then the first entry is returned in HASH_getBestEntry
+// (without looking at other entries in the bucket).
//#define ZSTD_SKIP
struct LDM_hashTable {
- U32 numBuckets;
- U32 numEntries;
+ U32 numBuckets; // The number of buckets.
+ U32 numEntries; // numBuckets * HASH_BUCKET_SIZE.
LDM_hashEntry *entries;
- BYTE *bucketOffsets; // Pointer to current insert position.
+ BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position.
- // Position corresponding to offset=0 in LDM_hashEntry.
- const BYTE *offsetBase;
+ const BYTE *offsetBase; // Corresponds to offset=0 in LDM_hashEntry.
U32 minMatchLength;
U32 maxWindowSize;
};
return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
}
+// From lib/compress/zstd_compress.c
static unsigned ZSTD_NbCommonBytes (register size_t val)
{
if (MEM_isLittleEndian()) {
} }
}
-// From lib/compress/zstd_compress.c
+/**
+ * From lib/compress/zstd_compress.c
+ * Returns the number of bytes (consecutively) in common between pIn and pMatch
+ * up to pInLimit.
+ */
static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
const BYTE *const pInLimit) {
const BYTE * const pStart = pIn;
return (size_t)(pIn - pStart);
}
-U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
- const BYTE *pMatch, const BYTE *pBase) {
- U32 matchLength = 0;
+/**
+ * Returns the number of bytes in common between pIn and pMatch,
+ * counting backwards, with pIn having a lower limit of pAnchor and
+ * pMatch having a lower limit of pBase.
+ */
+static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
+ const BYTE *pMatch, const BYTE *pBase) {
+ size_t matchLength = 0;
while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
pIn--;
pMatch--;
U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
U64 backwardMatchLength, totalMatchLength;
+ // Only take matches where the forwardMatchLength is large enough
+ // for speed.
if (forwardMatchLength < table->minMatchLength) {
continue;
}
void HASH_insert(LDM_hashTable *table,
const hash_t hash, const LDM_hashEntry entry) {
+ // Circular buffer.
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
table->bucketOffsets[hash]++;
table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
typedef U32 checksum_t;
-// TODO: Scanning speed
// TODO: Memory usage
struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog;
U32 minOffset, maxOffset;
- U32 numCollisions;
- U32 numHashInserts;
-
U32 offsetHistogram[32];
};
hash_t nextHash; /* Hash corresponding to nextPosHashed */
checksum_t nextSum;
-
-
unsigned step; // ip step, should be 1.
const BYTE *lagIp;
hash_t lagHash;
checksum_t lagSum;
- U64 numHashInserts;
// DEBUG
const BYTE *DEBUG_setNextHash;
};
return ret;
}
-// TODO: Maybe we would eventually prefer to have linear rather than
-// exponential buckets.
-/**
-void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) {
- U32 i = 0;
- int buckets[32] = { 0 };
-
- printf("\n");
- printf("Hash table histogram\n");
- for (; i < HASH_getSize(cctx->hashTable); i++) {
- int offset = (cctx->ip - cctx->ibase) -
- HASH_getEntryFromHash(cctx->hashTable, i)->offset;
- buckets[intLog2(offset)]++;
- }
-
- i = 0;
- for (; i < 32; i++) {
- printf("2^%*d: %10u %6.3f%%\n", 2, i,
- buckets[i],
- 100.0 * (double) buckets[i] /
- (double) HASH_getSize(cctx->hashTable));
- }
- printf("\n");
-}
-*/
-
void LDM_printCompressStats(const LDM_compressStats *stats) {
int i = 0;
printf("=====================\n");
printf("=====================\n");
}
-int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
+/*
+static int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
U32 lengthLeft = LDM_MIN_MATCH_LENGTH;
const BYTE *curIn = pIn;
const BYTE *curMatch = pMatch;
}
return 1;
}
+*/
/**
* Convert a sum computed from getChecksum to a hash value in the range
cctx->DEBUG_setNextHash = cctx->nextIp;
#endif
-// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH);
cctx->nextSum = updateChecksum(
cctx->lastSum, LDM_HASH_LENGTH,
cctx->lastPosHashed[0],
// Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#if LDM_LAG
- // TODO: off by 1, but whatever
+ // Off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) {
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
HASH_insert(cctx->hashTable, cctx->lagHash, entry);
putHashOfCurrentPositionFromHash(cctx, hash, sum);
}
+/*
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
const BYTE *pInLimit) {
const BYTE * const pStart = pIn;
}
return (U32)(pIn - pStart);
}
+*/
void LDM_outputConfiguration(void) {
printf("=====================\n");
// ip += sizeof(U64);
}
+void LDM_writeHeader(void *memPtr, U64 compressedSize,
+ U64 decompressedSize) {
+ MEM_write64(memPtr, compressedSize);
+ MEM_write64((BYTE *)memPtr + 8, decompressedSize);
+}
+
void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
LDM_updateLastHashFromNextHash(&cctx);
}
- // HASH_outputTableOffsetHistogram(&cctx);
-
/* Encode the last literals (no more matches). */
{
const U32 lastRun = cctx.iend - cctx.anchor;
return dctx.op - (BYTE *)dst;
}
-// TODO: implement and test hash function
void LDM_test(const BYTE *src) {
(void)src;
}
#ifndef LDM_H
#define LDM_H
-#include <stddef.h> /* size_t */
-
#include "mem.h" // from /lib/common/mem.h
-#define LDM_COMPRESS_SIZE 8
-#define LDM_DECOMPRESS_SIZE 8
-#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE))
+// The number of bytes storing the compressed and decompressed size
+// in the header.
+#define LDM_COMPRESSED_SIZE 8
+#define LDM_DECOMPRESSED_SIZE 8
+#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE))
+
+// The number of bytes storing the offset.
#define LDM_OFFSET_SIZE 4
// Defines the size of the hash table.
// Note that this is not the number of buckets.
// Currently this should be less than WINDOW_SIZE_LOG + 4?
-#define LDM_MEMORY_USAGE 24
-#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
+#define LDM_MEMORY_USAGE 23
+
+// The number of entries in a hash bucket.
+#define HASH_BUCKET_SIZE_LOG 0 // The maximum is 4 for now.
// Defines the lag in inserting elements into the hash table.
#define LDM_LAG 0
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four (and perhaps set to the same value?).
-#define LDM_MIN_MATCH_LENGTH 64
-#define LDM_HASH_LENGTH 64
+#define LDM_MIN_MATCH_LENGTH 16
+#define LDM_HASH_LENGTH 16
// Experimental.
-//:w
//#define TMP_EVICTION
#define TMP_TAG_INSERT
//#define TMP_SIMPLE_LOWER
typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx;
-
/**
* Compresses src into dst.
*
* Outputs compression statistics to stdout.
*/
void LDM_printCompressStats(const LDM_compressStats *stats);
-/**
- * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the
- * LDM_MIN_MATCH_LENGTH bytes from match and also if
- * pIn - pMatch <= LDM_WINDOW_SIZE.
- *
- * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four.
- *
- * Return 1 if valid, 0 otherwise.
- */
-int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch);
-
/**
* Encode the literal length followed by the literals.
void LDM_readHeader(const void *src, U64 *compressedSize,
U64 *decompressedSize);
+/**
+ * Write the compressed and decompressed size.
+ */
+void LDM_writeHeader(void *memPtr, U64 compressedSize,
+ U64 decompressedSize);
+
+/**
+ * Output the configuration used.
+ */
void LDM_outputConfiguration(void);
void LDM_test(const BYTE *src);
//#define ZSTD_SKIP
//#define RUN_CHECKS
-//
-//
+
static const U64 prime8bytes = 11400714785074694791ULL;
/* Hash table stuff */
U32 checksum;
} LDM_hashEntry;
-// TODO: Memory usage
struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog;
U32 numMatches;
U32 minOffset, maxOffset;
- U32 numCollisions;
- U32 numHashInserts;
-
U32 offsetHistogram[32];
U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG];
const BYTE *lagIp;
U64 lagSum;
- U64 numHashInserts;
// DEBUG
const BYTE *DEBUG_setNextHash;
};
struct LDM_hashTable {
- U32 numBuckets; // Number of buckets
- U32 numEntries; // Rename...
- LDM_hashEntry *entries;
+ U32 numBuckets; // The number of buckets.
+ U32 numEntries; // numBuckets * HASH_BUCKET_SIZE.
- BYTE *bucketOffsets;
- // Position corresponding to offset=0 in LDM_hashEntry.
+ LDM_hashEntry *entries;
+ BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position.
};
+
/**
* Create a hash table that can contain size elements.
* The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG.
*
* We count only bytes where pMatch > pBaes and pIn > pAnchor.
*/
-U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
+size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
const BYTE *pMatch, const BYTE *pBase) {
- U64 matchLength = 0;
+ size_t matchLength = 0;
while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
pIn--;
pMatch--;
U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend);
U64 backwardMatchLength, totalMatchLength;
- // For speed.
+ // Only take matches where the forward match length is large enough
+ // for speed.
if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) {
continue;
}
// ip += sizeof(U64);
}
+void LDM_writeHeader(void *memPtr, U64 compressedSize,
+ U64 decompressedSize) {
+ MEM_write64(memPtr, compressedSize);
+ MEM_write64((BYTE *)memPtr + 8, decompressedSize);
+}
+
+
void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
+/**
+ * A "hash" table used in LDM compression.
+ *
+ * This is not exactly a hash table in the sense that inserted entries
+ * are not guaranteed to remain in the hash table.
+ */
+
#ifndef LDM_HASHTABLE_H
#define LDM_HASHTABLE_H
#include "mem.h"
+// The log size of LDM_hashEntry in bytes.
#define LDM_HASH_ENTRY_SIZE_LOG 3
-// TODO: clean up comments
-
typedef U32 hash_t;
typedef struct LDM_hashEntry {
- U32 offset; // TODO: Replace with pointer?
- U32 checksum;
+ U32 offset; // Represents the offset of the entry from offsetBase.
+ U32 checksum; // A checksum to select entries with the same hash value.
} LDM_hashEntry;
typedef struct LDM_hashTable LDM_hashTable;
+/**
+ * Create a table that can contain size elements. This does not necessarily
+ * correspond to the number of hash buckets. The number of hash buckets
+ * is size / (1 << HASH_BUCKET_SIZE_LOG)
+ *
+ * minMatchLength is the minimum match length required in HASH_getBestEntry.
+ *
+ * maxWindowSize is the maximum distance from pIn in HASH_getBestEntry.
+ * The window is defined to be (pIn - offsetBase - offset).
+ */
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase,
U32 minMatchLength, U32 maxWindowSize);
+/**
+ * Return the "best" entry from the table with the same hash and checksum.
+ *
+ * pIn: a pointer to the current input position.
+ * pEnd: a pointer to the maximum input position.
+ * pAnchor: a pointer to the minimum input position.
+ *
+ * This function computes the forward and backward match length from pIn
+ * and writes it to forwardMatchLength and backwardsMatchLength.
+ *
+ * E.g. for the two strings "aaabbbb" "aaabbbb" with pIn and the
+ * entry pointing at the first "b", the forward match length would be
+ * four (representing the "b" matches) and the backward match length would
+ * three (representing the "a" matches before the pointer).
+ */
LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
const hash_t hash,
const U32 checksum,
const BYTE *pIn,
const BYTE *pEnd,
const BYTE *pAnchor,
- U64 *matchLength,
+ U64 *forwardMatchLength,
U64 *backwardsMatchLength);
+/**
+ * Return a hash of the value.
+ */
hash_t HASH_hashU32(U32 value);
/**
* Insert an LDM_hashEntry into the bucket corresponding to hash.
+ *
+ * An entry may be evicted in the process.
*/
void HASH_insert(LDM_hashTable *table, const hash_t hash,
const LDM_hashEntry entry);
*/
U32 HASH_getSize(const LDM_hashTable *table);
+/**
+ * Destroy the table.
+ */
void HASH_destroyTable(LDM_hashTable *table);
/**
//#define RUN_CHECKS
-/* Hash table stuff */
-
typedef U32 hash_t;
typedef struct LDM_hashEntry {
U32 checksum;
} LDM_hashEntry;
-// TODO: Memory usage
struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog;
U32 numMatches;
U32 minOffset, maxOffset;
- U32 numCollisions;
- U32 numHashInserts;
-
U32 offsetHistogram[32];
};
LDM_hashTable *hashTable;
-// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
-
const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */
U32 lastSum;
struct LDM_hashTable {
U32 numBuckets; // Number of buckets
- U32 numEntries; // Rename...
+ U32 numEntries;
LDM_hashEntry *entries;
BYTE *bucketOffsets;
- // Position corresponding to offset=0 in LDM_hashEntry.
};
/**
return ret;
}
-// Maybe we would eventually prefer to have linear rather than
-// exponential buckets.
-/**
-void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) {
- U32 i = 0;
- int buckets[32] = { 0 };
-
- printf("\n");
- printf("Hash table histogram\n");
- for (; i < HASH_getSize(cctx->hashTable); i++) {
- int offset = (cctx->ip - cctx->ibase) -
- HASH_getEntryFromHash(cctx->hashTable, i)->offset;
- buckets[intLog2(offset)]++;
- }
-
- i = 0;
- for (; i < 32; i++) {
- printf("2^%*d: %10u %6.3f%%\n", 2, i,
- buckets[i],
- 100.0 * (double) buckets[i] /
- (double) HASH_getSize(cctx->hashTable));
- }
- printf("\n");
-}
-*/
-
void LDM_printCompressStats(const LDM_compressStats *stats) {
int i = 0;
printf("=====================\n");
cctx->DEBUG_setNextHash = cctx->nextIp;
#endif
-// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH);
cctx->nextSum = updateChecksum(
cctx->lastSum, LDM_HASH_LENGTH,
cctx->lastPosHashed[0],
cctx->nextHash = checksumToHash(cctx->nextSum);
#if LDM_LAG
-// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
if (cctx->ip - cctx->ibase > LDM_LAG) {
cctx->lagSum = updateChecksum(
cctx->lagSum, LDM_HASH_LENGTH,
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
// Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
- /**
- const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
- MEM_read32(cctx->ip) };
- */
#if LDM_LAG
// TODO: off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) {
putHashOfCurrentPositionFromHash(cctx, hash, sum);
}
-U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
- const BYTE *pInLimit) {
- const BYTE * const pStart = pIn;
- while (pIn < pInLimit - 1) {
- BYTE const diff = (*pMatch) ^ *(pIn);
- if (!diff) {
- pIn++;
- pMatch++;
- continue;
- }
- return (U32)(pIn - pStart);
- }
- return (U32)(pIn - pStart);
-}
-
void LDM_outputConfiguration(void) {
printf("=====================\n");
printf("Configuration\n");
// ip += sizeof(U64);
}
+void LDM_writeHeader(void *memPtr, U64 compressedSize,
+ U64 decompressedSize) {
+ MEM_write64(memPtr, compressedSize);
+ MEM_write64((BYTE *)memPtr + 8, decompressedSize);
+}
+
+
void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
#include "ldm.h"
#include "zstd.h"
-#define DEBUG
//#define TEST
/* Compress file given by fname and output to oname.
* Returns 0 if successful, error code otherwise.
*
- * TODO: This might seg fault if the compressed size is > the decompress
+ * This might seg fault if the compressed size is > the decompress
* size due to the mmapping and output file size allocated to be the input size
* The compress function should check before writing or buffer writes.
*/
struct timeval tv1, tv2;
double timeTaken;
+
/* Open the input file. */
if ((fdin = open(fname, O_RDONLY)) < 0) {
perror("Error in file opening");
}
maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE);
+
// Handle case where compressed size is > decompressed size.
// The compress function should check before writing or buffer writes.
maxCompressedSize += statbuf.st_size / 255;
compressedSize = LDM_HEADER_SIZE +
LDM_compress(src, statbuf.st_size,
dst + LDM_HEADER_SIZE, maxCompressedSize);
+
gettimeofday(&tv2, NULL);
// Write compress and decompress size to header
// TODO: should depend on LDM_DECOMPRESS_SIZE write32
- memcpy(dst, &compressedSize, 8);
- memcpy(dst + 8, &(statbuf.st_size), 8);
-
-#ifdef DEBUG
- printf("Compressed size: %zu\n", compressedSize);
- printf("Decompressed size: %zu\n", (size_t)statbuf.st_size);
-#endif
+ LDM_writeHeader(dst, compressedSize, statbuf.st_size);
// Truncate file to compressedSize.
ftruncate(fdout, compressedSize);
+
printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname,
(size_t)statbuf.st_size, (size_t)compressedSize, oname,
(statbuf.st_size) / (double)compressedSize,
timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 +
(double) (tv2.tv_sec - tv1.tv_sec),
- printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n",
+ printf("Total compress time = %.3f seconds, Average scanning speed: %.3f MB/s\n",
timeTaken,
((double)statbuf.st_size / (double) (1 << 20)) / timeTaken);