From: Jennifer Liu Date: Thu, 26 Jul 2018 20:53:13 +0000 (-0700) Subject: Add array to keep track of frequency within active segment, fix malloc bug, update... X-Git-Tag: v0.0.29~58^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3b163e0b5b5f9eec427b87001483c3b627c95a8f;p=thirdparty%2Fzstd.git Add array to keep track of frequency within active segment, fix malloc bug, update benchmarking result --- diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 478d8793e..07d65b08c 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -14,42 +14,46 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: +d=8 +f=23 +freq[i] = 0 when dmer added to best segment + github: | Algorithm | Speed(sec) | Compression Ratio | -| ------------------|:-------------:| ------------------:| -| nodict | 0.000004 | 2.999642 | -| random | 0.148247 | 8.786957 | -| cover | 56.331553 | 10.641263 | -| legacy | 0.917595 | 8.989482 | -| fastCover(opt) | 13.169979 | 10.215174 | -| fastCover(k=200) | 2.692406 | 8.657219 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000007 | 2.999642 | +| random | 0.150258 | 8.786957 | +| cover | 60.388853 | 10.641263 | +| legacy | 0.965050 | 8.989482 | +| fastCover(opt) | 84.968131 | 10.614747 | +| fastCover(k=200) | 6.465490 | 9.484150 | hg-commands | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000007 | 2.425291 | -| random | 0.093990 | 3.489515 | -| cover | 58.602385 | 4.131136 | -| legacy | 0.865683 | 3.911896 | -| fastCover(opt) | 9.404134 | 3.977229 | -| fastCover(k=200) | 1.037434 | 3.810326 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000005 | 2.425291 | +| random | 0.084348 | 3.489515 | +| cover | 60.144894 | 4.131136 | +| legacy | 0.831981 | 3.911896 | +| fastCover(opt) | 59.030437 | 4.157595 | +| fastCover(k=200) | 3.702932 | 4.134222 | hg-changelog | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000022 | 1.377613 | -| random | 0.551539 | 2.096785 | -| cover | 221.370056 | 2.188654 | -| legacy | 2.405923 | 2.058273 | -| fastCover(opt) | 49.526246 | 2.124185 | -| fastCover(k=200) | 9.746872 | 2.114674 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000004 | 1.377613 | +| random | 0.555964 | 2.096785 | +| cover | 214.423753 | 2.188654 | +| legacy | 2.180249 | 2.058273 | +| fastCover(opt) | 102.261452 | 2.180347 | +| fastCover(k=200) | 11.81039 | 2.170673 | hg-manifest | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000019 | 1.866385 | -| random | 1.083536 | 2.309485 | -| cover | 928.894887 | 2.582597 | -| legacy | 9.110371 | 2.506775 | -| fastCover(opt) | 116.508270 | 2.525689 | -| fastCover(k=200) | 12.176555 | 2.472221 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000006 | 1.866385 | +| random | 1.063974 | 2.309485 | +| cover | 909.101849 | 2.582597 | +| legacy | 8.706580 | 2.506775 | +| fastCover(opt) | 188.598079 | 2.596761 | +| fastCover(k=200) | 13.392734 | 2.592985 | diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index abd592cd8..6f990e0c2 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -48,7 +48,7 @@ static clock_t g_time = 0; /*-************************************* -* Hash Function +* Hash Functions ***************************************/ static const U64 prime6bytes = 227718039650203ULL; static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } @@ -58,6 +58,7 @@ static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + /** * Hash the d-byte value pointed to by p and mod 2^f */ @@ -140,29 +141,41 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, activeSegment.begin = begin; activeSegment.end = begin; activeSegment.score = 0; - /* Slide the activeSegment through the whole epoch. - * Save the best segment in bestSegment. - */ - while (activeSegment.end < end) { - /* Get hash value of current dmer */ - const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); - /* Add frequency of this index to score */ - activeSegment.score += freqs[index]; - /* Increment end of segment */ - activeSegment.end += 1; - /* If the window is now too large, drop the first position */ - if (activeSegment.end - activeSegment.begin == dmersInK + 1) { - /* Get hash value of the dmer to be eliminated from active segment */ - const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); - /* Subtract frequency of this index from score */ - activeSegment.score -= freqs[delIndex]; - /* Increment start of segment */ - activeSegment.begin += 1; - } - /* If this segment is the best so far save it */ - if (activeSegment.score > bestSegment.score) { - bestSegment = activeSegment; + { + /* Keep track of number of times an index has been seen in current segment */ + U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16)); + memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs)); + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. + */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); + /* Add frequency of this index to score if this is the first occurence of index in active segment */ + if (currfreqs[index] == 0) { + activeSegment.score += freqs[index]; + } + currfreqs[index] += 1; + /* Increment end of segment */ + activeSegment.end += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); + currfreqs[delIndex] -= 1; + /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ + if (currfreqs[delIndex] == 0) { + activeSegment.score -= freqs[delIndex]; + } + /* Increment start of segment */ + activeSegment.begin += 1; + } + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } } + free(currfreqs); } { /* Trim off the zero frequency head and tail from the segment. */ @@ -185,7 +198,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 pos; for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); - freqs[i] = freqs[i]/2; + freqs[i] = 0; } } return bestSegment; @@ -245,12 +258,12 @@ static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) { /** * Calculate for frequency of hash value of each dmer in ctx->samples */ -static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ +static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ /* inCurrSample keeps track of this hash value has already be seen in previous dmers in the same sample*/ - size_t* inCurrSample = (size_t *)malloc((1<nbTrainSamples; i++) { - memset(inCurrSample, 0, (1 << f)); /* Reset inCurrSample for each sample */ + memset(inCurrSample, 0, (1 << f) * sizeof(*inCurrSample)); /* Reset inCurrSample for each sample */ size_t currSampleStart = ctx->offsets[i]; size_t currSampleEnd = ctx->offsets[i+1]; start = currSampleStart; @@ -338,7 +351,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer, memset(ctx->freqs, 0, (1 << f) * sizeof(U32)); DISPLAYLEVEL(2, "Computing frequencies\n"); - FASTCOVER_getFrequency(ctx->freqs, f, ctx); + FASTCOVER_computeFrequency(ctx->freqs, f, ctx); return 1; } diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c index 260eeb281..f286b0506 100644 --- a/contrib/experimental_dict_builders/fastCover/main.c +++ b/contrib/experimental_dict_builders/fastCover/main.c @@ -165,7 +165,7 @@ int main(int argCount, const char* argv[]) params.splitPoint = (double)split/100; /* Build dictionary */ - sampleInfo* info= getSampleInfo(filenameTable, + sampleInfo* info = getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, ¶ms); diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c index 3f3a6ca70..3ad885746 100644 --- a/contrib/experimental_dict_builders/randomDictBuilder/main.c +++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c @@ -149,7 +149,7 @@ int main(int argCount, const char* argv[]) params.zParams = zParams; params.k = k; - sampleInfo* info= getSampleInfo(filenameTable, + sampleInfo* info = getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, ¶ms);