/*! FSE_normalizeCount():
normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+ useLowProbCount is a bool param: set it to 1 to use count=-1 for low-probability symbols,
+ or to 0 to use count=1 instead; the latter speeds up FSE_readNCount() and FSE_buildDTable().
@return : tableLog,
or an errorCode, which can be tested using FSE_isError() */
FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
- const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+ const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount);
/*! FSE_NCountWriteBound():
Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
}
-// TODO: Emit -1 based on # of symbols
-#define LOW_PROB 0
-
/* Secondary normalization method.
To be used when primary method fails. */
-static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount)
{
short const NOT_YET_ASSIGNED = -2;
U32 s;
norm[s]=0;
continue;
}
- if (LOW_PROB && count[s] <= lowThreshold) {
- norm[s] = -1;
+ if (count[s] <= lowThreshold) {
+ norm[s] = lowProbCount;
distributed++;
total -= count[s];
continue;
size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
const unsigned* count, size_t total,
- unsigned maxSymbolValue)
+ unsigned maxSymbolValue, unsigned useLowProbCount)
{
/* Sanity checks */
if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
{ static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+ short const lowProbCount = useLowProbCount ? -1 : 1;
U64 const scale = 62 - tableLog;
U64 const step = ((U64)1<<62) / total; /* <== here, one division ! */
U64 const vStep = 1ULL<<(scale-20);
for (s=0; s<=maxSymbolValue; s++) {
if (count[s] == total) return 0; /* rle special case */
if (count[s] == 0) { normalizedCounter[s]=0; continue; }
- if (LOW_PROB && count[s] <= lowThreshold) {
- normalizedCounter[s] = -1;
+ if (count[s] <= lowThreshold) {
+ normalizedCounter[s] = lowProbCount;
stillToDistribute--;
} else {
short proba = (short)((count[s]*step) >> scale);
} }
if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
/* corner case, need another normalization method */
- size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+ size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
if (FSE_isError(errorCode)) return errorCode;
}
else normalizedCounter[largest] += (short)stillToDistribute;
}
tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
- CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue, /* useLowProbCount */ srcSize >= 2048) );
/* Write table description header */
{ CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
}
tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
- CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) );
/* Write table description header */
{ CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) );
return maxSymbolValue;
}
+/**
+ * Returns true if low probability symbols should use ncount=-1,
+ * and false if they should use ncount=1 instead.
+ */
+static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
+{
+ /* Heuristic: This should cover most blocks <= 16K and
+ * start to fade out after 16K to about 32K depending on
+ * compressibility.
+ */
+ return nbSeq >= 2048;
+}
+
/**
* Returns the cost in bytes of encoding the normalized count header.
* Returns an error if any of the helper functions return an error.
BYTE wksp[FSE_NCOUNTBOUND];
S16 norm[MaxSeq + 1];
const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
- FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max), "");
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), "");
return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
}
nbSeq_1--;
}
assert(nbSeq_1 > 1);
- FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max), "");
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "");
{ size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), "");
U16* symbolNext = (U16*)wksp;
BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+ U32 highThreshold = tableSize - 1;
- assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
- (void)wkspSize;
/* Sanity Checks */
assert(maxSymbolValue <= MaxSeq);
assert(tableLog <= MaxFSELog);
- U32 highThreshold = tableSize - 1;
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+ (void)wkspSize;
/* Init, lay down lowprob symbols */
{ ZSTD_seqSymbol_header DTableH;
DTableH.tableLog = tableLog;
/* note : the result of this phase should be used to better appreciate the impact on statistics */
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
if (FSE_isError(errorCode)) {
eSize = errorCode;
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
Offlog = (U32)errorCode;
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
if (FSE_isError(errorCode)) {
eSize = errorCode;
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
mlLog = (U32)errorCode;
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
if (FSE_isError(errorCode)) {
eSize = errorCode;
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
const void* const contentStart = (const char*)dict + flatdictSize;
size_t const target_nodict_cSize[22+1] = { 3840, 3770, 3870, 3830, 3770,
3770, 3770, 3770, 3750, 3750,
- 3740, 3670, 3670, 3660, 3660,
+ 3742, 3670, 3670, 3660, 3660,
3660, 3660, 3660, 3660, 3660,
3660, 3660, 3660 };
size_t const target_wdict_cSize[22+1] = { 2830, 2890, 2890, 2820, 2940,
- 2950, 2950, 2920, 2900, 2890,
+ 2950, 2950, 2921, 2900, 2891,
2910, 2910, 2910, 2770, 2760,
2750, 2750, 2750, 2750, 2750,
2750, 2750, 2750 };
/* Calling FSE_normalizeCount() on a uniform distribution should not
* cause a division by zero.
*/
- FSE_normalizeCount(norm, tableLog, count, nbSeq, maxSymbolValue);
+ FSE_normalizeCount(norm, tableLog, count, nbSeq, maxSymbolValue, /* useLowProbCount */ 1);
}
DISPLAYLEVEL(3, "OK \n");
#ifdef ZSTD_MULTITHREAD