Use 32 buckets for log2 bucketing in huffman sort

author Sen Huang <senhuang96@fb.com>

Thu, 9 Sep 2021 15:28:12 +0000 (08:28 -0700)

committer senhuang42 <senhuang96@fb.com>

Mon, 13 Sep 2021 16:29:16 +0000 (12:29 -0400)
author Sen Huang <senhuang96@fb.com>
Thu, 9 Sep 2021 15:28:12 +0000 (08:28 -0700)
committer senhuang42 <senhuang96@fb.com>
Mon, 13 Sep 2021 16:29:16 +0000 (12:29 -0400)
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c

index 67b7ef956fc208974cdaf6a1f41926268fa65680..db11ab3dcfb7d875ad5a42be0a779c18a81f8784 100644 (file)
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -435,7 +435,7 @@ typedef struct {
  typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
  
  /* Number of buckets available for HUF_sort() */
-#define RANK_POSITION_TABLE_SIZE 128
+#define RANK_POSITION_TABLE_SIZE 192
  
  typedef struct {
    huffNodeTable huffNodeTbl;
@@ -444,18 +444,15 @@ typedef struct {
  
  /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
   * Strategy is to use as many buckets as possible for representing distinct
- * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
- * using log2 bucketing.
+ * counts while using the remainder to represent all "large" counts.
   * 
- * To satisfy this requirement for 128 buckets, we can do the following:
- * Let buckets 0-114 represent distinct counts of [0, 114]
- * Let buckets 115 to 126 represent counts of [115, HUF_BLOCKSIZE_MAX]. (the final bucket 127 must remain empty)
- * 
- * Note that we don't actually need 17 buckets (assuming 2^17 maxcount) for log2 bucketing since
- * the first few buckets in the log2 bucketing representation are already covered by the distinct count bucketing.
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let each bucket below RANK_POSITION_DISTINCT_COUNT_CUTOFF represent one distinct count (bucket index == count).
+ * Let the remaining buckets represent all larger counts (up to 2^RANK_POSITION_MAX_COUNT_LOG) using log2 bucketing.
   */
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN)
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1) /* == 158 for a table size of 192; parenthesized so it expands safely inside larger expressions */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN)) /* == 158 + BIT_highbit32(158) */
  
  /* Return the appropriate bucket index for a given count. See definition of
   * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
author	Sen Huang <senhuang96@fb.com>
	Thu, 9 Sep 2021 15:28:12 +0000 (08:28 -0700)
committer	senhuang42 <senhuang96@fb.com>
	Mon, 13 Sep 2021 16:29:16 +0000 (12:29 -0400)