Add split=# to cli

author Jennifer Liu <jenniferliu620@fb.com>

Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)

committer Jennifer Liu <jenniferliu620@fb.com>

Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)
author Jennifer Liu <jenniferliu620@fb.com>
Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)
committer Jennifer Liu <jenniferliu620@fb.com>
Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c

index 53f3d79a8bfd4c1acd254de7908b8e7123aeb404..a3195aa77ca08449e316a62b1d13e098d5dbcc28 100644 (file)
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
    /* Check if there's training sample */
    if (nbTrainSamples < 1) {
      DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
      return 0;
    }
    /* Check if there's testing sample when splitPoint is nonzero */
    if (nbTestSamples < 1 && splitPoint < 1.0) {
      DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
+    return 0;
+  }
+  if (nbTrainSamples + nbTestSamples != nbSamples) {
+    DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
      return 0;
    }
    /* Zero the context */
diff --git a/programs/zstd.1.md b/programs/zstd.1.md

index 4b3818141bd3960c8d521a9fa897d6d83c7461e3..c45bdb3860a8842f4679fc3f5a4af0643e6f810d 100644 (file)
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
      This compares favorably to 4 bytes default.
      However, it's up to the dictionary manager to not assign twice the same ID to
      2 different dictionaries.
-* `--train-cover[=k#,d=#,steps=#]`:
+* `--train-cover[=k#,d=#,steps=#,split=#]`:
      Select parameters for the default dictionary builder algorithm named cover.
      If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
      If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
      If _steps_ is not specified, then the default value of 40 is used.
+    _split_ is the percentage of samples used for training, the remainder being used for testing; if _split_ is not specified, then the default value of 80 is used.
      Requires that _d_ <= _k_.
  
      Selects segments of size _k_ with highest score to put in the dictionary.
@@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.
  
      `zstd --train-cover=k=50 FILEs`
  
+    `zstd --train-cover=k=50,split=60 FILEs`
+
  * `--train-legacy[=selectivity=#]`:
      Use legacy dictionary builder algorithm with the given dictionary
      _selectivity_ (default: 9).
diff --git a/programs/zstdcli.c b/programs/zstdcli.c

index ae8c9cba9aa8034aa758c4f1a1d6db27774221b6..68404d66036711873bab1ff2e71938f9ff684c4c 100644 (file)
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
  static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
  {
      memset(params, 0, sizeof(*params));
+    unsigned splitPercentage = 100;
      for (; ;) {
          if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
          if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
          if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
          return 0;
      }
      if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%d\n", params->k, params->d, params->steps, splitPercentage);
      return 1;
  }
author	Jennifer Liu <jenniferliu620@fb.com>
	Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)
committer	Jennifer Liu <jenniferliu620@fb.com>
	Sat, 30 Jun 2018 00:54:41 +0000 (17:54 -0700)
lib/dictBuilder/cover.c		patch \| blob \| blame \| history
programs/zstd.1.md		patch \| blob \| blame \| history
programs/zstdcli.c		patch \| blob \| blame \| history