[cover] Improvements for small or homogeneous data

author Nick Terrell <terrelln@fb.com>

Fri, 22 Mar 2019 19:28:55 +0000 (12:28 -0700)

committer Nick Terrell <terrelln@fb.com>

Fri, 22 Mar 2019 21:14:46 +0000 (14:14 -0700)
author Nick Terrell <terrelln@fb.com>
Fri, 22 Mar 2019 19:28:55 +0000 (12:28 -0700)
committer Nick Terrell <terrelln@fb.com>
Fri, 22 Mar 2019 21:14:46 +0000 (14:14 -0700)
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c

index b55bfb510b728586a0219ad073eae7a3651a9e64..180a68ae8c6dab2a9b29eed12295c5cf972d58ff 100644 (file)
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -627,6 +627,38 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
    return 1;
  }
  
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers)
+{ /* Emits a warning through DISPLAYLEVEL when the corpus is less than 10x the maximum dictionary size. */
+  const double ratio = (double)nbDmers / maxDictSize; /* corpus size relative to the dictionary size */
+  if (ratio >= 10) { /* corpus is at least 10x the dictionary: nothing to warn about */
+      return;
+  }
+  DISPLAYLEVEL(1, "WARNING: The maximum dictionary size %u is too large "
+                  "compared to the source size %u! "
+                  "size(source)/size(dictionary) = %f, but it should be >= "
+                  "10! This may lead to a subpar dictionary! We recommend "
+                  "training on sources at least 10x, and up to 100x the "
+                  "size of the dictionary!\n", (U32)maxDictSize,
+                  (U32)nbDmers, ratio); /* NOTE(review): both sizes are narrowed to U32 for %u; presumably values beyond the U32 range would print truncated - confirm */
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{ /* Picks an epoch count and a per-epoch size such that size * num <= nbDmers. */
+  const U32 minEpochSize = k * 10; /* smallest epoch size accepted from the first guess */
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes); /* first guess, never fewer than one epoch; NOTE(review): k and passes must be non-zero here */
+  epochs.size = nbDmers / epochs.num; /* integer division: leftover dmers belong to no epoch */
+  if (epochs.size >= minEpochSize) { /* the first guess already yields large enough epochs */
+      assert(epochs.size * epochs.num <= nbDmers);
+      return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers); /* otherwise use the minimum epoch size, capped at the whole corpus */
+  epochs.num = nbDmers / epochs.size; /* NOTE(review): epochs.size is 0 when nbDmers == 0, so this divides by zero - confirm callers never pass an empty corpus */
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
  /**
   * Given the prepared context build the dictionary.
   */
@@ -636,28 +668,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                      ZDICT_cover_params_t parameters) {
    BYTE *const dict = (BYTE *)dictBuffer;
    size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
-  const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
    size_t epoch;
    DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-                epochs, epochSize);
+                (U32)epochs.num, (U32)epochs.size);
    /* Loop through the epochs until there are no more segments or the dictionary
     * is full.
     */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
      size_t segmentSize;
      /* Select a segment */
      COVER_segment_t segment = COVER_selectSegment(
          ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, so continue for some time.
+     */
      if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
      }
+    zeroScoreRun = 0;
      /* Trim the segment if necessary and if it is too small then we are done */
      segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
      if (segmentSize < parameters.d) {
@@ -706,6 +744,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                        parameters.d, parameters.splitPoint)) {
      return ERROR(GENERIC);
    }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
    if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
      DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
      COVER_ctx_destroy(&ctx);
@@ -977,6 +1016,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    unsigned k;
    COVER_best_t best;
    POOL_ctx *pool = NULL;
+  int warned = 0;
  
    /* Checks */
    if (splitPoint <= 0 || splitPoint > 1) {
@@ -1019,6 +1059,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
        POOL_free(pool);
        return ERROR(GENERIC);
      }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
+      warned = 1;
+    }
      /* Loop through k reusing the same context */
      for (k = kMinK; k <= kMaxK; k += kStepSize) {
        /* Prepare the arguments */
diff --git a/lib/dictBuilder/cover.h b/lib/dictBuilder/cover.h

index 82e2e1cea43cf17051dc2fbdb419e42cb0312a12..71c520e9e6207dfd856cee68e1753f174ed445db 100644 (file)
--- a/lib/dictBuilder/cover.h
+++ b/lib/dictBuilder/cover.h
@@ -38,6 +38,35 @@ typedef struct {
    U32 score;
  } COVER_segment_t;
  
+/**
+ * Number of epochs and size of each epoch.
+ */
+typedef struct {
+  U32 num;
+  U32 size;
+} COVER_epoch_info_t;
+
+/**
+ * Computes the number of epochs and the size of each epoch.
+ * We will make sure that each epoch gets at least 10 * k bytes.
+ *
+ * The COVER algorithms divide the data up into epochs of equal size and
+ * select one segment from each epoch.
+ *
+ * @param maxDictSize The maximum allowed dictionary size.
+ * @param nbDmers     The number of dmers we are training on.
+ * @param k           The parameter k (segment size).
+ * @param passes      The target number of passes over the dmer corpus.
+ *                    More passes means a better dictionary.
+ */
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
+                                       U32 k, U32 passes);
+
+/**
+ * Warns the user when their corpus is too small.
+ */
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers);
+
  /**
   *  Checks total compressed size of a dictionary
   */
diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c

index c289c0690145d92f925f72b1e22951f36a9c7889..8cb89c93886d320bd5110cc37ea25c76f32a7676 100644 (file)
--- a/lib/dictBuilder/fastcover.c
+++ b/lib/dictBuilder/fastcover.c
@@ -386,29 +386,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
  {
    BYTE *const dict = (BYTE *)dictBuffer;
    size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
-  const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+  const size_t maxZeroScoreRun = 10;
+  size_t zeroScoreRun = 0;
    size_t epoch;
    DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-                epochs, epochSize);
+                (U32)epochs.num, (U32)epochs.size);
    /* Loop through the epochs until there are no more segments or the dictionary
     * is full.
     */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
      size_t segmentSize;
      /* Select a segment */
      COVER_segment_t segment = FASTCOVER_selectSegment(
          ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
  
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, so continue for some time.
+     */
      if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
      }
+    zeroScoreRun = 0;
  
      /* Trim the segment if necessary and if it is too small then we are done */
      segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
@@ -564,6 +570,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
        DISPLAYLEVEL(1, "Failed to initialize context\n");
        return ERROR(GENERIC);
      }
+    COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
      /* Build the dictionary */
      DISPLAYLEVEL(2, "Building dictionary\n");
      {
@@ -616,6 +623,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
      unsigned k;
      COVER_best_t best;
      POOL_ctx *pool = NULL;
+    int warned = 0;
      /* Checks */
      if (splitPoint <= 0 || splitPoint > 1) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
@@ -664,6 +672,10 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
          POOL_free(pool);
          return ERROR(GENERIC);
        }
+      if (!warned) {
+        COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
+        warned = 1;
+      }
        /* Loop through k reusing the same context */
        for (k = kMinK; k <= kMaxK; k += kStepSize) {
          /* Prepare the arguments */
author	Nick Terrell <terrelln@fb.com>
	Fri, 22 Mar 2019 19:28:55 +0000 (12:28 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Fri, 22 Mar 2019 21:14:46 +0000 (14:14 -0700)
lib/dictBuilder/cover.c		patch \| blob \| blame \| history
lib/dictBuilder/cover.h		patch \| blob \| blame \| history
lib/dictBuilder/fastcover.c		patch \| blob \| blame \| history