Allow user to specify memory limit for dictionary training (2925/head)
author     Elliot Gorokhovsky <embg@fb.com>
           Fri, 10 Dec 2021 21:19:40 +0000 (16:19 -0500)
committer  Elliot Gorokhovsky <embg@fb.com>
           Tue, 14 Dec 2021 19:29:01 +0000 (14:29 -0500)
programs/dibio.c
programs/dibio.h
programs/zstd.1.md
programs/zstdcli.c
tests/playTests.sh

programs/dibio.c
index e7fb905ec0b7df74c0e28fb8f51194970c085528..04860dbbfa6726f43e9ab8792bf580c8a20874b3 100644 (file)
@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
 {
     fileStats fs;
     size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
         /* Limit the size of the training data to 2GB */
         /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
         loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+        if (memLimit != 0) {
+            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                (unsigned)(memLimit / (1 MB)));
+            loadedSize = (size_t)MIN(loadedSize, memLimit);
+        }
         srcBuffer = malloc(loadedSize+NOISELENGTH);
         sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
     }
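
For orientation, the new check composes with the existing ceiling rather than replacing it: loadedSize is first bounded by available memory, the total sample size, and the 2 GB MAX_SAMPLES_SIZE, and only then clamped to the user's limit; memLimit == 0 keeps the previous behavior. A minimal standalone sketch of that ordering (types simplified; MIN and MAX_SAMPLES_SIZE are redefined here only to keep the sketch self-contained):

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX_SAMPLES_SIZE ((int64_t)2 * 1024 * 1024 * 1024)  /* the 2 GB default ceiling */

    /* Sketch of the clamping order in DiB_trainFromFiles(): bound by available
     * memory, total sample size, and the 2 GB ceiling first, then by the user's
     * --memory limit; memLimit == 0 means "no manual limit". */
    static size_t clampLoadedSize(int64_t maxMem, int64_t totalSizeToLoad, unsigned memLimit)
    {
        size_t loadedSize = (size_t)MIN(MIN(maxMem, totalSizeToLoad), MAX_SAMPLES_SIZE);
        if (memLimit != 0)
            loadedSize = MIN(loadedSize, (size_t)memLimit);
        return loadedSize;
    }

    int main(void)
    {
        /* 12 MB corpus, ample memory, --memory=5MB  ->  5 MB loaded */
        printf("%zu\n", clampLoadedSize((int64_t)1 << 33, (int64_t)12 << 20, 5u << 20));
        return 0;
    }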
programs/dibio.h
index 03ec80e595ba9750bfd3b300059e1c7927ac8ddb..666c1e661800c7d34b140e2af19c8799f3ddcac6 100644 (file)
@@ -34,6 +34,6 @@
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
 
 #endif
programs/zstd.1.md
index ef37fef3225f203efee1e70ab6ebb14f9b58b674..e343ec0448b4151a9bc502629ab8389782cf6218 100644 (file)
@@ -190,6 +190,10 @@ the last one takes effect.
 
     This is also used during compression when using with --patch-from=. In this case,
     this parameter overrides that maximum size allowed for a dictionary. (128 MB).
+
+    Additionally, this can be used to limit memory for dictionary training. This parameter
+    overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+    and ignore the rest.
 * `--stream-size=#` :
     Sets the pledged source size of input coming from a stream. This value must be exact, as it
     will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
     resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
     Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
 * `--dictID=#`:
     A dictionary ID is a locally unique ID
     that a decoder can use to verify it is using the right dictionary.
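
The flag documented above is CLI-only; libzstd's dictionary builder exposes no equivalent knob, so a program driving ZDICT_trainFromBuffer() directly would enforce a similar cap simply by not loading samples past the limit. A hedged, self-contained sketch under those assumptions (the memset filler only keeps it compilable, real corpus data is needed for meaningful training, and 112640 bytes is believed to match the CLI's default --maxdict):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <zdict.h>   /* libzstd's dictionary builder; link with -lzstd */

    int main(void)
    {
        size_t const memLimit    = 5u << 20;   /* mirror --memory=5MB */
        size_t const sampleSize  = 64u << 10;  /* 64 KB synthetic samples */
        unsigned const nbOffered = 1000;       /* ~62 MB offered, well over the cap */

        unsigned char* samples = malloc(memLimit);
        size_t* sampleSizes = malloc(nbOffered * sizeof(size_t));
        if (!samples || !sampleSizes) return 1;

        /* Load samples only until the cap is reached; ignore the rest,
         * as the CLI does with --memory. */
        size_t loaded = 0;
        unsigned nbSamples = 0;
        for (unsigned i = 0; i < nbOffered && loaded + sampleSize <= memLimit; i++) {
            memset(samples + loaded, (int)(i & 0xFF), sampleSize);  /* filler: use real data */
            sampleSizes[nbSamples++] = sampleSize;
            loaded += sampleSize;
        }

        unsigned char dict[112640];  /* 110 KB: believed to match the CLI default --maxdict */
        size_t const dictSize = ZDICT_trainFromBuffer(dict, sizeof dict,
                                                      samples, sampleSizes, nbSamples);
        if (ZDICT_isError(dictSize))
            fprintf(stderr, "training failed: %s\n", ZDICT_getErrorName(dictSize));
        else
            printf("trained %zu-byte dictionary from %u samples (%zu bytes loaded)\n",
                   dictSize, nbSamples, loaded);
        free(samples); free(sampleSizes);
        return 0;
    }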
programs/zstdcli.c
index 4d1978c80c689bb0fa0caf0ccbb014478f0f36cb..bfe18c0c1ba334cd17125eb91eed827f55fbbc0c 100644 (file)
@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = (unsigned)nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
         } else if (dict == fastCover) {
             int const optimize = !fastCoverParams.k || !fastCoverParams.d;
             fastCoverParams.nbThreads = (unsigned)nbWorkers;
             fastCoverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
         }
 #else
         (void)dictCLevel; (void)dictSelect; (void)dictID;  (void)maxDictSize; /* not used when ZSTD_NODICT set */
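
Note that the values given to -M#/--memory=# accept the usual size suffixes: the new tests below pass 10K and 5MB. A simplified sketch of that kind of suffix parsing; parseSizeSuffix is a hypothetical stand-in, not the actual zstdcli.c helper:

    #include <stdio.h>

    /* Hypothetical stand-in for the CLI's size parser: reads decimal digits,
     * then scales by an optional K/M suffix (binary units), accepting and
     * ignoring a trailing 'B' as in "5MB". */
    static unsigned parseSizeSuffix(const char* s)
    {
        unsigned v = 0;
        while (*s >= '0' && *s <= '9') v = v * 10 + (unsigned)(*s++ - '0');
        if (*s == 'K' || *s == 'k') v <<= 10;
        else if (*s == 'M' || *s == 'm') v <<= 20;
        return v;
    }

    int main(void)
    {
        printf("%u %u\n", parseSizeSuffix("10K"), parseSizeSuffix("5MB"));  /* 10240 5242880 */
        return 0;
    }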
tests/playTests.sh
index f6b6ac8f2039caf9b20f9ae368bf5946a1d0bf20..ebe7bf289f37a0e789a95f7f47408ef16bd91918 100755 (executable)
@@ -1029,6 +1029,13 @@ then
 fi
 rm -f tmp* dictionary
 
+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr
 
 println "\n===>  fastCover dictionary builder : advanced options "
 TESTFILE="$PRGDIR"/zstdcli.c
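
Of the two stderr lines the new test greps for, only the first is added in the dibio.c hunk above; the "Training samples set too large" message is emitted elsewhere in dibio.c and is not part of this diff. A self-contained sketch of the arithmetic behind both messages, with fprintf standing in for zstd's DISPLAYLEVEL:

    #include <stdio.h>
    #include <stdint.h>

    #define MB (1u << 20)

    /* Given the test's 12 MB corpus (datagen -g12M) and --memory=5MB, print the
     * two warnings the new playTests.sh case greps for. */
    int main(void)
    {
        int64_t const totalSizeToLoad = 12 * (int64_t)MB;
        unsigned const memLimit = 5 * MB;
        size_t const loadedSize = totalSizeToLoad < (int64_t)memLimit
                                ? (size_t)totalSizeToLoad : memLimit;

        fprintf(stderr, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                memLimit / MB);
        if ((int64_t)loadedSize < totalSizeToLoad)
            fprintf(stderr, "Training samples set too large (%u MB); training on %u MB only...\n",
                    (unsigned)(totalSizeToLoad / MB), (unsigned)(loadedSize / MB));
        return 0;
    }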