runtime weight distribution table

author Yann Collet <cyan@fb.com>

Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)

committer Yann Collet <cyan@fb.com>

Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)
author Yann Collet <cyan@fb.com>
Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)
committer Yann Collet <cyan@fb.com>
Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)
diff --git a/programs/lorem.c b/programs/lorem.c

index 8f2596e79b993fa2102ab40c5212b0a34c5991d4..49408b5a1e3b9e70344e9c39185e0f2f1e01db0c 100644 (file)
--- a/programs/lorem.c
+++ b/programs/lorem.c
@@ -38,7 +38,7 @@
  #define WORD_MAX_SIZE 20
  
  /* Define the word pool */
-static const char* words[] = {
+static const char* kWords[] = {
      "lorem",        "ipsum",      "dolor",       "sit",          "amet",
      "consectetur",  "adipiscing", "elit",        "sed",          "do",
      "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
@@ -75,29 +75,56 @@ static const char* words[] = {
      "quam",         "nihil",      "molestiae",   "illum",        "fugiat",
      "quo",          "pariatur"
  };
+static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
  
-/* simple 1-dimension distribution that favors small words :
- * 1 letter : weight 3
- * 2-3 letters : weight 2
- * 4+ letters : weight 1
+/* simple 1-dimension distribution, based on word's length, favors small words
   */
-static const int distrib[] = {
-    0,   1,   2,   3,   3,   4,   5,   6,   7,   8,   8,   9,   9,   10,  11,
-    12,  13,  13,  14,  15,  15,  16,  17,  18,  19,  19,  20,  21,  22,  23,
-    24,  25,  26,  26,  26,  27,  28,  29,  30,  31,  32,  33,  34,  34,  35,
-    36,  37,  38,  39,  40,  41,  41,  42,  43,  43,  44,  45,  45,  46,  47,
-    48,  49,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  58,  59,  60,
-    60,  61,  62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  72,
-    73,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  84,  85,
-    86,  87,  88,  89,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
-    100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112,
-    113, 114, 115, 116, 117, 118, 119, 129, 121, 122, 123, 124, 125, 126, 127,
-    128, 128, 129, 129, 130, 131, 132, 133, 134, 135, 136, 136, 137, 138, 139,
-    140, 141, 142, 143, 144, 145, 146, 146, 147, 148, 149, 150, 151, 152, 153,
-    154, 155, 156, 156, 157, 157, 158, 159, 160, 161, 161, 162, 163, 164, 165,
-    166, 167, 168, 169, 170, 170, 171,
-};
-static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
+static const int kWeights[]      = { 0, 8, 6, 4, 3, 2 };
+static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
+
+#define DISTRIB_SIZE_MAX 500
+static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
+static unsigned g_distribCount         = 0;
+
+static void countFreqs(
+        const char* words[],
+        size_t nbWords,
+        const int* weights,
+        unsigned long nbWeights)
+{
+    unsigned total = 0;
+    size_t w;
+    for (w = 0; w < nbWords; w++) {
+        unsigned long len = strlen(words[w]);
+        int lmax;
+        if (len >= nbWeights)
+            len = nbWeights - 1;
+        lmax = weights[len];
+        total += (unsigned)lmax;
+    }
+    g_distribCount = total;
+    assert(g_distribCount <= DISTRIB_SIZE_MAX);
+}
+
+static void init_word_distrib(
+        const char* words[],
+        size_t nbWords,
+        const int* weights,
+        unsigned long nbWeights)
+{
+    size_t w, d = 0;
+    countFreqs(words, nbWords, weights, nbWeights);
+    for (w = 0; w < nbWords; w++) {
+        unsigned long len = strlen(words[w]);
+        int l, lmax;
+        if (len >= nbWeights)
+            len = nbWeights - 1;
+        lmax = weights[len];
+        for (l = 0; l < lmax; l++) {
+            g_distrib[d++] = (int)w;
+        }
+    }
+}
  
  /* Note: this unit only works when invoked sequentially.
   * No concurrent access is allowed */
@@ -166,8 +193,8 @@ static void generateSentence(int nbWords)
      const char* endSep = qmark ? "? " : ". ";
      int i;
      for (i = 0; i < nbWords; i++) {
-        int const wordID       = distrib[LOREM_rand(distribCount)];
-        const char* const word = words[wordID];
+        int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
+        const char* const word = kWords[wordID];
          const char* sep        = " ";
          if (i == commaPos)
              sep = ", ";
@@ -200,7 +227,7 @@ static void generateFirstSentence(void)
  {
      int i;
      for (i = 0; i < 18; i++) {
-        const char* word      = words[i];
+        const char* word      = kWords[i];
          const char* separator = " ";
          if (i == 4)
              separator = ", ";
@@ -208,7 +235,7 @@ static void generateFirstSentence(void)
              separator = ", ";
          generateWord(word, separator, i == 0);
      }
-    generateWord(words[18], ". ", 0);
+    generateWord(kWords[18], ". ", 0);
  }
  
  size_t
@@ -219,6 +246,10 @@ LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
      g_maxChars = size;
      g_nbChars  = 0;
      g_randRoot = seed;
+    if (g_distribCount == 0) {
+        init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
+    }
+
      if (first) {
          generateFirstSentence();
      }
author	Yann Collet <cyan@fb.com>
	Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)
committer	Yann Collet <cyan@fb.com>
	Tue, 20 Feb 2024 20:26:37 +0000 (12:26 -0800)