From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Fri, 30 Mar 2018 13:27:14 +0000 (+0100)
Subject: [Test] Improve tokenization tests
X-Git-Tag: 1.7.3~72
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8160aa803846616f28d94519486d310a771f58bf;p=thirdparty%2Frspamd.git

[Test] Improve tokenization tests
---

diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 46f103f1a8..64b25c14ad 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1007,6 +1007,7 @@ lua_util_tokenize_text (lua_State *L)
 					ex = g_malloc0 (sizeof (*ex));
 					ex->pos = pos;
 					ex->len = ex_len;
+					ex->type = RSPAMD_EXCEPTION_URL;
 					exceptions = g_list_prepend (exceptions, ex);
 				}
 			}
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index e05f74d86b..16f8f18461 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -1,111 +1,115 @@
 context("Text tokenization test", function()
   local util = require "rspamd_util"
   local logger = require "rspamd_logger"
-  test("Tokenize simple text", function()
-    local cases = {
-      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
-        {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
-        "Integer", "mattis", "nibh"
-        }
-      },
-      {"Հետաքրքրվողների համար ոտորև ներկայացված",
-        {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
-      },
-      {"", {}},
-      {",,,,,", {}},
-      {"word,,,,,word    ", {"word", "word"}},
-      {"word", {"word"}},
-      {",,,,word,,,", {"word"}}
-    }
-    
-    for _,c in ipairs(cases) do
+
+  local cases = {
+    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+     {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+      "Integer", "mattis", "nibh"
+     }
+    },
+    {"Հետաքրքրվողների համար ոտորև ներկայացված",
+     {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+    },
+    {"", {}},
+    {",,,,,", {}},
+    {"word,,,,,word    ", {"word", "word"}},
+    {"word", {"word"}},
+    {",,,,word,,,", {"word"}}
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize simple " .. i, function()
       local w = util.tokenize_text(c[1])
       if #c[2] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-        
+
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[2][i])
         end
       end
-    end
-  end)
-    test("Tokenize simple text (legacy)", function()
-    local cases = {
-      -- First token is bad
-      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
-        {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
-        "Integer", "mattis", "nibh"
-        }
-      },
-      -- Unicode is broken
-      --{"Հետաքրքրվողների համար ոտորև ներկայացված",
-      --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
-      --},
-      {"", {}},
-      {",,,,,", {}},
-      {"word,,,,,word    ", {"ord", "word"}},
-      {"word", {"ord"}},
-      {",,,,word,,,", {"word"}}
-    }
-    
-    for _,c in ipairs(cases) do
+    end)
+  end
+
+
+  cases = {
+    -- First token is bad
+    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+     {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+      "Integer", "mattis", "nibh"
+     }
+    },
+    -- Unicode is broken
+    --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+    --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+    --},
+    {"", {}},
+    {",,,,,", {}},
+    {"word,,,,,word    ", {"ord", "word"}},
+    {"word", {"ord"}},
+    {",,,,word,,,", {"word"}}
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize simple text (legacy) " .. i, function()
       local w = util.tokenize_text(c[1], {}, true)
       if #c[2] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-        
+
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[2][i])
         end
       end
-    end
-  end)
-  test("Tokenize with exceptions", function()
-    local cases = {
-      {"word https://example.com/path word",
-        {{5, 24}},
-        {"word", "!!EX!!", "word"}
-      },
-      {"համար https://example.com/path համար",
-        {{11, 24}},
-        {"համար", "!!EX!!", "համար"}
-      },
-      {"word https://example.com/path https://example.com/path word",
-        {{5, 24}, {30, 24}},
-        {"word", "!!EX!!", "!!EX!!", "word"}
-      },
-      {"word https://example.com/path https://example.com/path",
-        {{5, 24}, {30, 24}},
-        {"word", "!!EX!!", "!!EX!!"}
-      },
-      {"https://example.com/path https://example.com/path word",
-        {{0, 24}, {25, 24}},
-        {"!!EX!!", "!!EX!!", "word"}
-      },
-      {"https://example.com/path https://example.com/path",
-        {{0, 24}, {25, 24}},
-        {"!!EX!!", "!!EX!!"}
-      },
-      {",,,,https://example.com/path https://example.com/path    ",
-        {{4, 24}, {29, 24}},
-        {"!!EX!!", "!!EX!!"}
-      },
-    }
-    
-    for _,c in ipairs(cases) do
+    end)
+  end
+
+  cases = {
+    {"word https://example.com/path word",
+     {{5, 24}},
+     {"word", "!!EX!!", "word"}
+    },
+    {"համար https://example.com/path համար",
+     {{11, 24}},
+     {"համար", "!!EX!!", "համար"}
+    },
+    {"word https://example.com/path https://example.com/path word",
+     {{5, 24}, {30, 24}},
+     {"word", "!!EX!!", "!!EX!!", "word"}
+    },
+    {"word https://example.com/path https://example.com/path",
+     {{5, 24}, {30, 24}},
+     {"word", "!!EX!!", "!!EX!!"}
+    },
+    {"https://example.com/path https://example.com/path word",
+     {{0, 24}, {25, 24}},
+     {"!!EX!!", "!!EX!!", "word"}
+    },
+    {"https://example.com/path https://example.com/path",
+     {{0, 24}, {25, 24}},
+     {"!!EX!!", "!!EX!!"}
+    },
+    {",,,,https://example.com/path https://example.com/path    ",
+     {{4, 24}, {29, 24}},
+     {"!!EX!!", "!!EX!!"}
+    },
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize with exceptions " .. i, function()
       local w = util.tokenize_text(c[1], c[2])
       if #c[3] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-        
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[3][i])
         end
       end
-    end
-  end)
+    end)
+  end
+
 end)
\ No newline at end of file