[Feature] Implement new text tokenizer based on libicu

author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 6 Sep 2018 18:49:44 +0000 (19:49 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 6 Sep 2018 18:50:18 +0000 (19:50 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index e59d34b259b6b9228f96c93112b97fd5febbc6bf..4ec0218434b9ddd092e011d1eb3b74c7ecfaa561 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -203,21 +203,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
                 tok_type = RSPAMD_TOKENIZE_RAW;
         }
  
-       /* Ugly workaround */
-       if (IS_PART_HTML (part)) {
-               part->utf_words = rspamd_tokenize_text (
-                               part->utf_stripped_content->data,
-                               part->utf_stripped_content->len, tok_type, task->cfg,
-                               part->exceptions,
-                               NULL);
-       }
-       else {
-               part->utf_words = rspamd_tokenize_text (
-                               part->utf_stripped_content->data,
-                               part->utf_stripped_content->len, tok_type, task->cfg,
-                               part->exceptions,
-                               NULL);
-       }
+       part->utf_words = rspamd_tokenize_text (
+                       part->utf_stripped_content->data,
+                       part->utf_stripped_content->len,
+                       &part->utf_stripped_text,
+                       tok_type, task->cfg,
+                       part->exceptions,
+                       NULL);
+
  
         if (part->utf_words) {
                 part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 394173444df65f63bc08762da90789090da993c4..6d34ba51c814e5ae99165a8ebdc27d64ad559b7c 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
         }
  
         if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+               UText utxt = UTEXT_INITIALIZER;
+               UErrorCode uc_err = U_ZERO_ERROR;
+               gsize slen = strlen (sub);
+
+               utext_openUTF8 (&utxt,
+                               sub,
+                               slen,
+                               &uc_err);
+
+               words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
                                 NULL, NULL, NULL);
+
                 if (words != NULL) {
  
                         for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 5436430fe9cdbea65fabcae36010f262dfd24ef8..9babfc8a1ccea4636895d4e414e8977bb370cb12 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,8 +21,10 @@
  #include "tokenizers.h"
  #include "stat_internal.h"
  #include "../../../contrib/mumhash/mum.h"
-#include "unicode/utf8.h"
-#include "unicode/uchar.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/uiter.h>
+#include <unicode/ubrk.h>
  
  typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
                 rspamd_stat_token_t * token,
@@ -148,187 +150,88 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
         return TRUE;
  }
  
-static gboolean
-rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
-               gchar const **cur, rspamd_stat_token_t * token,
-               GList **exceptions, gsize *rl,
-               gboolean check_signature)
+static inline gboolean
+rspamd_tokenize_check_limit (gboolean decay,
+                                                        guint word_decay,
+                                                        guint nwords,
+                                                        guint64 *hv,
+                                                        guint64 *prob,
+                                                        const rspamd_stat_token_t *token,
+                                                        gssize remain,
+                                                        gssize total)
  {
-       gint32 i, siglen = 0, remain;
-       goffset pos;
-       const gchar *p, *s, *sig = NULL;
-       UChar32 uc;
-       guint processed = 0;
-       struct rspamd_process_exception *ex = NULL;
-       enum {
-               skip_delimiters = 0,
-               feed_token,
-               process_signature
-       } state = skip_delimiters;
-
-       if (buf == NULL) {
-               return FALSE;
-       }
-
-       if (exceptions != NULL && *exceptions != NULL) {
-               ex = (*exceptions)->data;
-       }
-
-       g_assert (cur != NULL);
-
-       if (*cur == NULL) {
-               *cur = buf->begin;
-       }
+       static const gdouble avg_word_len = 6.0;
  
-       token->len = 0;
+       if (!decay) {
+               if (token->len >= sizeof (guint64)) {
+#ifdef _MUM_UNALIGNED_ACCESS
+                       *hv = mum_hash_step (*hv, *(guint64 *)token->begin);
+#else
+                       guint64 tmp;
+                       memcpy (&tmp, token->begin, sizeof (tmp));
+                       *hv = mum_hash_step (*hv, tmp);
+#endif
+               }
  
-       pos = *cur - buf->begin;
-       if (pos >= buf->len) {
-               return FALSE;
-       }
+               /* Check for decay */
+               if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
+                       /* Start decay */
+                       gdouble decay_prob;
  
-       remain = buf->len - pos;
-       s = *cur;
-       p = s;
-       token->begin = s;
+                       *hv = mum_hash_finish (*hv);
  
-       for (i = 0; i < remain; ) {
-               p = &s[i];
-               U8_NEXT (s, i, remain, uc); /* This also advances i */
+                       /* We assume that word is 6 symbols length in average */
+                       decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len);
  
-               if (uc < 0) {
-                       if (i < remain) {
-                               uc = 0xFFFD;
+                       if (decay_prob >= 1.0) {
+                               *prob = G_MAXUINT64;
                         }
                         else {
-                               return FALSE;
+                               *prob = decay_prob * G_MAXUINT64;
                         }
-               }
  
-               switch (state) {
-               case skip_delimiters:
-                       if (ex != NULL && p - buf->begin == ex->pos) {
-                               goto process_exception;
-                       }
-                       else if (u_isgraph (uc)) {
-                               if (u_isalnum (uc)) {
-                                       state = feed_token;
-                                       token->begin = p;
-                                       continue;
-                               }
-                               else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
-                                       sig = p;
-                                       siglen = remain - i;
-                                       state = process_signature;
-                                       continue;
-                               }
-                       }
-                       break;
-               case feed_token:
-                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
-                               token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-                               goto process_exception;
-                       }
-                       else if (!u_isalnum (uc)) {
-                               token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-                               goto set_token;
-                       }
-                       processed ++;
-                       break;
-               case process_signature:
-                       if (*p == '\r' || *p == '\n') {
-                               msg_debug ("signature found: %*s", (gint)siglen, sig);
-                               return FALSE;
-                       }
-                       else if (*p != ' ' && *p != '-' && *p != '_') {
-                               state = skip_delimiters;
-                               continue;
-                       }
-                       break;
+                       return TRUE;
                 }
         }
+       else {
+               /* Decaying probability */
+               /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+               *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
  
-       /* Last character */
-       if (state == feed_token) {
-               p = &s[i];
-               goto set_token;
+               if (*hv > *prob) {
+                       return TRUE;
+               }
         }
  
         return FALSE;
+}
  
-set_token:
-       if (rl) {
-               *rl = processed;
-       }
+static inline gboolean
+rspamd_utf_word_valid (const gchar *text, const gchar *end,
+               gint32 start, gint32 finish)
+{
+       const gchar *st = text + start, *fin = text + finish;
+       UChar32 c;
  
-       if (token->len == 0 && processed > 0) {
-               token->len = p - token->begin;
-               g_assert (token->len > 0);
+       if (st >= end || fin > end || st >= fin) {
+               return FALSE;
         }
  
-       *cur = &s[i];
-
-       return TRUE;
-
-process_exception:
-       if (token->len == 0 && processed > 0) {
-               /*
-                * We have processed something before the next exception, so
-                * continue processing on next iteration of this function call
-                */
-               token->len = p - token->begin;
-               g_assert (token->len > 0);
-
-               *cur = p;
+       U8_NEXT (text, start, finish, c);
  
+       if (u_isalnum (c)) {
                 return TRUE;
         }
  
-       if (ex->type == RSPAMD_EXCEPTION_URL) {
-               token->begin = "!!EX!!";
-               token->len = sizeof ("!!EX!!") - 1;
-               token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-               processed = token->len;
-       }
-
-       p += ex->len;
-
-       /* We need to skip all exceptions that are within this exception */
-       *exceptions = g_list_next (*exceptions);
-
-       while (*exceptions) {
-               ex = (*exceptions)->data;
-
-               if (ex->pos < p - buf->begin) {
-                       /* Nested exception */
-                       if (ex->pos + ex->len > p - buf->begin) {
-                               /*
-                                * We have somehow overlapping nesting exception,
-                                * extend current offset
-                                */
-                               p = buf->begin + ex->pos + ex->len;
-                       }
-
-                       *exceptions = g_list_next (*exceptions);
-               }
-               else {
-                       break;
-               }
-       }
-
-       *cur = p;
-
-       if (rl) {
-               *rl = processed;
-       }
-
-       return TRUE;
+       return FALSE;
  }
  
  GArray *
  rspamd_tokenize_text (const gchar *text, gsize len,
+                                         const UText *utxt,
                                           enum rspamd_tokenize_type how,
-                                         struct rspamd_config *cfg, GList *exceptions,
+                                         struct rspamd_config *cfg,
+                                         GList *exceptions,
                                           guint64 *hash)
  {
         rspamd_stat_token_t token, buf;
@@ -336,11 +239,11 @@ rspamd_tokenize_text (const gchar *text, gsize len,
         gsize l = 0;
         GArray *res;
         GList *cur = exceptions;
-       token_get_function func;
         guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
         guint64 hv = 0;
         gboolean decay = FALSE;
         guint64 prob;
+       static UBreakIterator* bi = NULL;
  
         if (text == NULL) {
                 return NULL;
@@ -353,18 +256,6 @@ rspamd_tokenize_text (const gchar *text, gsize len,
         token.len = 0;
         token.flags = 0;
  
-       switch (how) {
-       case RSPAMD_TOKENIZE_RAW:
-               func = rspamd_tokenizer_get_word_raw;
-               break;
-       case RSPAMD_TOKENIZE_UTF:
-               func = rspamd_tokenizer_get_word_utf8;
-               break;
-       default:
-               g_assert_not_reached ();
-               break;
-       }
-
         if (cfg != NULL) {
                 min_len = cfg->min_word_len;
                 max_len = cfg->max_word_len;
@@ -375,56 +266,177 @@ rspamd_tokenize_text (const gchar *text, gsize len,
         res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
                         initial_size);
  
-       while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
-               if (l == 0 || (min_len > 0 && l < min_len) ||
-                                       (max_len > 0 && l > max_len)) {
+       if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
+               while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
+                       if (l == 0 || (min_len > 0 && l < min_len) ||
+                               (max_len > 0 && l > max_len)) {
+                               token.begin = pos;
+                               continue;
+                       }
+
+                       if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+                                       &hv, &prob, &token, pos - text, len)) {
+                               if (!decay) {
+                                       decay = TRUE;
+                               } else {
+                                       token.begin = pos;
+                                       continue;
+                               }
+                       }
+
+                       g_array_append_val (res, token);
                         token.begin = pos;
-                       continue;
                 }
+       }
+       else {
+               /* UTF8 boundaries */
+               UErrorCode uc_err = U_ZERO_ERROR;
+               int32_t last, p;
+               struct rspamd_process_exception *ex = NULL;
  
-               if (!decay) {
-                       if (token.len >= sizeof (guint64)) {
-#ifdef _MUM_UNALIGNED_ACCESS
-                               hv = mum_hash_step (hv, *(guint64 *)token.begin);
-#else
-                               guint64 tmp;
-                               memcpy (&tmp, token.begin, sizeof (tmp));
-                               hv = mum_hash_step (hv, tmp);
-#endif
-                       }
+               if (bi == NULL) {
+                       bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
  
-                       /* Check for decay */
-                       if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
-                               /* Start decay */
-                               gdouble decay_prob;
+                       g_assert (U_SUCCESS (uc_err));
+               }
  
-                               decay = TRUE;
-                               hv = mum_hash_finish (hv);
+               ubrk_setUText (bi, (UText*)utxt, &uc_err);
+               last = ubrk_first (bi);
+               p = last;
  
-                               /* We assume that word is 6 symbols length in average */
-                               decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+               if (cur) {
+                       ex = (struct rspamd_process_exception *)cur->data;
+               }
  
-                               if (decay_prob >= 1.0) {
-                                       prob = G_MAXUINT64;
+               while (p != UBRK_DONE) {
+start_over:
+                       token.len = 0;
+
+                       if (p > last) {
+                               if (ex && cur) {
+                                       /* Check exception */
+                                       if (ex->pos >= last && ex->pos <= p) {
+                                               /* We have an exception within boundary */
+                                               /* First, start to drain exceptions from the start */
+                                               while (cur && ex->pos <= last) {
+                                                       /* We have an exception at the beginning, skip those */
+                                                       last += ex->len;
+
+                                                       if (last > p) {
+                                                               /* Exception spread over the boundaries */
+                                                               while (last > p && p != UBRK_DONE) {
+                                                                       p = ubrk_next (bi);
+                                                               }
+
+                                                               /* We need to reset our scan with new p and last */
+                                                               goto start_over;
+                                                       }
+
+                                                       if (ex->type == RSPAMD_EXCEPTION_URL) {
+                                                               token.begin = "!!EX!!";
+                                                               token.len = sizeof ("!!EX!!") - 1;
+                                                               token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+                                                               g_array_append_val (res, token);
+                                                               token.flags = 0;
+                                                       }
+
+                                                       cur = g_list_next (cur);
+
+                                                       if (cur) {
+                                                               ex = (struct rspamd_process_exception *) cur->data;
+                                                       }
+                                               }
+
+                                               /* Now, we can have an exception within boundary again */
+                                               if (cur && ex->pos >= last && ex->pos <= p) {
+                                                       /* Append the first part */
+                                                       if (rspamd_utf_word_valid (text, text + len, last,
+                                                                       ex->pos)) {
+                                                               token.begin = text + last;
+                                                               token.len = ex->pos - last;
+                                                               token.flags = 0;
+                                                               g_array_append_val (res, token);
+                                                       }
+
+                                                       /* Process the current exception */
+                                                       last += ex->len + token.len;
+
+                                                       if (ex->type == RSPAMD_EXCEPTION_URL) {
+                                                               token.begin = "!!EX!!";
+                                                               token.len = sizeof ("!!EX!!") - 1;
+                                                               token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+                                                               g_array_append_val (res, token);
+                                                       }
+
+                                                       if (last > p) {
+                                                               /* Exception spread over the boundaries */
+                                                               while (last > p && p != UBRK_DONE) {
+                                                                       p = ubrk_next (bi);
+                                                               }
+                                                               /* We need to reset our scan with new p and last */
+                                                               goto start_over;
+                                                       }
+                                               }
+                                               else if (p > last) {
+                                                       if (rspamd_utf_word_valid (text, text + len, last, p)) {
+                                                               token.begin = text + last;
+                                                               token.len = p - last;
+                                                               token.flags = 0;
+                                                       }
+                                               }
+                                       }
+                                       else if (ex->pos < last) {
+                                               /* Forward exceptions list */
+                                               while (cur && ex->pos <= last) {
+                                                       /* We have an exception at the beginning, skip those */
+                                                       cur = g_list_next (cur);
+
+                                                       if (cur) {
+                                                               ex = (struct rspamd_process_exception *) cur->data;
+                                                       }
+                                               }
+
+                                               if (rspamd_utf_word_valid (text, text + len, last, p)) {
+                                                       token.begin = text + last;
+                                                       token.len = p - last;
+                                                       token.flags = 0;
+                                               }
+                                       }
+                                       else {
+                                               /* No exceptions within boundary */
+                                               if (rspamd_utf_word_valid (text, text + len, last, p)) {
+                                                       token.begin = text + last;
+                                                       token.len = p - last;
+                                                       token.flags = 0;
+                                               }
+                                       }
                                 }
                                 else {
-                                       prob = decay_prob * G_MAXUINT64;
+                                       if (rspamd_utf_word_valid (text, text + len, last, p)) {
+                                               token.begin = text + last;
+                                               token.len = p - last;
+                                       }
+                               }
+
+                               if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+                                               &hv, &prob, &token, pos - text, len)) {
+                                       if (!decay) {
+                                               decay = TRUE;
+                                       } else {
+                                               token.len = 0;
+                                       }
                                 }
                         }
-               }
-               else {
-                       /* Decaying probability */
-                       /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
-                       hv = 2862933555777941757ULL * hv + 3037000493ULL;
  
-                       if (hv > prob) {
-                               token.begin = pos;
-                               continue;
+                       if (token.len > 0) {
+                               g_array_append_val (res, token);
                         }
-               }
  
-               g_array_append_val (res, token);
-               token.begin = pos;
+                       last = p;
+                       p = ubrk_next (bi);
+               }
         }
  
         if (!decay) {
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h

index 16ab142fd86f7b5272716db841cf145820763cfc..6c538eafc049f4556f25d7492a664292f4e96524 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -7,6 +7,8 @@
  #include "rspamd.h"
  #include "stat_api.h"
  
+#include <unicode/utext.h>
+
  #define RSPAMD_DEFAULT_TOKENIZER "osb"
  
  struct rspamd_tokenizer_runtime;
@@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
  
  /* Tokenize text into array of words (rspamd_stat_token_t type) */
  GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+                                                          const UText *utxt,
                                                            enum rspamd_tokenize_type how,
                                                            struct rspamd_config *cfg,
                                                            GList *exceptions,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index 3de68e60a48d35a3a56fb90281affdeac690dee6..d6095ab52c649825a4b64c4405ec2ee30b6f0488 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1078,6 +1078,7 @@ lua_util_tokenize_text (lua_State *L)
         GList *exceptions = NULL, *cur;
         struct rspamd_lua_text *t;
         struct rspamd_process_exception *ex;
+       UText utxt = UTEXT_INITIALIZER;
         GArray *res;
         rspamd_stat_token_t *w;
  
@@ -1129,7 +1130,15 @@ lua_util_tokenize_text (lua_State *L)
                 exceptions = g_list_reverse (exceptions);
         }
  
-       res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+       UErrorCode uc_err = U_ZERO_ERROR;
+       utext_openUTF8 (&utxt,
+                       in,
+                       len,
+                       &uc_err);
+
+       res = rspamd_tokenize_text ((gchar *)in, len,
+                       &utxt,
+                       RSPAMD_TOKENIZE_UTF, NULL,
                         exceptions,
                         NULL);
  
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c

index 3c7157311f27a84cc5a31f0d1d94cc8b6cd468d2..f917c26c896f408000d63a71bfe254edf26cf844 100644 (file)
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -619,7 +619,17 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
                 guint i;
                 gdouble cur_score = 0.0;
  
-               words = rspamd_tokenize_text (task->subject, strlen (task->subject),
+               UText utxt = UTEXT_INITIALIZER;
+               UErrorCode uc_err = U_ZERO_ERROR;
+               gsize slen = strlen (task->subject);
+
+               utext_openUTF8 (&utxt,
+                               task->subject,
+                               slen,
+                               &uc_err);
+
+               words = rspamd_tokenize_text (task->subject, slen,
+                               &utxt,
                                 RSPAMD_TOKENIZE_UTF,
                                 NULL,
                                 NULL,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 6 Sep 2018 18:49:44 +0000 (19:49 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 6 Sep 2018 18:50:18 +0000 (19:50 +0100)
src/libmime/message.c		patch | blob | blame | history
src/libstat/stat_process.c		patch | blob | blame | history
src/libstat/tokenizers/tokenizers.c		patch | blob | blame | history
src/libstat/tokenizers/tokenizers.h		patch | blob | blame | history
src/lua/lua_util.c		patch | blob | blame | history
src/plugins/chartable.c		patch | blob | blame | history