Rework tokenization:

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
diff --git a/src/libstat/stat_config.c b/src/libstat/stat_config.c

index 8a05147216d5f05e7fcef7e275ec85bae222bb44..8b537f732802aeead8061bfff3d16f21b1864bc2 100644 (file)
--- a/src/libstat/stat_config.c
+++ b/src/libstat/stat_config.c
@@ -41,7 +41,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
  };
  
  static struct rspamd_stat_tokenizer stat_tokenizers[] = {
-       {"osb-text", osb_tokenize_text},
+       {"osb-text", rspamd_tokenizer_osb},
  };
  
  static struct rspamd_stat_backend stat_backends[] = {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index eafbe209266099e69f756b3f784addc1a73959d2..f5a4b939839658c722ac367b2d2b68d8bd070ccd 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
         GArray *words;
         gchar *sub;
         GList *cur;
+       const ucl_object_t *elt;
+       gboolean compat = TRUE;
+
+       /*
+        * XXX: Ugly repetition to be backward compatible
+        */
+       if (cf != NULL && cf->opts != NULL) {
+               elt = ucl_object_find_key (cf->opts, "hash");
+               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                       if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) {
+                               compat = FALSE;
+                       }
+               }
+       }
  
         cur = task->text_parts;
  
@@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
                         /*
                          * XXX: Use normalized words if needed here
                          */
-                       tok->tokenizer->tokenize_func (cf, task->task_pool,
+
+                       if (compat) {
+                               tok->tokenizer->tokenize_func (cf, task->task_pool,
                                         part->words, tok->tokens, part->is_utf);
+                       }
+                       else {
+                               tok->tokenizer->tokenize_func (cf, task->task_pool,
+                                       part->normalized_words, tok->tokens, part->is_utf);
+                       }
                 }
  
                 cur = g_list_next (cur);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c

index b51e909a9f789d4b0d62803a1df2c0708fbc8fb1..18157acd18c737356ea7a5471e7668d116de8140 100644 (file)
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -28,17 +28,28 @@
  
  #include "tokenizers.h"
  #include "stat_internal.h"
+#include "libstemmer.h"
+#include "xxhash.h"
+#include "siphash.h"
  
  /* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
+#define DEFAULT_FEATURE_WINDOW_SIZE 5
+
+static const int primes[] = {
+       1, 7,
+       3, 13,
+       5, 29,
+       11, 51,
+       23, 101,
+       47, 203,
+       97, 407,
+       197, 817,
+       397, 1637,
+       797, 3277,
+};
  
  int
-osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
         rspamd_mempool_t * pool,
         GArray * input,
         GTree * tree,
@@ -46,9 +57,15 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
  {
         rspamd_token_t *new = NULL;
         rspamd_fstring_t *token;
-       guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
-       gint i, processed = 0;
-       guint w;
+       const ucl_object_t *elt;
+       guint64 *hashpipe, cur;
+       guint32 h1, h2;
+       guint processed = 0, i, w, window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+       gboolean compat = TRUE, secure = FALSE;
+       gint64 seed = 0xdeadbabe;
+       guchar *key = NULL;
+       gsize keylen;
+       struct sipkey sk;
  
         g_assert (tree != NULL);
  
@@ -56,32 +73,100 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
                 return FALSE;
         }
  
-       memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+       if (cf != NULL && cf->opts != NULL) {
+               elt = ucl_object_find_key (cf->opts, "hash");
+               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                       if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3)
+                                       == 0) {
+                               compat = FALSE;
+                               secure = FALSE;
+                               elt = ucl_object_find_key (cf->opts, "seed");
+                               if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+                                       seed = ucl_object_toint (elt);
+                               }
+                       }
+                       else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3)
+                                       == 0) {
+                               compat = FALSE;
+                               elt = ucl_object_find_key (cf->opts, "seed");
+
+                               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                                       key = rspamd_decode_base32 (ucl_object_tostring (elt),
+                                                       0, &keylen);
+                                       if (keylen < 16) {
+                                               msg_warn ("siphash seed is too short: %s", keylen);
+                                               g_free (key);
+                                       }
+                                       else {
+                                               secure = TRUE;
+                                               sip_tokey (&sk, key);
+                                               g_free (key);
+                                       }
+                               }
+                               else {
+                                       msg_warn ("siphash cannot be used without seed");
+                               }
+
+                       }
+               }
+               elt = ucl_object_find_key (cf->opts, "window");
+               if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+                       window_size = ucl_object_toint (elt);
+                       if (window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
+                               msg_err ("too large window size: %d", window_size);
+                               window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+                       }
+               }
+       }
+
+       hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
+       memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
  
         for (w = 0; w < input->len; w ++) {
                 token = &g_array_index (input, rspamd_fstring_t, w);
  
-               if (processed < FEATURE_WINDOW_SIZE) {
+               if (compat) {
+                       cur = rspamd_fstrhash_lc (token, is_utf);
+               }
+               else {
+                       /* We know that the words are normalized */
+                       if (!secure) {
+                               cur = XXH64 (token->begin, token->len, seed);
+                       }
+                       else {
+                               cur = siphash24 (token->begin, token->len, &sk);
+                       }
+               }
+
+               if (processed < window_size) {
                         /* Just fill a hashpipe */
-                       hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
-                               rspamd_fstrhash_lc (token, is_utf);
+                       hashpipe[window_size - ++processed] = cur;
                 }
                 else {
                         /* Shift hashpipe */
-                       for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+                       for (i = window_size - 1; i > 0; i--) {
                                 hashpipe[i] = hashpipe[i - 1];
                         }
-                       hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+                       hashpipe[0] = cur;
                         processed++;
  
-                       for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
-                               h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                               h2 = hashpipe[0] * primes[1] + hashpipe[i] *
-                                       primes[(i << 1) - 1];
+                       for (i = 1; i < window_size; i++) {
                                 new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                               new->datalen = sizeof(gint32) * 2;
-                               memcpy(new->data, &h1, sizeof(h1));
-                               memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+                               new->datalen = sizeof (gint64);
+
+                               if (compat) {
+                                       h1 = ((guint32)hashpipe[0]) * primes[0] +
+                                                       ((guint32)hashpipe[i]) * primes[i << 1];
+                                       h2 = ((guint32)hashpipe[0]) * primes[1] +
+                                                       ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+
+                                       memcpy(new->data, &h1, sizeof (h1));
+                                       memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+                               }
+                               else {
+                                       cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                                       memcpy (new->data, &cur, sizeof (cur));
+                               }
  
                                 if (g_tree_lookup (tree, new) == NULL) {
                                         g_tree_insert (tree, new, new);
@@ -90,14 +175,23 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
                 }
         }
  
-       if (processed <= FEATURE_WINDOW_SIZE) {
+       if (processed <= window_size) {
                 for (i = 1; i < processed; i++) {
-                       h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                       h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
                         new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                       new->datalen = sizeof(gint32) * 2;
-                       memcpy(new->data, &h1, sizeof(h1));
-                       memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+                       new->datalen = sizeof (gint64);
+
+                       if (compat) {
+                               h1 = ((guint32)hashpipe[0]) * primes[0] +
+                                               ((guint32)hashpipe[i]) * primes[i << 1];
+                               h2 = ((guint32)hashpipe[0]) * primes[1] +
+                                               ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+                               memcpy(new->data, &h1, sizeof (h1));
+                               memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+                       }
+                       else {
+                               cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                               memcpy (new->data, &cur, sizeof (cur));
+                       }
  
                         if (g_tree_lookup (tree, new) == NULL) {
                                 g_tree_insert (tree, new, new);
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 6ec7b1e10819cc6275f4daebbdf14d413e9ae714..2abe0f318a2dcab164de6d8af5d52def7e6bc1a0 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -30,19 +30,6 @@
  #include "tokenizers.h"
  #include "stat_internal.h"
  
-const int primes[] = {
-       1, 7,
-       3, 13,
-       5, 29,
-       11, 51,
-       23, 101,
-       47, 203,
-       97, 407,
-       197, 817,
-       397, 1637,
-       797, 3277,
-};
-
  const gchar t_delimiters[255] = {
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h

index 0bc5948423b30f00fe981564443148cab109ac6d..bab18b00aeeadc5c6a5d95b90684ef14450db9b0 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 gsize min_len, GList **exceptions);
  
  /* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+int rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
         rspamd_mempool_t *pool,
         GArray *input,
         GTree *tokens,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
src/libstat/stat_config.c		patch | blob | blame | history
src/libstat/stat_process.c		patch | blob | blame | history
src/libstat/tokenizers/osb.c		patch | blob | blame | history
src/libstat/tokenizers/tokenizers.c		patch | blob | blame | history
src/libstat/tokenizers/tokenizers.h		patch | blob | blame | history