git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Rework tokenization:
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:29:31 +0000 (14:29 +0000)
- Use normalized words if needed
- Allow the use of seeded XXHash instead of the hand-made legacy hash function
- Allow secure hashing using siphash

src/libstat/stat_config.c
src/libstat/stat_process.c
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h

index 8a05147216d5f05e7fcef7e275ec85bae222bb44..8b537f732802aeead8061bfff3d16f21b1864bc2 100644 (file)
@@ -41,7 +41,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
 };
 
 static struct rspamd_stat_tokenizer stat_tokenizers[] = {
-       {"osb-text", osb_tokenize_text},
+       {"osb-text", rspamd_tokenizer_osb},
 };
 
 static struct rspamd_stat_backend stat_backends[] = {
index eafbe209266099e69f756b3f784addc1a73959d2..f5a4b939839658c722ac367b2d2b68d8bd070ccd 100644 (file)
@@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
        GArray *words;
        gchar *sub;
        GList *cur;
+       const ucl_object_t *elt;
+       gboolean compat = TRUE;
+
+       /*
+        * XXX: Ugly repetition to be backward compatible
+        */
+       if (cf != NULL && cf->opts != NULL) {
+               elt = ucl_object_find_key (cf->opts, "hash");
+               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                       if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) {
+                               compat = FALSE;
+                       }
+               }
+       }
 
        cur = task->text_parts;
 
@@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
                        /*
                         * XXX: Use normalized words if needed here
                         */
-                       tok->tokenizer->tokenize_func (cf, task->task_pool,
+
+                       if (compat) {
+                               tok->tokenizer->tokenize_func (cf, task->task_pool,
                                        part->words, tok->tokens, part->is_utf);
+                       }
+                       else {
+                               tok->tokenizer->tokenize_func (cf, task->task_pool,
+                                       part->normalized_words, tok->tokens, part->is_utf);
+                       }
                }
 
                cur = g_list_next (cur);
index b51e909a9f789d4b0d62803a1df2c0708fbc8fb1..18157acd18c737356ea7a5471e7668d116de8140 100644 (file)
 
 #include "tokenizers.h"
 #include "stat_internal.h"
+#include "libstemmer.h"
+#include "xxhash.h"
+#include "siphash.h"
 
 /* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
+#define DEFAULT_FEATURE_WINDOW_SIZE 5
+
+static const int primes[] = {
+       1, 7,
+       3, 13,
+       5, 29,
+       11, 51,
+       23, 101,
+       47, 203,
+       97, 407,
+       197, 817,
+       397, 1637,
+       797, 3277,
+};
 
 int
-osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t * pool,
        GArray * input,
        GTree * tree,
@@ -46,9 +57,15 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
 {
        rspamd_token_t *new = NULL;
        rspamd_fstring_t *token;
-       guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
-       gint i, processed = 0;
-       guint w;
+       const ucl_object_t *elt;
+       guint64 *hashpipe, cur;
+       guint32 h1, h2;
+       guint processed = 0, i, w, window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+       gboolean compat = TRUE, secure = FALSE;
+       gint64 seed = 0xdeadbabe;
+       guchar *key = NULL;
+       gsize keylen;
+       struct sipkey sk;
 
        g_assert (tree != NULL);
 
@@ -56,32 +73,100 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
                return FALSE;
        }
 
-       memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+       if (cf != NULL && cf->opts != NULL) {
+               elt = ucl_object_find_key (cf->opts, "hash");
+               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                       if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3)
+                                       == 0) {
+                               compat = FALSE;
+                               secure = FALSE;
+                               elt = ucl_object_find_key (cf->opts, "seed");
+                               if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+                                       seed = ucl_object_toint (elt);
+                               }
+                       }
+                       else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3)
+                                       == 0) {
+                               compat = FALSE;
+                               elt = ucl_object_find_key (cf->opts, "seed");
+
+                               if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+                                       key = rspamd_decode_base32 (ucl_object_tostring (elt),
+                                                       0, &keylen);
+                                       if (keylen < 16) {
+                                               msg_warn ("siphash seed is too short: %s", keylen);
+                                               g_free (key);
+                                       }
+                                       else {
+                                               secure = TRUE;
+                                               sip_tokey (&sk, key);
+                                               g_free (key);
+                                       }
+                               }
+                               else {
+                                       msg_warn ("siphash cannot be used without seed");
+                               }
+
+                       }
+               }
+               elt = ucl_object_find_key (cf->opts, "window");
+               if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
+                       window_size = ucl_object_toint (elt);
+                       if (window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
+                               msg_err ("too large window size: %d", window_size);
+                               window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+                       }
+               }
+       }
+
+       hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
+       memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
 
        for (w = 0; w < input->len; w ++) {
                token = &g_array_index (input, rspamd_fstring_t, w);
 
-               if (processed < FEATURE_WINDOW_SIZE) {
+               if (compat) {
+                       cur = rspamd_fstrhash_lc (token, is_utf);
+               }
+               else {
+                       /* We know that the words are normalized */
+                       if (!secure) {
+                               cur = XXH64 (token->begin, token->len, seed);
+                       }
+                       else {
+                               cur = siphash24 (token->begin, token->len, &sk);
+                       }
+               }
+
+               if (processed < window_size) {
                        /* Just fill a hashpipe */
-                       hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
-                               rspamd_fstrhash_lc (token, is_utf);
+                       hashpipe[window_size - ++processed] = cur;
                }
                else {
                        /* Shift hashpipe */
-                       for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+                       for (i = window_size - 1; i > 0; i--) {
                                hashpipe[i] = hashpipe[i - 1];
                        }
-                       hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+                       hashpipe[0] = cur;
                        processed++;
 
-                       for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
-                               h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                               h2 = hashpipe[0] * primes[1] + hashpipe[i] *
-                                       primes[(i << 1) - 1];
+                       for (i = 1; i < window_size; i++) {
                                new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                               new->datalen = sizeof(gint32) * 2;
-                               memcpy(new->data, &h1, sizeof(h1));
-                               memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+                               new->datalen = sizeof (gint64);
+
+                               if (compat) {
+                                       h1 = ((guint32)hashpipe[0]) * primes[0] +
+                                                       ((guint32)hashpipe[i]) * primes[i << 1];
+                                       h2 = ((guint32)hashpipe[0]) * primes[1] +
+                                                       ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+
+                                       memcpy(new->data, &h1, sizeof (h1));
+                                       memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+                               }
+                               else {
+                                       cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                                       memcpy (new->data, &cur, sizeof (cur));
+                               }
 
                                if (g_tree_lookup (tree, new) == NULL) {
                                        g_tree_insert (tree, new, new);
@@ -90,14 +175,23 @@ osb_tokenize_text (struct rspamd_tokenizer_config *cf,
                }
        }
 
-       if (processed <= FEATURE_WINDOW_SIZE) {
+       if (processed <= window_size) {
                for (i = 1; i < processed; i++) {
-                       h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                       h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
                        new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
-                       new->datalen = sizeof(gint32) * 2;
-                       memcpy(new->data, &h1, sizeof(h1));
-                       memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
+                       new->datalen = sizeof (gint64);
+
+                       if (compat) {
+                               h1 = ((guint32)hashpipe[0]) * primes[0] +
+                                               ((guint32)hashpipe[i]) * primes[i << 1];
+                               h2 = ((guint32)hashpipe[0]) * primes[1] +
+                                               ((guint32)hashpipe[i]) * primes[(i << 1) - 1];
+                               memcpy(new->data, &h1, sizeof (h1));
+                               memcpy(new->data + sizeof (h1), &h2, sizeof (h2));
+                       }
+                       else {
+                               cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                               memcpy (new->data, &cur, sizeof (cur));
+                       }
 
                        if (g_tree_lookup (tree, new) == NULL) {
                                g_tree_insert (tree, new, new);
index 6ec7b1e10819cc6275f4daebbdf14d413e9ae714..2abe0f318a2dcab164de6d8af5d52def7e6bc1a0 100644 (file)
 #include "tokenizers.h"
 #include "stat_internal.h"
 
-const int primes[] = {
-       1, 7,
-       3, 13,
-       5, 29,
-       11, 51,
-       23, 101,
-       47, 203,
-       97, 407,
-       197, 817,
-       397, 1637,
-       797, 3277,
-};
-
 const gchar t_delimiters[255] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
index 0bc5948423b30f00fe981564443148cab109ac6d..bab18b00aeeadc5c6a5d95b90684ef14450db9b0 100644 (file)
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                gsize min_len, GList **exceptions);
 
 /* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
+int rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t *pool,
        GArray *input,
        GTree *tokens,