git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Further fixes in tokenization algorithm
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 7 Sep 2018 08:26:27 +0000 (09:26 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 7 Sep 2018 08:26:27 +0000 (09:26 +0100)
src/libstat/tokenizers/tokenizers.c

index 9babfc8a1ccea4636895d4e414e8977bb370cb12..0902ceb05578bbb5f2e095aef1d8b608f3890ba7 100644 (file)
@@ -225,6 +225,15 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end,
 
        return FALSE;
 }
+#define SHIFT_EX do { \
+    cur = g_list_next (cur); \
+    if (cur) { \
+        ex = (struct rspamd_process_exception *) cur->data; \
+    } \
+    else { \
+        ex = NULL; \
+    } \
+} while(0)
 
 GArray *
 rspamd_tokenize_text (const gchar *text, gsize len,
@@ -278,7 +287,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
                                        &hv, &prob, &token, pos - text, len)) {
                                if (!decay) {
                                        decay = TRUE;
-                               } else {
+                               }
+                               else {
                                        token.begin = pos;
                                        continue;
                                }
@@ -322,16 +332,6 @@ start_over:
                                                        /* We have an exception at the beginning, skip those */
                                                        last += ex->len;
 
-                                                       if (last > p) {
-                                                               /* Exception spread over the boundaries */
-                                                               while (last > p && p != UBRK_DONE) {
-                                                                       p = ubrk_next (bi);
-                                                               }
-
-                                                               /* We need to reset our scan with new p and last */
-                                                               goto start_over;
-                                                       }
-
                                                        if (ex->type == RSPAMD_EXCEPTION_URL) {
                                                                token.begin = "!!EX!!";
                                                                token.len = sizeof ("!!EX!!") - 1;
@@ -341,11 +341,18 @@ start_over:
                                                                token.flags = 0;
                                                        }
 
-                                                       cur = g_list_next (cur);
+                                                       if (last > p) {
+                                                               /* Exception spread over the boundaries */
+                                                               while (last > p && p != UBRK_DONE) {
+                                                                       p = ubrk_next (bi);
+                                                               }
 
-                                                       if (cur) {
-                                                               ex = (struct rspamd_process_exception *) cur->data;
+                                                               /* We need to reset our scan with new p and last */
+                                                               SHIFT_EX;
+                                                               goto start_over;
                                                        }
+
+                                                       SHIFT_EX;
                                                }
 
                                                /* Now, we can have an exception within boundary again */
@@ -360,7 +367,7 @@ start_over:
                                                        }
 
                                                        /* Process the current exception */
-                                                       last += ex->len + token.len;
+                                                       last += ex->len + (ex->pos - last);
 
                                                        if (ex->type == RSPAMD_EXCEPTION_URL) {
                                                                token.begin = "!!EX!!";
@@ -376,8 +383,11 @@ start_over:
                                                                        p = ubrk_next (bi);
                                                                }
                                                                /* We need to reset our scan with new p and last */
+                                                               SHIFT_EX;
                                                                goto start_over;
                                                        }
+
+                                                       SHIFT_EX;
                                                }
                                                else if (p > last) {
                                                        if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -391,11 +401,7 @@ start_over:
                                                /* Forward exceptions list */
                                                while (cur && ex->pos <= last) {
                                                        /* We have an exception at the beginning, skip those */
-                                                       cur = g_list_next (cur);
-
-                                                       if (cur) {
-                                                               ex = (struct rspamd_process_exception *) cur->data;
-                                                       }
+                                                       SHIFT_EX;
                                                }
 
                                                if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -450,6 +456,8 @@ start_over:
        return res;
 }
 
+#undef SHIFT_EX
+
 /*
  * vi:ts=4
  */