git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Add some more heuristics for stop words detection
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Feb 2020 13:36:58 +0000 (13:36 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Feb 2020 13:36:58 +0000 (13:36 +0000)
src/libmime/lang_detection.c
src/libmime/lang_detection.h

index f27d71a766283fe197c8f867a9cdff6668c1f3d2..a178b1bf802e5d6a960415b8f7e337e2f9b6326d 100644 (file)
@@ -446,6 +446,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                                if (strcmp (fl, "diacritics") == 0) {
                                        nelt->flags |= RS_LANGUAGE_DIACRITICS;
                                }
+                               else if (strcmp (fl, "ascii") == 0) {
+                                       nelt->flags |= RS_LANGUAGE_ASCII;
+                               }
                                else {
                                        msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
                                }
@@ -1668,7 +1671,8 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
        struct rspamd_stop_word_elt *elt;
        struct rspamd_sw_cbdata cbdata;
        gboolean ret = FALSE;
-       static const int stop_words_threshold = 4;
+       static const int stop_words_threshold = 4, /* minimum stop words count */
+                       strong_confidence_threshold = 10 /* we are sure that this is enough */;
 
        elt = &d->stop_words[cat];
        cbdata.res = kh_init (rspamd_sw_hash);
@@ -1683,18 +1687,52 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
                gint cur_matches;
                double max_rate = G_MINDOUBLE;
                struct rspamd_language_elt *cur_lang, *sel = NULL;
+               gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
 
+               again:
                kh_foreach (cbdata.res, cur_lang, cur_matches, {
+                       if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
+                               /* Restart matches */
+                               ignore_ascii = TRUE;
+                               sel = NULL;
+                               max_rate = G_MINDOUBLE;
+                               msg_debug_lang_det ("ignore ascii after finding %d stop words from %s",
+                                               cur_matches, cur_lang->name);
+                               goto again;
+                       }
+
+                       if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
+                               /* Restart matches */
+                               ignore_latin = TRUE;
+                               sel = NULL;
+                               max_rate = G_MINDOUBLE;
+                               msg_debug_lang_det ("ignore latin after finding stop %d words from %s",
+                                               cur_matches, cur_lang->name);
+                               goto again;
+                       }
+
                        if (cur_matches < stop_words_threshold) {
                                continue;
                        }
 
+                       if (cur_matches < strong_confidence_threshold) {
+                               /* Ignore mixed languages when not enough confidence */
+                               if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
+                                       continue;
+                               }
+
+                               if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
+                                       continue;
+                               }
+                       }
+
                        double rate = (double)cur_matches / (double)cur_lang->stop_words;
 
                        if (rate > max_rate) {
                                max_rate = rate;
                                sel = cur_lang;
                        }
+
                        msg_debug_lang_det ("found %d stop words from %s: %3f rate",
                                        cur_matches, cur_lang->name, rate);
                });
index b1382e6ad9bed791297cfce8267c7078bca95f9b..6c32348484374b06b1fe86adb38fa10daf5d0b0f 100644 (file)
@@ -56,6 +56,7 @@ enum rspamd_language_elt_flags {
        RS_LANGUAGE_TIER1 = (1 << 3),
        RS_LANGUAGE_TIER0 = (1 << 4),
        RS_LANGUAGE_DIACRITICS = (1 << 5),
+       RS_LANGUAGE_ASCII = (1 << 6),
 };
 
 struct rspamd_lang_detector_res {