]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Do not apply ngramms detection for short texts
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Sep 2018 15:46:49 +0000 (16:46 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Sep 2018 15:46:49 +0000 (16:46 +0100)
src/libmime/lang_detection.c

index fbc5f56c909dccb3d2e0b770cf899068df700a19..dfd62d4e5f4295594c8851710a5c6dc238e14fbe 100644 (file)
@@ -27,7 +27,7 @@
 #include <unicode/ustring.h>
 #include <math.h>
 
-static const gsize default_short_text_limit = 200;
+static const gsize default_short_text_limit = 20;
 static const gsize default_words = 80;
 static const gdouble update_prob = 0.6;
 static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
@@ -1597,62 +1597,71 @@ rspamd_language_detector_detect (struct rspamd_task *task,
        }
 
        if (!ret) {
-               candidates = kh_init (rspamd_candidates_hash);
-               kh_resize (rspamd_candidates_hash, candidates, 32);
-
-               r = rspamd_language_detector_try_ngramm (task,
-                               default_words,
-                               d,
-                               part->utf_words,
-                               cat,
-                               candidates);
-
-               if (r == rs_detect_none) {
-                       msg_debug_lang_det ("no trigramms found, fallback to english");
+               if (part->utf_words->len < default_short_text_limit) {
+                       r = rs_detect_none;
+                       msg_debug_lang_det ("text is too short for trigramms detection: "
+                                          "%d words; at least %d words required",
+                                       (int)part->utf_words->len,
+                                       (int)default_short_text_limit);
                        rspamd_language_detector_set_language (task, part, "en");
                }
-               else if (r == rs_detect_multiple) {
-                       /* Check our guess */
-
-                       mean = 0.0;
-                       std = 0.0;
-                       cand_len = 0;
-
-                       /* Check distirbution */
-                       kh_foreach_value (candidates, cand, {
-                               if (!isnan (cand->prob)) {
-                                       mean += cand->prob;
-                                       cand_len++;
-                               }
-                       });
-
-                       if (cand_len > 0) {
-                               mean /= cand_len;
-
+               else {
+                       candidates = kh_init (rspamd_candidates_hash);
+                       kh_resize (rspamd_candidates_hash, candidates, 32);
+
+                       r = rspamd_language_detector_try_ngramm (task,
+                                       default_words,
+                                       d,
+                                       part->utf_words,
+                                       cat,
+                                       candidates);
+
+                       if (r == rs_detect_none) {
+                               msg_debug_lang_det ("no trigramms found, fallback to english");
+                               rspamd_language_detector_set_language (task, part, "en");
+                       } else if (r == rs_detect_multiple) {
+                               /* Check our guess */
+
+                               mean = 0.0;
+                               std = 0.0;
+                               cand_len = 0;
+
+                               /* Check distirbution */
                                kh_foreach_value (candidates, cand, {
-                                       gdouble err;
                                        if (!isnan (cand->prob)) {
-                                               err = cand->prob - mean;
-                                               std += fabs (err);
+                                               mean += cand->prob;
+                                               cand_len++;
                                        }
                                });
 
-                               std /= cand_len;
-                       }
+                               if (cand_len > 0) {
+                                       mean /= cand_len;
 
-                       msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
-                                       cand_len, mean, std);
+                                       kh_foreach_value (candidates, cand, {
+                                               gdouble err;
+                                               if (!isnan (cand->prob)) {
+                                                       err = cand->prob - mean;
+                                                       std += fabs (err);
+                                               }
+                                       });
 
-                       if (cand_len > 0 && std / fabs (mean) < 0.25) {
-                               msg_debug_lang_det ("apply frequency heuristic sorting");
-                               frequency_heuristic_applied = TRUE;
-                               cbd.d = d;
-                               cbd.mean = mean;
-                               cbd.std = std;
-                               cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+                                       std /= cand_len;
+                               }
 
-                               if (part->utf_words->len < default_words / 2) {
-                                       cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+                               msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+                                               cand_len, mean, std);
+
+                               if (cand_len > 0 && std / fabs (mean) < 0.25) {
+                                       msg_debug_lang_det ("apply frequency heuristic sorting");
+                                       frequency_heuristic_applied = TRUE;
+                                       cbd.d = d;
+                                       cbd.mean = mean;
+                                       cbd.std = std;
+                                       cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+                                       if (part->utf_words->len < default_words / 2) {
+                                               cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+                                       }
                                }
                        }
                }