[Minor] Treat English as Tier0 language

author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 29 Jan 2018 08:07:16 +0000 (08:07 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 29 Jan 2018 08:07:16 +0000 (08:07 +0000)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index b3188e6dddf0b3e27867405eb6c171aa5c6a792c..c340a1b33a4cfb49ed1a0f592bf8c0a49e35ec29 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -67,8 +67,11 @@ static const gchar *unigramms_langs[] = {
  /*
   * Top languages
   */
+static const gchar *tier0_langs[] = {
+               "en",
+};
  static const gchar *tier1_langs[] = {
-               "en", "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
+               "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
                 "ko", "pt", "ru", "pl", "tk", "th", "ar"
  };
  
@@ -78,7 +81,7 @@ enum rspamd_language_elt_flags {
         RS_LANGUAGE_UNISCRIPT = (1 << 1),
         RS_LANGUAGE_UNIGRAMM = (1 << 2),
         RS_LANGUAGE_TIER1 = (1 << 3),
-       RS_LANGUAGE_TIER2 = (1 << 4),
+       RS_LANGUAGE_TIER0 = (1 << 4),
  };
  
  struct rspamd_language_elt {
@@ -343,6 +346,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                         nelt->flags |= RS_LANGUAGE_TIER1;
                 }
  
+               if (rspamd_language_search_str (nelt->name, tier0_langs,
+                               G_N_ELEMENTS (tier0_langs))) {
+                       nelt->flags |= RS_LANGUAGE_TIER0;
+               }
+
                 it = NULL;
                 ngramms = g_ptr_array_sized_new (freqs->len);
  
@@ -972,6 +980,7 @@ struct rspamd_frequency_sort_cbdata {
         gdouble mean;
  };
  
+static const gdouble tier0_adjustment = 1.2;
  static const gdouble tier1_adjustment = 0.8;
  static const gdouble frequency_adjustment = 0.8;
  
@@ -1021,6 +1030,14 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
                         probb_adjusted += cbd->std * tier1_adjustment;
                 }
  
+               if (canda->elt->flags & RS_LANGUAGE_TIER0) {
+                       proba_adjusted += cbd->std * tier0_adjustment;
+               }
+
+               if (candb->elt->flags & RS_LANGUAGE_TIER0) {
+                       probb_adjusted += cbd->std * tier0_adjustment;
+               }
+
                 if (proba_adjusted > probb_adjusted) {
                         return -1;
                 } else if (probb_adjusted > proba_adjusted) {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 29 Jan 2018 08:07:16 +0000 (08:07 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 29 Jan 2018 08:07:16 +0000 (08:07 +0000)