#include <unicode/ustring.h>
#include <math.h>
-static const gsize default_short_text_limit = 200;
+static const gsize default_short_text_limit = 20;
static const gsize default_words = 80;
static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
}
if (!ret) {
- candidates = kh_init (rspamd_candidates_hash);
- kh_resize (rspamd_candidates_hash, candidates, 32);
-
- r = rspamd_language_detector_try_ngramm (task,
- default_words,
- d,
- part->utf_words,
- cat,
- candidates);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det ("no trigramms found, fallback to english");
+ if (part->utf_words->len < default_short_text_limit) {
+ r = rs_detect_none;
+ msg_debug_lang_det ("text is too short for trigramms detection: "
+ "%d words; at least %d words required",
+ (int)part->utf_words->len,
+ (int)default_short_text_limit);
rspamd_language_detector_set_language (task, part, "en");
}
- else if (r == rs_detect_multiple) {
- /* Check our guess */
-
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
-
- /* Check distirbution */
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- mean += cand->prob;
- cand_len++;
- }
- });
-
- if (cand_len > 0) {
- mean /= cand_len;
-
+ else {
+ candidates = kh_init (rspamd_candidates_hash);
+ kh_resize (rspamd_candidates_hash, candidates, 32);
+
+ r = rspamd_language_detector_try_ngramm (task,
+ default_words,
+ d,
+ part->utf_words,
+ cat,
+ candidates);
+
+ if (r == rs_detect_none) {
+ msg_debug_lang_det ("no trigramms found, fallback to english");
+ rspamd_language_detector_set_language (task, part, "en");
+ } else if (r == rs_detect_multiple) {
+ /* Check our guess */
+
+ mean = 0.0;
+ std = 0.0;
+ cand_len = 0;
+
+ /* Check distribution */
kh_foreach_value (candidates, cand, {
- gdouble err;
if (!isnan (cand->prob)) {
- err = cand->prob - mean;
- std += fabs (err);
+ mean += cand->prob;
+ cand_len++;
}
});
- std /= cand_len;
- }
+ if (cand_len > 0) {
+ mean /= cand_len;
- msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
+ kh_foreach_value (candidates, cand, {
+ gdouble err;
+ if (!isnan (cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs (err);
+ }
+ });
- if (cand_len > 0 && std / fabs (mean) < 0.25) {
- msg_debug_lang_det ("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+ std /= cand_len;
+ }
- if (part->utf_words->len < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
+
+ if (cand_len > 0 && std / fabs (mean) < 0.25) {
+ msg_debug_lang_det ("apply frequency heuristic sorting");
+ frequency_heuristic_applied = TRUE;
+ cbd.d = d;
+ cbd.mean = mean;
+ cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+ if (part->utf_words->len < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
}
}
}