From: Vsevolod Stakhov Date: Sat, 8 Sep 2018 15:40:05 +0000 (+0100) Subject: [Fix] Fix various corner cases for language detection X-Git-Tag: 1.8.0~163 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e81e8ffb30b953c42e52e5bf20d97f820e8b08e0;p=thirdparty%2Frspamd.git [Fix] Fix various corner cases for language detection --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 64a602e7b7..fbc5f56c90 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1608,8 +1608,10 @@ rspamd_language_detector_detect (struct rspamd_task *task, candidates); if (r == rs_detect_none) { - msg_debug_lang_det ("no trigramms found, switch to nothing"); - } else if (r == rs_detect_multiple) { + msg_debug_lang_det ("no trigramms found, fallback to english"); + rspamd_language_detector_set_language (task, part, "en"); + } + else if (r == rs_detect_multiple) { /* Check our guess */ mean = 0.0; @@ -1656,34 +1658,38 @@ rspamd_language_detector_detect (struct rspamd_task *task, } /* Now, convert hash to array and sort it */ - result = g_ptr_array_sized_new (kh_size (candidates)); + if (r != rs_detect_none && kh_size (candidates) > 0) { + result = g_ptr_array_sized_new (kh_size (candidates)); - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, - cand->prob); - g_ptr_array_add (result, cand); - } - }); + kh_foreach_value (candidates, cand, { + if (!isnan (cand->prob)) { + msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, + cand->prob); + g_ptr_array_add (result, cand); + } + }); - if (frequency_heuristic_applied) { - g_ptr_array_sort_with_data (result, - rspamd_language_detector_cmp_heuristic, (gpointer) &cbd); - } else { - g_ptr_array_sort (result, rspamd_language_detector_cmp); - } + if (frequency_heuristic_applied) { + g_ptr_array_sort_with_data (result, + rspamd_language_detector_cmp_heuristic, (gpointer) &cbd); + } else { + g_ptr_array_sort (result, rspamd_language_detector_cmp); + } - kh_destroy (rspamd_candidates_hash, candidates); + if (result->len > 0 && !frequency_heuristic_applied) { + cand = g_ptr_array_index (result, 0); + cand->elt->occurencies++; + d->total_occurencies++; + } - if (result->len > 0 && !frequency_heuristic_applied) { - cand = g_ptr_array_index (result, 0); - cand->elt->occurencies++; - d->total_occurencies++; + part->languages = result; + ret = TRUE; + } + else if (part->languages == NULL) { + rspamd_language_detector_set_language (task, part, "en"); } - part->languages = result; - - ret = TRUE; + kh_destroy (rspamd_candidates_hash, candidates); } end_ticks = rspamd_get_ticks (TRUE); diff --git a/src/libmime/message.c b/src/libmime/message.c index 70a08a06ff..0d4581ad7b 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -223,13 +223,17 @@ rspamd_mime_part_detect_language (struct rspamd_task *task, { struct rspamd_lang_detector_res *lang; - if (part->utf_words && task->lang_det) { + if (!IS_PART_EMPTY (part) && part->utf_words && part->utf_words->len > 0 && + task->lang_det) { if (rspamd_language_detector_detect (task, task->lang_det, part)) { lang = g_ptr_array_index (part->languages, 0); part->language = lang->lang; msg_info_task ("detected part language: %s", part->language); } + else { + part->language = "en"; /* Safe fallback */ + } } }