From: Vsevolod Stakhov Date: Mon, 15 Jan 2018 21:10:33 +0000 (+0000) Subject: [Minor] Improve language detection debug logging X-Git-Tag: 1.7.0~274 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=98f063228f4ca570ec8d40bc1a07bc785fe94e14;p=thirdparty%2Frspamd.git [Minor] Improve language detection debug logging --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 17042f1a95..4daed73d13 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -45,6 +45,11 @@ struct rspamd_lang_detector { gsize short_text_limit; }; +#define msg_debug_lang_det(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \ + "langdet", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + static guint rspamd_unigram_hash (gconstpointer key) { @@ -406,7 +411,8 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, * Do full guess for a specific ngramm, checking all languages defined */ static void -rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d, +rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, + struct rspamd_lang_detector *d, UChar *window, enum rspamd_language_gramm_type type, GHashTable *candidates) { @@ -459,7 +465,8 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d, * Check only candidates, if none found, switch to full version */ static gboolean -rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d, +rspamd_language_detector_process_ngramm_update (struct rspamd_task *task, + struct rspamd_lang_detector *d, UChar *window, enum rspamd_language_gramm_type type, GHashTable *candidates) { @@ -500,7 +507,8 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d, if (total_freq == 0) { /* Nothing found , do full scan which will also update candidates */ - rspamd_language_detector_process_ngramm_full (d, window, type, candidates); + rspamd_language_detector_process_ngramm_full (task, d, window, + type, candidates); return FALSE; } @@ -509,7 +517,8 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d, } static gboolean -rspamd_language_detector_update_guess (struct rspamd_lang_detector *d, +rspamd_language_detector_update_guess (struct rspamd_task *task, + struct rspamd_lang_detector *d, rspamd_stat_token_t *tok, GHashTable *candidates, enum rspamd_language_gramm_type type) { @@ -535,14 +544,14 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d, != -1) { if (rspamd_random_double_fast () > update_prob) { - if (!rspamd_language_detector_process_ngramm_update (d, window, + if (!rspamd_language_detector_process_ngramm_update (task, d, window, type, candidates)) { ret = FALSE; } } else { /* Try to do full update in case if we are missing some candidates */ - rspamd_language_detector_process_ngramm_full (d, window, type, + rspamd_language_detector_process_ngramm_full (task, d, window, type, candidates); } } @@ -551,7 +560,8 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d, } static void -rspamd_language_detector_detect_word (struct rspamd_lang_detector *d, +rspamd_language_detector_detect_word (struct rspamd_task *task, + struct rspamd_lang_detector *d, rspamd_stat_token_t *tok, GHashTable *candidates, enum rspamd_language_gramm_type type) { @@ -574,7 +584,8 @@ rspamd_language_detector_detect_word (struct rspamd_lang_detector *d, /* Split words */ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) != -1) { - rspamd_language_detector_process_ngramm_full (d, window, type, candidates); + rspamd_language_detector_process_ngramm_full (task, + d, window, type, candidates); } } @@ -583,11 +594,13 @@ rspamd_language_detector_detect_word (struct rspamd_lang_detector *d, * has the lowest probabilities */ static void -rspamd_language_detector_filter_negligible (GHashTable *candidates) +rspamd_language_detector_filter_negligible (struct rspamd_task *task, + GHashTable *candidates) { GHashTableIter it; gpointer k, v; struct rspamd_lang_detector_res *cand; + guint filtered = 0; gdouble max_prob = -(G_MAXDOUBLE); /* Normalize step */ @@ -618,43 +631,51 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates) * prob2 is 2^4 less than prob1 */ if (max_prob - cand->prob > 1.5) { + msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)", + cand->lang, cand->prob, max_prob); g_hash_table_iter_remove (&it); + filtered ++; } } + + msg_debug_lang_det ("removed %d languages", filtered); } static void -rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, +rspamd_language_detector_detect_type (struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, GArray *ucs_tokens, GHashTable *candidates, enum rspamd_language_gramm_type type, gboolean start_over) { - guint nparts = MIN (ucs_tokens->len, default_words); + guint nparts = MIN (ucs_tokens->len, nwords); goffset *selected_words; rspamd_stat_token_t *tok; guint i; selected_words = g_new0 (goffset, nparts); rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words); + msg_debug_lang_det ("randomly selected %d words", nparts); /* Deal with the first word in a special case */ tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]); if (start_over) { - rspamd_language_detector_detect_word (d, tok, candidates, type); + rspamd_language_detector_detect_word (task, d, tok, candidates, type); } else { - rspamd_language_detector_update_guess (d, tok, candidates, type); + rspamd_language_detector_update_guess (task, d, tok, candidates, type); } for (i = 1; i < nparts; i ++) { tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]); - rspamd_language_detector_update_guess (d, tok, candidates, type); + rspamd_language_detector_update_guess (task, d, tok, candidates, type); } /* Filter negligible candidates */ - rspamd_language_detector_filter_negligible (candidates); + rspamd_language_detector_filter_negligible (task, candidates); } static gint @@ -681,14 +702,16 @@ enum rspamd_language_detected_type { }; static enum rspamd_language_detected_type -rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d, +rspamd_language_detector_try_ngramm (struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, GArray *ucs_tokens, enum rspamd_language_gramm_type type, GHashTable *candidates) { guint cand_len; - rspamd_language_detector_detect_type (d, ucs_tokens, candidates, + rspamd_language_detector_detect_type (task, nwords, d, ucs_tokens, candidates, type, TRUE); cand_len = g_hash_table_size (candidates); @@ -704,7 +727,8 @@ rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d, } GPtrArray * -rspamd_language_detector_detect (struct rspamd_lang_detector *d, +rspamd_language_detector_detect (struct rspamd_task *task, + struct rspamd_lang_detector *d, GArray *ucs_tokens, gsize words_len) { GHashTable *candidates, *tcandidates; @@ -724,34 +748,46 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, if (words_len < d->short_text_limit) { /* For short text, start directly from trigramms */ - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm, + msg_debug_lang_det ("text is less than %z words: %z, start with trigramms", + d->short_text_limit, words_len); + r = rspamd_language_detector_try_ngramm (task, default_words, d, + ucs_tokens, rs_trigramm, candidates); if (r == rs_detect_none) { - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm, + msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms"); + r = rspamd_language_detector_try_ngramm (task, default_words, d, + ucs_tokens, rs_bigramm, candidates); if (r == rs_detect_none) { - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm, + msg_debug_lang_det ("short mode; no trigramms found, " + "switch to unigramms"); + r = rspamd_language_detector_try_ngramm (task, default_words, + d, ucs_tokens, rs_unigramm, candidates); } } } else { /* Start with unigramms */ - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm, + r = rspamd_language_detector_try_ngramm (task, default_words, + d, ucs_tokens, rs_unigramm, candidates); switch (r) { case rs_detect_none: case rs_detect_single: - /* No unigramms found or single set found, no reason to continue */; + msg_debug_lang_det ("no unigramms found, try bigramms"); break; case rs_detect_multiple: /* Try to improve guess */ + msg_debug_lang_det ("unigramms pass finished, found %d candidates", + (gint)g_hash_table_size (candidates)); tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, NULL, g_free); - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm, + r = rspamd_language_detector_try_ngramm (task, default_words, + d, ucs_tokens, rs_trigramm, tcandidates); switch (r) { @@ -789,7 +825,8 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, g_hash_table_unref (candidates); candidates = tcandidates; - msg_err ("trigramms checked, %.3f mean, %.4f stddev", mean, std); + msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev", + mean, std); if (std / fabs (mean) < 0.01) { /* Try trigramms */ @@ -797,7 +834,10 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, rspamd_str_equal, NULL, g_free); - r = rspamd_language_detector_try_ngramm (d, ucs_tokens, + r = rspamd_language_detector_try_ngramm (task, + default_words * 2, + d, + ucs_tokens, rs_trigramm, tcandidates); @@ -819,7 +859,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, while (g_hash_table_iter_next (&it, &k, &v)) { cand = (struct rspamd_lang_detector_res *) v; - msg_err ("%s -> %.2f", cand->lang, cand->prob); + msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, cand->prob); g_ptr_array_add (result, cand); g_hash_table_iter_steal (&it); } diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index 048e425f6c..0058801b8a 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -23,6 +23,7 @@ struct rspamd_lang_detector; struct rspamd_language_elt; +struct rspamd_task; struct rspamd_lang_detector_res { gdouble prob; @@ -54,7 +55,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, * @param words_len * @return array of struct rspamd_lang_detector_res sorted by freq descending */ -GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d, +GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, + struct rspamd_lang_detector *d, GArray *ucs_tokens, gsize words_len); #endif diff --git a/src/libmime/message.c b/src/libmime/message.c index 2a78011001..49cbc585c5 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -107,7 +107,8 @@ rspamd_extract_words (struct rspamd_task *task, } } - part->languages = rspamd_language_detector_detect (task->lang_det, + part->languages = rspamd_language_detector_detect (task, + task->lang_det, part->ucs32_words, ucs_len); if (part->languages->len > 0) {