git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Chartable: Adjustments to the metatokens handling
author Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 8 Dec 2022 21:36:36 +0000 (21:36 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 8 Dec 2022 21:36:36 +0000 (21:36 +0000)
src/plugins/chartable.cxx

index c5820c606551d534dcfd00684f4943142e6d7a75..6e3fd9b101cc9bcbfc8990ab953f69035a0f4163 100644 (file)
 INIT_LOG_MODULE(chartable)
 
 /* Initialization */
-gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
-gint chartable_module_config (struct rspamd_config *cfg, bool validate);
-gint chartable_module_reconfig (struct rspamd_config *cfg);
+gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
+
+gint chartable_module_config(struct rspamd_config *cfg, bool validate);
+
+gint chartable_module_reconfig(struct rspamd_config *cfg);
 
 module_t chartable_module = {
-               "chartable",
-               chartable_module_init,
-               chartable_module_config,
-               chartable_module_reconfig,
-               nullptr,
-               RSPAMD_MODULE_VER,
-               (guint)-1,
+       "chartable",
+       chartable_module_init,
+       chartable_module_config,
+       chartable_module_reconfig,
+       nullptr,
+       RSPAMD_MODULE_VER,
+       (guint) -1,
 };
 
 struct chartable_ctx {
@@ -68,21 +70,22 @@ struct chartable_ctx {
 };
 
 static inline struct chartable_ctx *
-chartable_get_context (struct rspamd_config *cfg)
+chartable_get_context(struct rspamd_config *cfg)
 {
-       return (struct chartable_ctx *)g_ptr_array_index (cfg->c_modules,
-                       chartable_module.ctx_offset);
+       return (struct chartable_ctx *) g_ptr_array_index(cfg->c_modules,
+               chartable_module.ctx_offset);
 }
 
-static void chartable_symbol_callback (struct rspamd_task *task,
-                                                                          struct rspamd_symcache_dynamic_item *item,
-                                                                          void *unused);
-static void chartable_url_symbol_callback (struct rspamd_task *task,
-                                                                                  struct rspamd_symcache_dynamic_item *item,
-                                                                                  void *unused);
+static void chartable_symbol_callback(struct rspamd_task *task,
+                                                                         struct rspamd_symcache_dynamic_item *item,
+                                                                         void *unused);
+
+static void chartable_url_symbol_callback(struct rspamd_task *task,
+                                                                                 struct rspamd_symcache_dynamic_item *item,
+                                                                                 void *unused);
 
 gint
-chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
+chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
 {
        struct chartable_ctx *chartable_module_ctx;
 
@@ -90,40 +93,40 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
                struct chartable_ctx);
        chartable_module_ctx->max_word_len = 10;
 
-       *ctx = (struct module_ctx *)chartable_module_ctx;
+       *ctx = (struct module_ctx *) chartable_module_ctx;
 
        return 0;
 }
 
 
 gint
-chartable_module_config (struct rspamd_config *cfg, bool validate)
+chartable_module_config(struct rspamd_config *cfg, bool _)
 {
        const ucl_object_t *value;
        gint res = TRUE;
-       struct chartable_ctx *chartable_module_ctx = chartable_get_context (cfg);
+       struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg);
 
-       if (!rspamd_config_is_module_enabled (cfg, "chartable")) {
+       if (!rspamd_config_is_module_enabled(cfg, "chartable")) {
                return TRUE;
        }
 
        if ((value =
-               rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != nullptr) {
-               chartable_module_ctx->symbol = ucl_obj_tostring (value);
+                        rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) {
+               chartable_module_ctx->symbol = ucl_obj_tostring(value);
        }
        else {
                chartable_module_ctx->symbol = DEFAULT_SYMBOL;
        }
        if ((value =
-               rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != nullptr) {
-               chartable_module_ctx->url_symbol = ucl_obj_tostring (value);
+                        rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) {
+               chartable_module_ctx->url_symbol = ucl_obj_tostring(value);
        }
        else {
                chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL;
        }
        if ((value =
-               rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != nullptr) {
-               if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) {
+                        rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) {
+               if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) {
                        msg_warn_config ("invalid numeric value");
                        chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
                }
@@ -132,37 +135,37 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
                chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
        }
        if ((value =
-                       rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != nullptr) {
-               chartable_module_ctx->max_word_len = ucl_object_toint (value);
+                        rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) {
+               chartable_module_ctx->max_word_len = ucl_object_toint(value);
        }
        else {
                chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
        }
 
-       rspamd_symcache_add_symbol (cfg->cache,
-                       chartable_module_ctx->symbol,
-                       0,
-                       chartable_symbol_callback,
-                       nullptr,
-                       SYMBOL_TYPE_NORMAL,
-                       -1);
-       rspamd_symcache_add_symbol (cfg->cache,
-                       chartable_module_ctx->url_symbol,
-                       0,
-                       chartable_url_symbol_callback,
-                       nullptr,
-                       SYMBOL_TYPE_NORMAL,
-                       -1);
-
-       msg_info_config ("init internal chartable module");
+       rspamd_symcache_add_symbol(cfg->cache,
+               chartable_module_ctx->symbol,
+               0,
+               chartable_symbol_callback,
+               nullptr,
+               SYMBOL_TYPE_NORMAL,
+               -1);
+       rspamd_symcache_add_symbol(cfg->cache,
+               chartable_module_ctx->url_symbol,
+               0,
+               chartable_url_symbol_callback,
+               nullptr,
+               SYMBOL_TYPE_NORMAL,
+               -1);
+
+       msg_info_config("init internal chartable module");
 
        return res;
 }
 
 gint
-chartable_module_reconfig (struct rspamd_config *cfg)
+chartable_module_reconfig(struct rspamd_config *cfg)
 {
-       return chartable_module_config (cfg, false);
+       return chartable_module_config(cfg, false);
 }
 
 static const auto latin_confusable = ankerl::unordered_dense::set<int>{
@@ -321,19 +324,18 @@ static const auto latin_confusable = ankerl::unordered_dense::set<int>{
 };
 
 static gboolean
-rspamd_can_alias_latin (gint ch)
+rspamd_can_alias_latin(gint ch)
 {
        return latin_confusable.contains(ch);
 }
 
 static gdouble
-rspamd_chartable_process_word_utf (struct rspamd_task *task,
-                                                                  rspamd_stat_token_t *w,
-                                                                  gboolean is_url,
-                                                                  guint *ncap,
-                                                                  struct chartable_ctx *chartable_module_ctx,
-                                                                  const gchar *lang,
-                                                                  gboolean ignore_diacritics)
+rspamd_chartable_process_word_utf(struct rspamd_task *task,
+                                                                 rspamd_stat_token_t *w,
+                                                                 gboolean is_url,
+                                                                 guint *ncap,
+                                                                 struct chartable_ctx *chartable_module_ctx,
+                                                                 gboolean ignore_diacritics)
 {
        const UChar32 *p, *end;
        gdouble badness = 0.0;
@@ -357,12 +359,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
        while (p < end) {
                uc = *p++;
 
-               if (((gint32)uc) < 0) {
+               if (((gint32) uc) < 0) {
                        break;
                }
 
-               sc = ublock_getCode (uc);
-               cat = u_charType (uc);
+               sc = ublock_getCode(uc);
+               cat = u_charType(uc);
 
                if (!ignore_diacritics) {
                        if (cat == U_NON_SPACING_MARK ||
@@ -375,10 +377,10 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                        }
                }
 
-               if (u_isalpha (uc)) {
+               if (u_isalpha(uc)) {
 
                        if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS ||
-                                       sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
+                               sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
                                /*
                                 * Assume all latin, IPA, diacritic and space modifiers
                                 * characters as basic latin
@@ -386,16 +388,16 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                                sc = UBLOCK_BASIC_LATIN;
                        }
 
-                       if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) {
+                       if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) {
                                if (ncap) {
-                                       (*ncap) ++;
+                                       (*ncap)++;
                                }
                        }
 
                        if (state == got_digit) {
                                /* Penalize digit -> alpha translations */
                                if (!is_url && sc != UBLOCK_BASIC_LATIN &&
-                                               prev_state != start_process) {
+                                       prev_state != start_process) {
                                        badness += 0.25;
                                }
                        }
@@ -404,15 +406,15 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                                if (same_script_count > 0) {
                                        if (sc != UBLOCK_BASIC_LATIN && last_is_latin) {
 
-                                               if (rspamd_can_alias_latin (uc)) {
-                                                       badness += 1.0 / (gdouble)same_script_count;
+                                               if (rspamd_can_alias_latin(uc)) {
+                                                       badness += 1.0 / (gdouble) same_script_count;
                                                }
 
                                                last_is_latin = 0;
                                                same_script_count = 1;
                                        }
                                        else {
-                                               same_script_count ++;
+                                               same_script_count++;
                                        }
                                }
                                else {
@@ -425,7 +427,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                        state = got_alpha;
 
                }
-               else if (u_isdigit (uc)) {
+               else if (u_isdigit(uc)) {
                        if (state != got_digit) {
                                prev_state = state;
                        }
@@ -443,7 +445,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                        same_script_count = 0;
                }
 
-               nsym ++;
+               nsym++;
        }
 
        if (nspecial > 0) {
@@ -467,17 +469,17 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
        }
 
        msg_debug_chartable ("word %*s, badness: %.2f",
-                       (gint)w->normalized.len, w->normalized.begin,
-                       badness);
+               (gint) w->normalized.len, w->normalized.begin,
+               badness);
 
        return badness;
 }
 
 static gdouble
-rspamd_chartable_process_word_ascii (struct rspamd_task *task,
-                                                                        rspamd_stat_token_t *w,
-                                                                        gboolean is_url,
-                                                                        struct chartable_ctx *chartable_module_ctx)
+rspamd_chartable_process_word_ascii(struct rspamd_task *task,
+                                                                       rspamd_stat_token_t *w,
+                                                                       gboolean is_url,
+                                                                       struct chartable_ctx *chartable_module_ctx)
 {
        gdouble badness = 0.0;
        enum {
@@ -516,12 +518,12 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
 
                                if (same_script_count > 0) {
                                        if (sc != last_sc) {
-                                               badness += 1.0 / (gdouble)same_script_count;
+                                               badness += 1.0 / (gdouble) same_script_count;
                                                last_sc = sc;
                                                same_script_count = 1;
                                        }
                                        else {
-                                               same_script_count ++;
+                                               same_script_count++;
                                        }
                                }
                                else {
@@ -544,7 +546,7 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
                        same_script_count = 0;
                }
 
-               p ++;
+               p++;
        }
 
        if (badness > 4.0) {
@@ -552,24 +554,24 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
        }
 
        msg_debug_chartable ("word %*s, badness: %.2f",
-                       (gint)w->normalized.len, w->normalized.begin,
-                       badness);
+               (gint) w->normalized.len, w->normalized.begin,
+               badness);
 
        return badness;
 }
 
 static gboolean
-rspamd_chartable_process_part (struct rspamd_task *task,
-                                                          struct rspamd_mime_text_part *part,
-                                                          struct chartable_ctx *chartable_module_ctx,
-                                                          gboolean ignore_diacritics)
+rspamd_chartable_process_part(struct rspamd_task *task,
+                                                         struct rspamd_mime_text_part *part,
+                                                         struct chartable_ctx *chartable_module_ctx,
+                                                         gboolean ignore_diacritics)
 {
        rspamd_stat_token_t *w;
        guint i, ncap = 0;
        gdouble cur_score = 0.0;
 
        if (part == nullptr || part->utf_words == nullptr ||
-                       part->utf_words->len == 0 || part->nwords == 0) {
+               part->utf_words->len == 0 || part->nwords == 0) {
                return FALSE;
        }
 
@@ -579,12 +581,12 @@ rspamd_chartable_process_part (struct rspamd_task *task,
                if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
 
                        if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
-                                               &ncap, chartable_module_ctx, part->language, ignore_diacritics);
+                               cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+                                       &ncap, chartable_module_ctx, ignore_diacritics);
                        }
                        else {
-                               cur_score += rspamd_chartable_process_word_ascii (task, w,
-                                               FALSE, chartable_module_ctx);
+                               cur_score += rspamd_chartable_process_word_ascii(task, w,
+                                       FALSE, chartable_module_ctx);
                        }
                }
        }
@@ -596,7 +598,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
         */
        part->capital_letters += ncap;
 
-       cur_score /= (gdouble)part->nwords;
+       cur_score /= (gdouble) part->nwords;
 
        if (cur_score > 1.0) {
                cur_score = 1.0;
@@ -604,7 +606,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 
        if (cur_score > chartable_module_ctx->threshold) {
                rspamd_task_insert_result (task, chartable_module_ctx->symbol,
-                               cur_score, nullptr);
+                       cur_score, nullptr);
                return TRUE;
        }
 
@@ -612,37 +614,37 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 }
 
 static void
-chartable_symbol_callback (struct rspamd_task *task,
-               struct rspamd_symcache_dynamic_item *item,
-               void *unused)
+chartable_symbol_callback(struct rspamd_task *task,
+                                                 struct rspamd_symcache_dynamic_item *item,
+                                                 void *_)
 {
        guint i;
        struct rspamd_mime_text_part *part;
-       struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);
-       const gchar *language = nullptr;
-       gboolean ignore_diacritics = FALSE, seen_violated_part = FALSE;
+       struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg);
+       gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE;
 
        /* Check if we have parts with diacritic symbols language */
-       PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
+       PTR_ARRAY_FOREACH (MESSAGE_FIELD(task, text_parts), i, part) {
                if (part->languages && part->languages->len > 0) {
-                       struct rspamd_lang_detector_res *lang =
-                                       (struct rspamd_lang_detector_res *)g_ptr_array_index (part->languages, 0);
+                       auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0);
                        gint flags;
 
-                       flags = rspamd_language_detector_elt_flags (lang->elt);
+                       flags = rspamd_language_detector_elt_flags(lang->elt);
 
-                       if (flags & RS_LANGUAGE_DIACRITICS) {
+                       if ((flags & RS_LANGUAGE_DIACRITICS)) {
                                ignore_diacritics = TRUE;
                        }
+                       else if (lang->prob > 0.75) {
+                               ignore_diacritics = FALSE;
+                       }
                }
 
-               if (rspamd_chartable_process_part (task, part, chartable_module_ctx,
-                               ignore_diacritics)) {
+               if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) {
                        seen_violated_part = TRUE;
                }
        }
 
-       if (MESSAGE_FIELD (task, text_parts)->len == 0) {
+       if (MESSAGE_FIELD(task, text_parts)->len == 0) {
                /* No text parts, assume that we should ignore diacritics checks for metatokens */
                ignore_diacritics = TRUE;
        }
@@ -653,12 +655,12 @@ chartable_symbol_callback (struct rspamd_task *task,
                gsize arlen = task->meta_words->len;
 
                for (i = 0; i < arlen; i++) {
-                       w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
-                       cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
-                                       nullptr, chartable_module_ctx, language, ignore_diacritics);
+                       w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+                       cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+                               nullptr, chartable_module_ctx, ignore_diacritics);
                }
 
-               cur_score /= (gdouble)arlen;
+               cur_score /= (gdouble) (arlen + 1);
 
                if (cur_score > 1.0) {
                        cur_score = 1.0;
@@ -672,19 +674,19 @@ chartable_symbol_callback (struct rspamd_task *task,
                                }
                        }
 
-                       rspamd_task_insert_result (task, chartable_module_ctx->symbol,
-                                       cur_score, "subject");
+                       rspamd_task_insert_result(task, chartable_module_ctx->symbol,
+                               cur_score, "subject");
 
                }
        }
 
-       rspamd_symcache_finalize_item (task, item);
+       rspamd_symcache_finalize_item(task, item);
 }
 
 static void
-chartable_url_symbol_callback (struct rspamd_task *task,
-               struct rspamd_symcache_dynamic_item *item,
-               void *unused)
+chartable_url_symbol_callback(struct rspamd_task *task,
+                                                         struct rspamd_symcache_dynamic_item *item,
+                                                         void *unused)
 {
        /* XXX: TODO: unbreak module once URLs unicode project is over */
 #if 0
@@ -751,5 +753,5 @@ chartable_url_symbol_callback (struct rspamd_task *task,
 
        }
 #endif
-       rspamd_symcache_finalize_item (task, item);
+       rspamd_symcache_finalize_item(task, item);
 }