From: Prem Date: Sat, 3 Jan 2026 11:26:04 +0000 (+0530) Subject: modified as suggested X-Git-Tag: 3.14.3~12^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b5d82dc857e70473db6e93d9a8a6b4a033b159e1;p=thirdparty%2Frspamd.git modified as suggested --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 167b44f565..82997a7749 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -41,23 +41,24 @@ static const char *default_languages_path = RSPAMD_SHAREDIR "/languages"; struct rspamd_language_unicode_match { const char *lang; int unicode_code; + int flags; /* enum rspamd_language_elt_flags */ }; /* * List of languages detected by unicode scripts */ static const struct rspamd_language_unicode_match unicode_langs[] = { - {"el", RSPAMD_UNICODE_GREEK}, - {"ml", RSPAMD_UNICODE_MALAYALAM}, - {"te", RSPAMD_UNICODE_TELUGU}, - {"ta", RSPAMD_UNICODE_TAMIL}, - {"gu", RSPAMD_UNICODE_GUJARATI}, - {"th", RSPAMD_UNICODE_THAI}, - {"ka", RSPAMD_UNICODE_GEORGIAN}, - {"si", RSPAMD_UNICODE_SINHALA}, - {"hy", RSPAMD_UNICODE_ARMENIAN}, - {"ja", RSPAMD_UNICODE_JP}, - {"ko", RSPAMD_UNICODE_HANGUL}, + {"el", RSPAMD_UNICODE_GREEK, 0}, + {"ml", RSPAMD_UNICODE_MALAYALAM, 0}, + {"te", RSPAMD_UNICODE_TELUGU, 0}, + {"ta", RSPAMD_UNICODE_TAMIL, 0}, + {"gu", RSPAMD_UNICODE_GUJARATI, 0}, + {"th", RSPAMD_UNICODE_THAI, RS_LANGUAGE_DIACRITICS}, + {"ka", RSPAMD_UNICODE_GEORGIAN, 0}, + {"si", RSPAMD_UNICODE_SINHALA, 0}, + {"hy", RSPAMD_UNICODE_ARMENIAN, 0}, + {"ja", RSPAMD_UNICODE_JP, 0}, + {"ko", RSPAMD_UNICODE_HANGUL, 0}, }; /* @@ -695,10 +696,15 @@ rspamd_language_detector_read_file(struct rspamd_config *cfg, g_assert(ret > 0); /* must be unique */ kh_value(d->languages, k) = nelt; - /* Mark Thai as having diacritics to prevent R_MIXED_CHARSET false positives */ - if (strcmp(nelt->name, "th") == 0) { - nelt->flags |= RS_LANGUAGE_DIACRITICS; - msg_debug_lang_det_cfg("marked Thai language as having diacritics"); + /* Apply flags from unicode_langs structure for glyph-based languages */ + const struct rspamd_language_unicode_match *unicode_match; + unicode_match = rspamd_language_search_unicode_match(nelt->name, unicode_langs, + G_N_ELEMENTS(unicode_langs)); + if (unicode_match != NULL && unicode_match->flags != 0) { + nelt->flags |= unicode_match->flags; + msg_debug_lang_det_cfg("applied flags from unicode_langs for language %s: %s", + nelt->name, + rspamd_language_detector_print_flags(nelt)); } ucl_object_unref(top);