/*
* One row of the script-to-language table: ties a language name to the
* Unicode script that identifies it, plus optional per-language flags.
*/
struct rspamd_language_unicode_match {
const char *lang; /* language name as used in unicode_langs, e.g. "el" or "th" */
int unicode_code; /* script identifier; unicode_langs stores RSPAMD_UNICODE_* values here */
+ int flags; /* bitmask of enum rspamd_language_elt_flags; 0 means no extra flags */
};
/*
* List of languages detected by Unicode scripts.
* Each row is {language name, script code, flags}. The flags column holds
* enum rspamd_language_elt_flags bits (0 = none); non-zero flags are OR-ed
* into the flags of the language element that has the same name.
*/
static const struct rspamd_language_unicode_match unicode_langs[] = {
- {"el", RSPAMD_UNICODE_GREEK},
- {"ml", RSPAMD_UNICODE_MALAYALAM},
- {"te", RSPAMD_UNICODE_TELUGU},
- {"ta", RSPAMD_UNICODE_TAMIL},
- {"gu", RSPAMD_UNICODE_GUJARATI},
- {"th", RSPAMD_UNICODE_THAI},
- {"ka", RSPAMD_UNICODE_GEORGIAN},
- {"si", RSPAMD_UNICODE_SINHALA},
- {"hy", RSPAMD_UNICODE_ARMENIAN},
- {"ja", RSPAMD_UNICODE_JP},
- {"ko", RSPAMD_UNICODE_HANGUL},
+ {"el", RSPAMD_UNICODE_GREEK, 0},
+ {"ml", RSPAMD_UNICODE_MALAYALAM, 0},
+ {"te", RSPAMD_UNICODE_TELUGU, 0},
+ {"ta", RSPAMD_UNICODE_TAMIL, 0},
+ {"gu", RSPAMD_UNICODE_GUJARATI, 0},
+ {"th", RSPAMD_UNICODE_THAI, RS_LANGUAGE_DIACRITICS}, /* Thai is marked as having diacritics to prevent R_MIXED_CHARSET false positives */
+ {"ka", RSPAMD_UNICODE_GEORGIAN, 0},
+ {"si", RSPAMD_UNICODE_SINHALA, 0},
+ {"hy", RSPAMD_UNICODE_ARMENIAN, 0},
+ {"ja", RSPAMD_UNICODE_JP, 0},
+ {"ko", RSPAMD_UNICODE_HANGUL, 0},
};
/*
g_assert(ret > 0); /* must be unique */
kh_value(d->languages, k) = nelt;
- /* Mark Thai as having diacritics to prevent R_MIXED_CHARSET false positives */
- if (strcmp(nelt->name, "th") == 0) {
- nelt->flags |= RS_LANGUAGE_DIACRITICS;
- msg_debug_lang_det_cfg("marked Thai language as having diacritics");
+ /* Apply flags from unicode_langs structure for glyph-based languages */
+ const struct rspamd_language_unicode_match *unicode_match;
+ unicode_match = rspamd_language_search_unicode_match(nelt->name, unicode_langs,
+ G_N_ELEMENTS(unicode_langs));
+ if (unicode_match != NULL && unicode_match->flags != 0) {
+ nelt->flags |= unicode_match->flags;
+ msg_debug_lang_det_cfg("applied flags from unicode_langs for language %s: %s",
+ nelt->name,
+ rspamd_language_detector_print_flags(nelt));
}
ucl_object_unref(top);