enum rspamd_language_elt_flags {
RS_LANGUAGE_DEFAULT = 0,
- RS_LANGUAGE_LATIN = (1 <<0),
+ RS_LANGUAGE_LATIN = (1 << 0),
+ RS_LANGUAGE_TIER1 = (1 << 1),
+ RS_LANGUAGE_TIER2 = (1 << 2),
};
+struct rspamd_language_unicode_match {
+ const gchar *lang;
+ gint unicode_code;
+};
+
+/*
+ * List of languages detected by unicode scripts
+ */
+static const struct rspamd_language_unicode_match unicode_langs[] = {
+ {"el", UBLOCK_GREEK},
+ {"ml", UBLOCK_MALAYALAM},
+ {"te", UBLOCK_TELUGU},
+ {"ta", UBLOCK_TAMIL},
+ {"gu", UBLOCK_GUJARATI},
+ {"th", UBLOCK_THAI},
+ {"kn", UBLOCK_KANNADA},
+ {"ka", UBLOCK_GEORGIAN},
+ {"si", UBLOCK_SINHALA},
+ {"hy", UBLOCK_ARMENIAN},
+ {"lo", UBLOCK_LAO},
+ {"km", UBLOCK_KHMER}
+};
+
+/*
+ * List of languages to apply unigramms only
+ */
+static const gchar *unigramms_langs[] = {
+ "ja",
+ "ko",
+ "zh-CN",
+ "zh-TW"
+};
+
+
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
guint unigramms_total; /* total frequencies for unigramms */
- guint bigramms_total; /* total frequencies for bigramms */
guint trigramms_total; /* total frequencies for trigramms */
guint occurencies; /* total number of parts with this language */
};
struct rspamd_lang_detector {
GPtrArray *languages;
GHashTable *unigramms; /* unigramms frequencies */
- GHashTable *bigramms; /* bigramms frequencies */
GHashTable *trigramms; /* trigramms frequencies */
+ GHashTable *unicode_scripts; /* indexed by unicode script */
UConverter *uchar_converter;
gsize short_text_limit;
gsize total_occurencies; /* number of all languages found */
return memcmp (v, v2, sizeof (UChar)) == 0;
}
-static guint
-rspamd_bigram_hash (gconstpointer key)
-{
- return rspamd_cryptobox_fast_hash (key, 2 * sizeof (UChar), rspamd_hash_seed ());
-}
-
-static gboolean
-rspamd_bigram_equal (gconstpointer v, gconstpointer v2)
-{
- return memcmp (v, v2, 2 * sizeof (UChar)) == 0;
-}
-
static guint
rspamd_trigram_hash (gconstpointer key)
{
target = d->unigramms;
break;
case 2:
- target = d->bigramms;
+ /* Ignore */
break;
case 3:
target = d->trigramms;
else {
nelt->unigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
0));
- nelt->bigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
- 1));
nelt->trigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
2));
}
rspamd_language_detector_ucs_lowercase (ucs_key, nsym);
if (nsym == 2) {
/* We have a digraph */
- total = nelt->bigramms_total;
+ continue;
}
else if (nsym == 3) {
total = nelt->trigramms_total;
nelt->flags |= RS_LANGUAGE_LATIN;
}
- msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
+ msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
nelt->name,
(gint)nelt->unigramms_total,
- (gint)nelt->bigramms_total,
(gint)nelt->trigramms_total);
g_ptr_array_add (d->languages, nelt);
/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
ret->unigramms = g_hash_table_new_full (rspamd_unigram_hash,
rspamd_unigram_equal, NULL, rspamd_ptr_array_free_hard);
- ret->bigramms = g_hash_table_new_full (rspamd_bigram_hash,
- rspamd_bigram_equal, NULL, rspamd_ptr_array_free_hard);
ret->trigramms = g_hash_table_new_full (rspamd_trigram_hash,
rspamd_trigram_equal, NULL, rspamd_ptr_array_free_hard);
g_free (fname);
}
- msg_info_config ("loaded %d languages, %d unigramms, %d bigramms, "
+ msg_info_config ("loaded %d languages, %d unigramms, "
"%d trigramms",
(gint)ret->languages->len,
g_hash_table_size (ret->unigramms),
- g_hash_table_size (ret->bigramms),
g_hash_table_size (ret->trigramms));
end:
if (gl.gl_pathc > 0) {
enum rspamd_language_gramm_type {
rs_unigramm = 0,
- rs_bigramm,
rs_trigramm
};
case rs_unigramm:
ngramms = d->unigramms;
break;
- case rs_bigramm:
- ngramms = d->bigramms;
- break;
case rs_trigramm:
ngramms = d->trigramms;
break;
case rs_unigramm:
wlen = 1;
break;
- case rs_bigramm:
- wlen = 2;
- break;
case rs_trigramm:
wlen = 3;
break;
candidates);
if (r == rs_detect_none) {
- msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words, d,
- ucs_tokens, rs_bigramm,
+ msg_debug_lang_det ("short mode; no trigramms found, "
+ "switch to unigramms");
+ r = rspamd_language_detector_try_ngramm (task, default_words,
+ d, ucs_tokens, rs_unigramm,
candidates);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det ("short mode; no trigramms found, "
- "switch to unigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words,
- d, ucs_tokens, rs_unigramm,
- candidates);
- }
}
else if (r == rs_detect_multiple) {
/* Check our guess */