]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Lowercase ucs data
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 14 Jan 2018 23:05:08 +0000 (23:05 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 14 Jan 2018 23:05:08 +0000 (23:05 +0000)
src/libmime/lang_detection.c

index f73bc7aeaa0a95c63e9f99b79410b77c10da15e6..374e38f833b9a9fa2aa1ea9ce19f15c7a823caf4 100644 (file)
@@ -21,6 +21,7 @@
 #include <glob.h>
 #include <unicode/utf8.h>
 #include <unicode/ucnv.h>
+#include <unicode/uchar.h>
 #include <math.h>
 
 static const gsize default_short_text_limit = 200;
@@ -80,6 +81,16 @@ rspamd_trigram_equal (gconstpointer v, gconstpointer v2)
        return memcmp (v, v2, 3 * sizeof (UChar)) == 0;
 }
 
+static void
+rspamd_language_detector_ucs_lowercase (UChar *s, gsize len)
+{
+       gsize i;
+       /* Lowercase the first `len` elements of `s` in place; each element is replaced by u_tolower of itself. */
+       for (i = 0; i < len; i ++) {
+               s[i] = u_tolower (s[i]); /* NOTE(review): one call per array element; if `s` is UTF-16, surrogate pairs are not combined first -- confirm this is intended */
+       }
+}
+
 static void
 rspamd_language_detector_read_file (struct rspamd_config *cfg,
                struct rspamd_lang_detector *d,
@@ -149,6 +160,8 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                                continue;
                        }
 
+                       rspamd_language_detector_ucs_lowercase (ucs_key, nsym);
+
                        if (nsym == 2) {
                                /* We have a digraph */
                                g_hash_table_insert (nelt->bigramms, ucs_key,
@@ -270,6 +283,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
                        utf_token->begin, utf_token->len, &uc_err);
 
        if (nsym >= 0) {
+               rspamd_language_detector_ucs_lowercase (out, nsym);
                ucs_token->begin = (const gchar *) out;
                ucs_token->len = nsym;
        }