git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Properly escape utf8 regexps in hyperscan mode
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 20 Nov 2018 20:44:49 +0000 (20:44 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 20 Nov 2018 20:45:22 +0000 (20:45 +0000)
src/libmime/lang_detection.c
src/libutil/map_helpers.c
src/libutil/multipattern.c
src/libutil/str_util.c
src/libutil/str_util.h
src/lua/lua_regexp.c

index 49e78843851a179b86ae738b76f9e92d25080f57..102117b214604fe7d4bc741e83bc16f857b83654 100644 (file)
@@ -458,8 +458,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                                const char *word = ucl_object_tolstring (w, &wlen);
                                const char *saved;
 
-                               rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
-                                               word, wlen);
+                               rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp,
+                                               word, wlen,
+                                               RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
                                nelt->stop_words ++;
                                nstop ++;
 
index e6b940f23a1990b295eaa5fe3e1bfe12c0e4b4cf..4c34cba158b2b995ba3e1cbbc2778c6c42ceff44 100644 (file)
@@ -540,7 +540,7 @@ rspamd_map_helper_insert_re (gpointer st, gconstpointer key, gconstpointer value
 
        if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) {
                escaped = rspamd_str_regexp_escape (key, strlen (key), &escaped_len,
-                               TRUE);
+                               RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
                re = rspamd_regexp_new (escaped, NULL, &err);
                g_free (escaped);
        }
index 94b5398b34f7a04bdae3e876c875e53f01334a6e..e4a39d5fee8b505f97b454234d111fac8a6ef900 100644 (file)
@@ -193,6 +193,12 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
        gchar *ret = NULL;
 #ifdef WITH_HYPERSCAN
        if (rspamd_hs_check ()) {
+               gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
+
+               if (flags & RSPAMD_MULTIPATTERN_UTF8) {
+                       gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
+               }
+
                if (flags & RSPAMD_MULTIPATTERN_TLD) {
                        ret = rspamd_multipattern_escape_tld_hyperscan (pattern, len, dst_len);
                }
@@ -201,10 +207,11 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
                        *dst_len = rspamd_strlcpy (ret, pattern, len + 1);
                }
                else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
-                       ret = rspamd_str_regexp_escape (pattern, len, dst_len, TRUE);
+                       ret = rspamd_str_regexp_escape (pattern, len, dst_len,
+                                       gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
                }
                else {
-                       ret = rspamd_str_regexp_escape (pattern, len, dst_len, FALSE);
+                       ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags);
                }
 
                return ret;
index f798d9eeb445dedf3953f725537eb26d7994d4c5..be7323df3079f916fd7787d1f422861345832c2a 100644 (file)
@@ -2327,10 +2327,10 @@ out:
 
 gchar *
 rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
-               gsize *dst_len, gboolean allow_glob)
+               gsize *dst_len, enum rspamd_regexp_escape_flags flags)
 {
        const gchar *p, *end = pattern + slen;
-       gchar *res, *d, t;
+       gchar *res, *d, t, *tmp_utf = NULL;
        gsize len;
        static const gchar hexdigests[16] = "0123456789abcdef";
 
@@ -2365,20 +2365,46 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                        if (g_ascii_isspace (t)) {
                                len ++;
                        }
-                       else if (!g_ascii_isprint (t)) {
-                               /* \\xHH -> 4 symbols */
-                               len += 3;
+                       else {
+                               if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
+                                       if (!g_ascii_isprint (t)) {
+                                               /* \\xHH -> 4 symbols */
+                                               len += 3;
+                                       }
+                               }
                        }
                        break;
                }
        }
 
+       if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+               if (!g_utf8_validate (pattern, slen, NULL)) {
+                       tmp_utf = g_utf8_make_valid (pattern, slen);
+               }
+       }
+
        if (slen == len) {
                if (dst_len) {
+
+                       if (tmp_utf) {
+                               slen = strlen (tmp_utf);
+                       }
+
                        *dst_len = slen;
                }
 
-               return g_strdup (pattern);
+
+
+               if (tmp_utf) {
+                       return tmp_utf;
+               }
+               else {
+                       return g_strdup (pattern);
+               }
+       }
+
+       if (tmp_utf) {
+               pattern = tmp_utf;
        }
 
        res = g_malloc (len + 1);
@@ -2408,7 +2434,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                case '*':
                case '?':
                case '+':
-                       if (allow_glob) {
+                       if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) {
                                /* Treat * as .* and ? as .? */
                                *d++ = '.';
                        }
@@ -2420,7 +2446,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                        if (g_ascii_isspace (t)) {
                                *d++ = '\\';
                        }
-                       else if (!g_ascii_isgraph (t)) {
+                       else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
                                *d++ = '\\';
                                *d++ = 'x';
                                *d++ = hexdigests[((t >> 4) & 0xF)];
@@ -2439,5 +2465,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                *dst_len = d - res;
        }
 
+       if (tmp_utf) {
+               g_free (tmp_utf);
+       }
+
        return res;
 }
index 100b64b88d6804bee7397fc5f1dcae4bf32e06e2..ffcc691972dc612d5a04ed2ae2af561755e9fb76 100644 (file)
@@ -396,6 +396,11 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
 gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
                gchar *start, guint *len);
 
+enum rspamd_regexp_escape_flags { /* Bit flags passed as the last argument of rspamd_str_regexp_escape */
+       RSPAMD_REGEXP_ESCAPE_ASCII = 0, /* No bits set: bytes failing the ASCII printable/graph checks are written as \xHH */
+       RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, /* Input is checked with g_utf8_validate (repaired via g_utf8_make_valid if invalid); \xHH escaping is skipped */
+       RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, /* Glob mode: a '.' is written before '*', '?' and '+' (e.g. * is treated as .*) */
+};
 /**
  * Escapes special characters when reading plain data to be processed in pcre
  * @param pattern pattern to process
@@ -406,6 +411,6 @@ gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
  */
 gchar *
 rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
-               gsize *dst_len, gboolean allow_glob);
+               gsize *dst_len, enum rspamd_regexp_escape_flags flags);
 
 #endif /* SRC_LIBUTIL_STR_UTIL_H_ */
index 584f7d8c1c8d975db1c69f8cbff44e309a998473..4e233448b83060ab2648bff6f8a844123ce272af 100644 (file)
@@ -191,7 +191,8 @@ lua_regexp_import_glob (lua_State *L)
        }
 
        if (string) {
-               escaped = rspamd_str_regexp_escape (string, pat_len, NULL, TRUE);
+               escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+                               RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
 
                re = rspamd_regexp_new (escaped, flags_str, &err);
 
@@ -249,7 +250,8 @@ lua_regexp_import_plain (lua_State *L)
        }
 
        if (string) {
-               escaped = rspamd_str_regexp_escape (string, pat_len, NULL, FALSE);
+               escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+                               RSPAMD_REGEXP_ESCAPE_ASCII);
 
                re = rspamd_regexp_new (escaped, flags_str, &err);