From: Vsevolod Stakhov Date: Thu, 14 Feb 2019 17:27:01 +0000 (+0000) Subject: [Feature] Better escaping of unicode X-Git-Tag: 1.9.0~150 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9b2e2d70a83c5c679f917253bcdb733d4bbbe705;p=thirdparty%2Frspamd.git [Feature] Better escaping of unicode --- diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 06d7a6cc79..0defa2acf1 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, gsize *dst_len, enum rspamd_regexp_escape_flags flags) { const gchar *p, *end = pattern + slen; - gchar *res, *d, t, *tmp_utf = NULL; + gchar *res, *d, t, *tmp_utf = NULL, *dend; gsize len; static const gchar hexdigests[16] = "0123456789abcdef"; @@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, case '$': case '|': case '#': - len ++; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + len++; + } break; default: if (g_ascii_isspace (t)) { len ++; } else { - if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { - if (!g_ascii_isprint (t)) { + if (!g_ascii_isprint (t) || (t & 0x80)) { + + if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { + /* \x{code}, where code can be up to 5 digits */ + len += 4; + } + else { /* \\xHH -> 4 symbols */ len += 3; } @@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, *dst_len = slen; } - - if (tmp_utf) { return tmp_utf; } @@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, res = g_malloc (len + 1); p = pattern; d = res; + dend = d + len; while (p < end) { + g_assert (d < dend); t = *p ++; switch (t) { @@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, case '$': case '|': case '#': - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } break; case '*': case '?': @@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, *d++ = '.'; } else { - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } } break; default: if (g_ascii_isspace (t)) { - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } } - else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) { - *d++ = '\\'; - *d++ = 'x'; - *d++ = hexdigests[((t >> 4) & 0xF)]; - *d++ = hexdigests[((t) & 0xF)]; - continue; /* To avoid *d++ = t; */ + else if (t & 0x80 || !g_ascii_isprint (t)) { + if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { + *d++ = '\\'; + *d++ = 'x'; + *d++ = hexdigests[((t >> 4) & 0xF)]; + *d++ = hexdigests[((t) & 0xF)]; + continue; /* To avoid *d++ = t; */ + } + else { + if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) { + UChar32 uc; + gint32 off = p - pattern - 1; + U8_NEXT (pattern, off, slen, uc); + + if (uc > 0) { + d += rspamd_snprintf (d, dend - d, + "\\x{%xd}", uc); + p = pattern + off; + } + + continue; /* To avoid *d++ = t; */ + } + } } break; } diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 46b74001bf..34c1271d4f 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags { RSPAMD_REGEXP_ESCAPE_ASCII = 0, RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, + RSPAMD_REGEXP_ESCAPE_RE = 1u << 2, }; /** * Escapes special characters when reading plain data to be processed in pcre