const char *word = ucl_object_tolstring (w, &wlen);
const char *saved;
- rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
- word, wlen);
+ rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp,
+ word, wlen,
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
nelt->stop_words ++;
nstop ++;
if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) {
escaped = rspamd_str_regexp_escape (key, strlen (key), &escaped_len,
- TRUE);
+ RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
re = rspamd_regexp_new (escaped, NULL, &err);
g_free (escaped);
}
gchar *ret = NULL;
#ifdef WITH_HYPERSCAN
if (rspamd_hs_check ()) {
+ gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
+
+ if (flags & RSPAMD_MULTIPATTERN_UTF8) {
+ gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
+ }
+
if (flags & RSPAMD_MULTIPATTERN_TLD) {
ret = rspamd_multipattern_escape_tld_hyperscan (pattern, len, dst_len);
}
*dst_len = rspamd_strlcpy (ret, pattern, len + 1);
}
else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
- ret = rspamd_str_regexp_escape (pattern, len, dst_len, TRUE);
+ ret = rspamd_str_regexp_escape (pattern, len, dst_len,
+ gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
}
else {
- ret = rspamd_str_regexp_escape (pattern, len, dst_len, FALSE);
+ ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags);
}
return ret;
/*
 * Builds an escaped copy of `pattern` suitable for use as a regexp source.
 * Only parts of the body are visible in this change; these notes describe
 * what the visible lines do.
 *
 *  - A sizing pass accumulates `len`, the size of the escaped output
 *    (presumably starting from slen; the initialisation is not shown).
 *  - With RSPAMD_REGEXP_ESCAPE_UTF set, input that fails g_utf8_validate()
 *    is replaced by a g_utf8_make_valid() copy, and bytes outside printable
 *    ASCII are no longer rewritten as \xHH.
 *  - With RSPAMD_REGEXP_ESCAPE_GLOB set, a '.' is written in the
 *    '*' / '?' / '+' case.
 *
 * Every visible return hands back a heap string (g_strdup(),
 * g_utf8_make_valid() or g_malloc() result). On the early-return path the
 * length is written to *dst_len only when dst_len is non-NULL.
 */
gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
- gsize *dst_len, gboolean allow_glob)
+ gsize *dst_len, enum rspamd_regexp_escape_flags flags)
{
const gchar *p, *end = pattern + slen; /* end is derived from the caller's buffer */
- gchar *res, *d, t;
+ gchar *res, *d, t, *tmp_utf = NULL; /* tmp_utf: repaired UTF-8 copy, set only when validation fails */
gsize len;
static const gchar hexdigests[16] = "0123456789abcdef";
if (g_ascii_isspace (t)) {
len ++;
}
- else if (!g_ascii_isprint (t)) {
- /* \\xHH -> 4 symbols */
- len += 3;
+ else {
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { /* \xHH growth is counted in non-UTF mode only */
+ if (!g_ascii_isprint (t)) {
+ /* \\xHH -> 4 symbols */
+ len += 3;
+ }
+ }
}
break;
}
}
+ if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+ if (!g_utf8_validate (pattern, slen, NULL)) { /* runs after the sizing pass above */
+ tmp_utf = g_utf8_make_valid (pattern, slen);
+ }
+ }
+
if (slen == len) { /* sizing pass added nothing: return a copy without escaping */
if (dst_len) {
+
+ if (tmp_utf) {
+ slen = strlen (tmp_utf); /* length is re-measured from the repaired copy */
+ }
+
*dst_len = slen;
}
- return g_strdup (pattern);
+
+
+ if (tmp_utf) {
+ return tmp_utf; /* the repaired copy itself is returned; it is not freed here */
+ }
+ else {
+ return g_strdup (pattern);
+ }
+ }
+
+ if (tmp_utf) {
+ pattern = tmp_utf; /* NOTE(review): `end` and `len` were computed from the original buffer, and the repaired copy may differ in length - confirm the code not shown here re-derives both before the write pass */
}
res = g_malloc (len + 1);
case '*':
case '?':
case '+':
- if (allow_glob) {
+ if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) {
/* Treat * as .* and ? as .? */
*d++ = '.';
}
if (g_ascii_isspace (t)) {
*d++ = '\\';
}
- else if (!g_ascii_isgraph (t)) {
+ else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) { /* matches the sizing pass: no \xHH output in UTF mode */
*d++ = '\\';
*d++ = 'x';
*d++ = hexdigests[((t >> 4) & 0xF)];
*dst_len = d - res;
}
+ if (tmp_utf) {
+ g_free (tmp_utf); /* on this path the repaired copy is released before returning res */
+ }
+
return res;
}
gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
gchar *start, guint *len);
+enum rspamd_regexp_escape_flags { /* bit flags passed as `flags` to rspamd_str_regexp_escape() */
+ RSPAMD_REGEXP_ESCAPE_ASCII = 0, /* no bits set: bytes outside printable ASCII are written as \xHH */
+ RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, /* input is checked with g_utf8_validate() and repaired if invalid; \xHH escaping is skipped */
+ RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, /* a '.' is emitted in the '*', '?', '+' case (glob-style wildcards) */
+};
/**
 * Escapes special characters when reading plain data to be processed in pcre
 * @param pattern pattern to process
 * @param slen length of `pattern` in bytes
 * @param dst_len if not NULL, receives the length of the returned string
 * @param flags combination of `enum rspamd_regexp_escape_flags` values
 * (UTF-8 handling and glob wildcards)
 * @return newly allocated escaped string; callers in this change release it
 * with g_free()
 */
gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
- gsize *dst_len, gboolean allow_glob);
+ gsize *dst_len, enum rspamd_regexp_escape_flags flags);
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */
}
if (string) {
- escaped = rspamd_str_regexp_escape (string, pat_len, NULL, TRUE);
+ escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+ RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
re = rspamd_regexp_new (escaped, flags_str, &err);
}
if (string) {
- escaped = rspamd_str_regexp_escape (string, pat_len, NULL, FALSE);
+ escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+ RSPAMD_REGEXP_ESCAPE_ASCII);
re = rspamd_regexp_new (escaped, flags_str, &err);