]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Deal with unnormalised Unicode obfuscation
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h
src/libutil/http.c
src/libutil/http.h
src/libutil/str_util.c
src/libutil/str_util.h

index c8917503d41147a6fe1b7c5b11e148bf52d79e84..b27e07fadf1c1702b2c3bb0d144a881ec3e78597 100644 (file)
@@ -22,6 +22,7 @@
 #include "html_colors.h"
 #include "url.h"
 #include <unicode/uversion.h>
+#include <unicode/ucnv.h>
 #if U_ICU_VERSION_MAJOR_NUM >= 46
 #include <unicode/uidna.h>
 #endif
@@ -1469,6 +1470,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
        *statep = state;
 }
 
+
+
 struct rspamd_url *
 rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
                struct html_tag_component *comp)
@@ -1554,9 +1557,15 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
        }
 
        *d = '\0';
+       dlen = d - decoded;
 
        url = rspamd_mempool_alloc0 (pool, sizeof (*url));
-       rc = rspamd_url_parse (url, decoded, d - decoded, pool);
+
+       if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
+               url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+       }
+
+       rc = rspamd_url_parse (url, decoded, dlen, pool);
 
        if (rc == URI_ERRNO_OK) {
                if (has_bad_chars) {
index 1665ff379413abab652218a8688b24b4a2bf0541..ef187f94cd0476ece88d07595b3e45f706314303 100644 (file)
@@ -1543,7 +1543,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        gchar *p, *comp;
        const gchar *end;
        guint i, complen, ret, flags = 0;
-       gsize unquoted_len = 0;
+       guint unquoted_len = 0;
 
        memset (uri, 0, sizeof (*uri));
        memset (&u, 0, sizeof (u));
@@ -1649,10 +1649,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                        uri->protocollen);
        rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
        unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+       if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+               uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+       }
        rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
        if (uri->datalen) {
                unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
                /* We now normalize path */
                rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
@@ -1662,12 +1668,18 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                unquoted_len = rspamd_url_decode (uri->query,
                                uri->query,
                                uri->querylen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                rspamd_url_shift (uri, unquoted_len, UF_QUERY);
        }
        if (uri->fragmentlen) {
                unquoted_len = rspamd_url_decode (uri->fragment,
                                uri->fragment,
                                uri->fragmentlen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
        }
 
index e6ccfc0f9938540cc284d1bfc7a4cee389d03e7c..a02d3c9d0867a9e149c1083d9728120ee9591b27 100644 (file)
@@ -26,6 +26,7 @@ enum rspamd_url_flags {
        RSPAMD_URL_FLAG_HAS_PORT = 1 << 13,
        RSPAMD_URL_FLAG_HAS_USER = 1 << 14,
        RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15,
+       RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16,
 };
 
 struct rspamd_url_tag {
index 5732f8b8e7ab4eed2a27a352f67124909e627377..c6b77ee1591cf99d6eeddaaaeaa162d18748eed8 100644 (file)
@@ -3252,12 +3252,14 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
                        http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
 
                        if (u.field_set & (1 << UF_PATH)) {
+                               guint unnorm_len;
                                lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
                                lookup.len = u.field_data[UF_PATH].len;
 
                                rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
                                                lookup.len,
-                                               &lookup.len);
+                                               &unnorm_len);
+                               lookup.len = unnorm_len;
                        }
                        else {
                                lookup.begin = msg->url->str;
@@ -3712,7 +3714,7 @@ rspamd_http_message_unref (struct rspamd_http_message *msg)
 
 
 void
-rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
 {
        const gchar *p, *end, *slash = NULL, *dot = NULL;
        gchar *o;
index 1c418ebb8c2d8e13d44b6d0c9c9f1e4c16b5b86f..4ce9e0a84763fd48655aedb5e206424708f75dfd 100644 (file)
@@ -570,6 +570,6 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
  * @param len
  * @param nlen
  */
-void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
 
 #endif /* HTTP_H_ */
index 8026ea7e5629e60c33cae5221c3b12f987f1d389..ab6be966a9f21dc416d898b276758c9ccbca060d 100644 (file)
 #include "cryptobox.h"
 #include "url.h"
 #include "str_util.h"
+#include "logger.h"
 #include "contrib/t1ha/t1ha.h"
+#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
 #include <math.h>
 
 const guchar lc_map[256] = {
@@ -1958,3 +1962,82 @@ rspamd_memrchr (const void *m, gint c, gsize len)
 
        return NULL;
 }
+
+/**
+ * Normalises a UTF-8 buffer to NFKC in place.
+ *
+ * The input is converted to UChars, checked against the NFKC normaliser and,
+ * if it is not already normalised, written back into `start` as UTF-8.
+ * The write-back only happens when the normalised text fits into the
+ * original `*len` bytes; otherwise a warning is logged and both `start` and
+ * `*len` are left untouched.
+ *
+ * @param pool memory pool used for logging purposes
+ * @param start buffer holding the UTF-8 text, rewritten on success
+ * @param len in: byte length of `start`; out: byte length after normalisation
+ * @return TRUE if the input was not in NFKC form (even when the rewrite
+ * itself failed), FALSE if it was already normalised or could not be checked
+ */
+gboolean
+rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
+               guint *len)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+       static UConverter *utf8_conv = NULL;
+       static const UNormalizer2 *norm = NULL;
+       gint32 nsym, end;
+       UChar *src = NULL, *dest = NULL;
+       gchar *utf8_out = NULL;
+       gboolean ret = FALSE;
+
+       /* ICU handles are created on the first call and reused afterwards */
+       if (utf8_conv == NULL) {
+               utf8_conv = ucnv_open ("UTF-8", &uc_err);
+               g_assert (U_SUCCESS (uc_err));
+               norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+               g_assert (U_SUCCESS (uc_err));
+       }
+
+       /* We first need to convert data to UChars :( */
+       src = g_malloc ((*len + 1) * sizeof (*src));
+       nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
+                       start, *len, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       /* We can now check if we need to decompose */
+       end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       if (end == nsym) {
+               /* No normalisation needed */
+               goto out;
+       }
+
+       /*
+        * We copy sub(src, 0, end) to dest and normalise the rest.
+        * NFKC may produce more UChars than the input had (e.g. U+2474 is one
+        * UChar but becomes "(1)"), so dest is sized by the byte length of the
+        * input instead of its UChar count: every UChar needs at least one
+        * UTF-8 byte, hence a longer result could not be written back anyway.
+        */
+       ret = TRUE;
+       dest = g_malloc ((*len + 1) * sizeof (*dest));
+       memcpy (dest, src, end * sizeof (*dest));
+       nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, *len + 1,
+                       src + end, nsym - end, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       /*
+        * We now convert it back to utf, going through a scratch buffer: when
+        * the output does not fit, the converter may already have filled the
+        * destination up to its capacity, and that must not happen to the
+        * caller's string.
+        */
+       utf8_out = g_malloc (*len);
+       nsym = ucnv_fromUChars (utf8_conv, utf8_out, *len, dest, nsym, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       memcpy (start, utf8_out, nsym);
+
+       if ((guint)nsym < *len) {
+               /* Keep the result NUL terminated when there is room for it */
+               start[nsym] = '\0';
+       }
+
+       *len = nsym;
+       out:
+
+       /* g_free accepts NULL, so no guards are needed */
+       g_free (src);
+       g_free (dest);
+       g_free (utf8_out);
+
+       return ret;
+}
index ab97555acb9d478c4fa18461bfc2267be90824fe..68ec5f0bda93e1bd48e48182684fe1fadb0e16ac 100644 (file)
@@ -361,4 +361,14 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
        return FALSE;
 }
 
+/**
+ * Gets a string in UTF8 and normalises it to NFKC form in place
+ * @param pool optional memory pool used for logging purposes
+ * @param start UTF8 string to normalise; rewritten when normalisation is applied
+ * @param len in: length of the string in bytes; out: length after normalisation
+ * @return TRUE if the string was not already in normalised form
+ */
+gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+               gchar *start, guint *len);
+
 #endif /* SRC_LIBUTIL_STR_UTIL_H_ */