[Feature] Deal with unnormalised Unicode obfuscation

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
diff --git a/src/libserver/html.c b/src/libserver/html.c

index c8917503d41147a6fe1b7c5b11e148bf52d79e84..b27e07fadf1c1702b2c3bb0d144a881ec3e78597 100644 (file)
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -22,6 +22,7 @@
  #include "html_colors.h"
  #include "url.h"
  #include <unicode/uversion.h>
+#include <unicode/ucnv.h>
  #if U_ICU_VERSION_MAJOR_NUM >= 46
  #include <unicode/uidna.h>
  #endif
@@ -1469,6 +1470,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
         *statep = state;
  }
  
+
+
  struct rspamd_url *
  rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
                 struct html_tag_component *comp)
@@ -1554,9 +1557,15 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
         }
  
         *d = '\0';
+       dlen = d - decoded;
  
         url = rspamd_mempool_alloc0 (pool, sizeof (*url));
-       rc = rspamd_url_parse (url, decoded, d - decoded, pool);
+
+       if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
+               url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+       }
+
+       rc = rspamd_url_parse (url, decoded, dlen, pool);
  
         if (rc == URI_ERRNO_OK) {
                 if (has_bad_chars) {
diff --git a/src/libserver/url.c b/src/libserver/url.c

index 1665ff379413abab652218a8688b24b4a2bf0541..ef187f94cd0476ece88d07595b3e45f706314303 100644 (file)
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1543,7 +1543,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
         gchar *p, *comp;
         const gchar *end;
         guint i, complen, ret, flags = 0;
-       gsize unquoted_len = 0;
+       guint unquoted_len = 0;
  
         memset (uri, 0, sizeof (*uri));
         memset (&u, 0, sizeof (u));
@@ -1649,10 +1649,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                         uri->protocollen);
         rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
         unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+       if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+               uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+       }
         rspamd_url_shift (uri, unquoted_len, UF_HOST);
  
         if (uri->datalen) {
                 unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                 rspamd_url_shift (uri, unquoted_len, UF_PATH);
                 /* We now normalize path */
                 rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
@@ -1662,12 +1668,18 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                 unquoted_len = rspamd_url_decode (uri->query,
                                 uri->query,
                                 uri->querylen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                 rspamd_url_shift (uri, unquoted_len, UF_QUERY);
         }
         if (uri->fragmentlen) {
                 unquoted_len = rspamd_url_decode (uri->fragment,
                                 uri->fragment,
                                 uri->fragmentlen);
+               if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+               }
                 rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
         }
  
diff --git a/src/libserver/url.h b/src/libserver/url.h

index e6ccfc0f9938540cc284d1bfc7a4cee389d03e7c..a02d3c9d0867a9e149c1083d9728120ee9591b27 100644 (file)
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -26,6 +26,7 @@ enum rspamd_url_flags {
         RSPAMD_URL_FLAG_HAS_PORT = 1 << 13,
         RSPAMD_URL_FLAG_HAS_USER = 1 << 14,
         RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15,
+       RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16,
  };
  
  struct rspamd_url_tag {
diff --git a/src/libutil/http.c b/src/libutil/http.c

index 5732f8b8e7ab4eed2a27a352f67124909e627377..c6b77ee1591cf99d6eeddaaaeaa162d18748eed8 100644 (file)
--- a/src/libutil/http.c
+++ b/src/libutil/http.c
@@ -3252,12 +3252,14 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
                         http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
  
                         if (u.field_set & (1 << UF_PATH)) {
+                               guint unnorm_len;
                                 lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
                                 lookup.len = u.field_data[UF_PATH].len;
  
                                 rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
                                                 lookup.len,
-                                               &lookup.len);
+                                               &unnorm_len);
+                               lookup.len = unnorm_len;
                         }
                         else {
                                 lookup.begin = msg->url->str;
@@ -3712,7 +3714,7 @@ rspamd_http_message_unref (struct rspamd_http_message *msg)
  
  
  void
-rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
  {
         const gchar *p, *end, *slash = NULL, *dot = NULL;
         gchar *o;
diff --git a/src/libutil/http.h b/src/libutil/http.h

index 1c418ebb8c2d8e13d44b6d0c9c9f1e4c16b5b86f..4ce9e0a84763fd48655aedb5e206424708f75dfd 100644 (file)
--- a/src/libutil/http.h
+++ b/src/libutil/http.h
@@ -570,6 +570,6 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
   * @param len
   * @param nlen
   */
-void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
  
  #endif /* HTTP_H_ */
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c

index 8026ea7e5629e60c33cae5221c3b12f987f1d389..ab6be966a9f21dc416d898b276758c9ccbca060d 100644 (file)
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -18,7 +18,11 @@
  #include "cryptobox.h"
  #include "url.h"
  #include "str_util.h"
+#include "logger.h"
  #include "contrib/t1ha/t1ha.h"
+#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
  #include <math.h>
  
  const guchar lc_map[256] = {
@@ -1958,3 +1962,82 @@ rspamd_memrchr (const void *m, gint c, gsize len)
  
         return NULL;
  }
+
+/**
+ * Normalises a UTF-8 buffer to Unicode NFKC form in place.
+ *
+ * The text is converted to UTF-16, quick-checked and, if it is not already
+ * normalised, rewritten. The result is written back only when it fits into
+ * the original *len bytes; otherwise `start` and `*len` are left untouched.
+ *
+ * @param pool optional memory pool used for logging purposes
+ * @param start UTF-8 data to normalise (modified in place)
+ * @param len in: length of `start` in bytes; out: length after normalisation
+ * @return TRUE if the input was not in normalised form (even when it could
+ * not be rewritten), FALSE if it was already normalised or could not be
+ * checked
+ */
+gboolean
+rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
+               guint *len)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+       static UConverter *utf8_conv = NULL;
+       static const UNormalizer2 *norm = NULL;
+       gint32 nsym, end, srclen, dest_cap;
+       UChar *src = NULL, *dest = NULL;
+       gchar *utf8 = NULL;
+       gboolean ret = FALSE;
+
+       if (*len >= (guint)G_MAXINT32) {
+               /* ICU works with signed 32 bit lengths, refuse anything larger */
+               return FALSE;
+       }
+
+       srclen = (gint32)*len;
+
+       if (utf8_conv == NULL) {
+               /* Create the shared converter and normaliser on first use */
+               utf8_conv = ucnv_open ("UTF-8", &uc_err);
+               g_assert (U_SUCCESS (uc_err));
+               norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+               g_assert (U_SUCCESS (uc_err));
+       }
+
+       /* We first need to convert data to UChars :( */
+       src = g_malloc ((srclen + 1) * sizeof (*src));
+       nsym = ucnv_toUChars (utf8_conv, src, srclen + 1,
+                       start, srclen, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       /* We can now check if we need to decompose */
+       end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       if (end == nsym) {
+               /* No normalisation needed */
+               goto out;
+       }
+
+       /* From here on the input is known to be unnormalised */
+       ret = TRUE;
+
+       /*
+        * NFKC may yield more UChars than the input had (e.g. a ligature that
+        * expands to several letters), so nsym alone is not a safe capacity.
+        * Every UChar takes at least one UTF-8 byte, hence any result that can
+        * still be stored in place has at most *len UChars.
+        */
+       dest_cap = MAX (srclen, nsym);
+       dest = g_malloc (dest_cap * sizeof (*dest));
+       /* We copy sub(src, 0, end) to dest and normalise the rest */
+       memcpy (dest, src, end * sizeof (*dest));
+       nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, dest_cap,
+                       src + end, nsym - end, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       /*
+        * We now convert it back to utf. A scratch buffer is used because
+        * converting straight into `start` would leave a truncated result
+        * there whenever the normalised text needs more than *len bytes.
+        */
+       utf8 = g_malloc (*len);
+       nsym = ucnv_fromUChars (utf8_conv, utf8, srclen, dest, nsym, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
+                               u_errorName (uc_err));
+               goto out;
+       }
+
+       memcpy (start, utf8, nsym);
+
+       if (nsym < srclen) {
+               /* Keep the terminator that a direct conversion would have written */
+               start[nsym] = '\0';
+       }
+
+       *len = nsym;
+
+out:
+       /* g_free accepts NULL, so no guards are needed */
+       g_free (src);
+       g_free (dest);
+       g_free (utf8);
+
+       return ret;
+}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h

index ab97555acb9d478c4fa18461bfc2267be90824fe..68ec5f0bda93e1bd48e48182684fe1fadb0e16ac 100644 (file)
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -361,4 +361,14 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
         return FALSE;
  }
  
+/**
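+ * Gets a string in UTF8 and normalises it in place to NFKC form
+ * @param pool optional memory pool used for logging purposes
+ * @param start UTF8 string to normalise
+ * @param len length of the string in bytes, updated on successful normalisation
+ * @return TRUE if the string was not already in normalised form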
+ */
+gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+               gchar *start, guint *len);
+
  #endif /* SRC_LIBUTIL_STR_UTIL_H_ */
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 26 Mar 2018 12:04:50 +0000 (13:04 +0100)
src/libserver/html.c		patch \| blob \| blame \| history
src/libserver/url.c		patch \| blob \| blame \| history
src/libserver/url.h		patch \| blob \| blame \| history
src/libutil/http.c		patch \| blob \| blame \| history
src/libutil/http.h		patch \| blob \| blame \| history
src/libutil/str_util.c		patch \| blob \| blame \| history
src/libutil/str_util.h		patch \| blob \| blame \| history