[Rework] Use C++ version for unicode normalisation

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)
diff --git a/src/libserver/html.c b/src/libserver/html.c

index 8d7b722a5695bccc553bd9eb7d29c6fafbb42d3e..cfdd0acef8322a025e2c66a604d2767069964cfc 100644 (file)
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -2667,7 +2667,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
                 }
         }
  
-       rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen);
+       rspamd_normalise_unicode_inplace (url->visible_part, &dlen);
  }
  
  static gboolean
diff --git a/src/libserver/url.h b/src/libserver/url.h

index 72fce5f9ea9c44ee2783fa922de7d3373532d9a2..4ace18f1aa34075e1340d2b07ecab3911812a92f 100644 (file)
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -6,6 +6,7 @@
  #include "mem_pool.h"
  #include "khash.h"
  #include "fstring.h"
+#include "libutil/cxx/utf8_util.h"
  
  #ifdef  __cplusplus
  extern "C" {
@@ -356,7 +357,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2);
  #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
    do {                                                                            \
       enum rspamd_normalise_result norm_res;                                       \
-     norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out));     \
+     norm_res = rspamd_normalise_unicode_inplace((input), (len_out));     \
       if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {                               \
         url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED;                             \
       }                                                                            \
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx

index f44d02671a58b7249994b0360c1dfe2ad7f60937..6bca4b18ed90bd8c7b5ecb660217e28f95247a19 100644 (file)
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -18,6 +18,8 @@
  #include <unicode/utypes.h>
  #include <unicode/utf8.h>
  #include <unicode/uchar.h>
+#include <unicode/normalizer2.h>
+#include <unicode/schriter.h>
  #include <utility>
  #include <string>
  
@@ -98,3 +100,101 @@ TEST_SUITE("utf8 utils") {
  }
  
  
+
+enum rspamd_normalise_result
+rspamd_normalise_unicode_inplace(char *start, size_t *len)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+       const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
+       static icu::UnicodeSet zw_spaces{};
+
+       if (!zw_spaces.isFrozen()) {
+               /* Add zw spaces to the set */
+               zw_spaces.add(0x200B);
+               zw_spaces.add(0x200C);
+               zw_spaces.add(0x200D);
+               zw_spaces.add(0xFEF);
+               zw_spaces.add(0x00AD);
+               zw_spaces.freeze();
+       }
+
+       int ret = RSPAMD_UNICODE_NORM_NORMAL;
+
+       g_assert (U_SUCCESS (uc_err));
+
+       auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
+       auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               return RSPAMD_UNICODE_NORM_ERROR;
+       }
+
+       /* Filter zero width spaces and push resulting string back */
+       const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
+               icu::StringCharacterIterator it{input};
+               size_t i = 0;
+
+               while(it.hasNext()) {
+                       auto uc = it.next32PostInc();
+
+                       if (zw_spaces.contains(uc)) {
+                               ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+                       }
+                       else {
+                               UBool err = 0;
+                               U8_APPEND(start, i, *len, uc, err);
+
+                               if (err) {
+                                       ret = RSPAMD_UNICODE_NORM_ERROR;
+
+                                       return i;
+                               }
+                       }
+               }
+
+               return i;
+       };
+
+       if (is_normal != UNORM_YES) {
+               /* Need to normalise */
+               ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+
+               auto normalised = nfkc_norm->normalize(uc_string, uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       return RSPAMD_UNICODE_NORM_ERROR;
+               }
+
+               *len = filter_zw_spaces_and_push_back(normalised);
+       }
+       else {
+               *len = filter_zw_spaces_and_push_back(uc_string);
+       }
+
+       return static_cast<enum rspamd_normalise_result>(ret);
+}
+
+TEST_SUITE("utf8 utils") {
+       TEST_CASE("utf8 normalise") {
+               std::tuple<const char *, const char *, int> cases[] = { /* input, expected output, expected flags */
+                               {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+                               {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+                               /* Zero width spaces */
+                               {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+                               /* Special case of diacritic */
+                               {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+                               /* Same with zw spaces */
+                               {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+                                                               RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+               };
+               /* Normalise a mutable copy of each input in place, then compare both text and flags */
+               for (const auto &c : cases) {
+                       std::string cpy{std::get<0>(c)};
+                       auto ns = cpy.size();
+                       auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+                       cpy.resize(ns);
+                       CHECK(cpy == std::string(std::get<1>(c)));
+                       CHECK(res == std::get<2>(c));
+               }
+       }
+}
+\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h

index 40bb53bf0e539efb8a5ccbb3bd9a92666d57d17b..21add9baedfe7085c9d720b42288ffcf29274149 100644 (file)
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -34,6 +34,23 @@ extern "C" {
   */
  char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
  
+enum rspamd_normalise_result {
+       RSPAMD_UNICODE_NORM_NORMAL = 0, /* no flags set: nothing was detected */
+       RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* input was not in normal form */
+       RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* zero width characters were found */
+       RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* normalisation failed */
+       RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* NOTE(review): presumably "output did not fit"; confirm it is still set */
+};
+
+/**
+ * Gets a string in UTF8 and normalises it to NFKC form in place,
+ * removing zero width characters from the result
+ * @param start UTF8 input buffer, overwritten with the normalised output
+ * @param len input length in bytes, updated to the output length
+ * @return bitwise OR of `rspamd_normalise_result` flags
+ */
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
+
  #ifdef  __cplusplus
  }
  #endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c

index 00774d5886d46cf4f4aa2b4f732b52bec2a42f02..1e92c8e54f505517dd8984c5bd61bc608f407ded 100644 (file)
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -3020,135 +3020,6 @@ rspamd_get_unicode_normalizer (void)
  #endif
  }
  
-
-enum rspamd_normalise_result
-rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
-               gsize *len)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-       UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *utf8_conv = rspamd_get_utf8_converter ();
-       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-       gint32 nsym, end;
-       UChar *src = NULL, *dest = NULL;
-       enum rspamd_normalise_result ret = 0;
-       gboolean has_invisible = FALSE;
-
-       /* We first need to convert data to UChars :( */
-       src = g_malloc ((*len + 1) * sizeof (*src));
-       nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
-                       start, *len, &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
-                               u_errorName (uc_err));
-               ret |= RSPAMD_UNICODE_NORM_ERROR;
-               goto out;
-       }
-
-       /* We can now check if we need to decompose */
-       end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
-                               u_errorName (uc_err));
-               ret |= RSPAMD_UNICODE_NORM_ERROR;
-               goto out;
-       }
-
-       for (gint32 i = 0; i < nsym; i ++) {
-               if (IS_ZERO_WIDTH_SPACE (src[i])) {
-                       has_invisible = TRUE;
-                       break;
-               }
-       }
-
-       uc_err = U_ZERO_ERROR;
-
-       if (end != nsym) {
-               /* No normalisation needed, but we may still have invisible spaces */
-               /* We copy sub(src, 0, end) to dest and normalise the rest */
-               ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
-               dest = g_malloc (nsym * sizeof (*dest));
-               memcpy (dest, src, end * sizeof (*dest));
-               nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
-                               src + end, nsym - end, &uc_err);
-
-               if (!U_SUCCESS (uc_err)) {
-                       if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-                               msg_warn_pool_check ("cannot normalise URL: %s",
-                                               u_errorName (uc_err));
-                               ret |= RSPAMD_UNICODE_NORM_ERROR;
-                       }
-
-                       goto out;
-               }
-       }
-       else if (!has_invisible) {
-               goto out;
-       }
-       else {
-               dest = src;
-               src = NULL;
-       }
-
-       if (has_invisible) {
-               /* Also filter zero width spaces */
-               gint32 new_len = 0;
-               UChar *t = dest, *h = dest;
-
-               ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
-
-               for (gint32 i = 0; i < nsym; i ++) {
-                       if (!IS_ZERO_WIDTH_SPACE (*h)) {
-                               *t++ = *h++;
-                               new_len ++;
-                       }
-                       else {
-                               h ++;
-                       }
-               }
-
-               nsym = new_len;
-       }
-
-       /* We now convert it back to utf */
-       nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
-                                          " input length: %d chars, unicode length: %d utf16 symbols",
-                               u_errorName (uc_err), (gint)*len, (gint)nsym);
-
-               if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
-                       ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
-               }
-               else {
-                       ret |= RSPAMD_UNICODE_NORM_ERROR;
-               }
-
-               goto out;
-       }
-
-       *len = nsym;
-
-out:
-
-       if (src) {
-               g_free (src);
-       }
-
-       if (dest) {
-               g_free (dest);
-       }
-
-       return ret;
-#else
-       /* Kill that with fire please */
-       return FALSE;
-#endif
-}
-
  gchar *
  rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                 gsize *dst_len, enum rspamd_regexp_escape_flags flags)
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h

index 427d6b94ee11fc206f3d4a50c7f15ed59a86bcf1..cfa37848fa994426c347ea645cf699eee17b4e70 100644 (file)
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -475,23 +475,7 @@ struct UNormalizer2;
  
  const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
  
-enum rspamd_normalise_result {
-       RSPAMD_UNICODE_NORM_NORMAL = 0,
-       RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
-       RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
-       RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
-       RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
-};
  
-/**
- * Gets a string in UTF8 and normalises it to NFKC_Casefold form
- * @param pool optional memory pool used for logging purposes
- * @param start
- * @param len
- * @return TRUE if a string has been normalised
- */
-enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
-                                                                                                                          gchar *start, gsize *len);
  
  enum rspamd_regexp_escape_flags {
         RSPAMD_REGEXP_ESCAPE_ASCII = 0,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 17 May 2021 15:34:35 +0000 (16:34 +0100)
src/libserver/html.c		patch \| blob \| blame \| history
src/libserver/url.h		patch \| blob \| blame \| history
src/libutil/cxx/utf8_util.cxx		patch \| blob \| blame \| history
src/libutil/cxx/utf8_util.h		patch \| blob \| blame \| history
src/libutil/str_util.c		patch \| blob \| blame \| history
src/libutil/str_util.h		patch \| blob \| blame \| history