]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add function to transliterate utf8 to ascii with some normalisation
authorVsevolod Stakhov <vsevolod@rspamd.com>
Thu, 20 Jul 2023 20:43:08 +0000 (21:43 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Thu, 20 Jul 2023 20:43:08 +0000 (21:43 +0100)
src/libutil/cxx/utf8_util.cxx
src/libutil/cxx/utf8_util.h

index 8d9fc31a9e80e763b298c1570a72caa2ebacf14f..4be7e9c585780fd212dd30fa7bb67ec0af3db31b 100644 (file)
@@ -21,6 +21,7 @@
 #include <unicode/normalizer2.h>
 #include <unicode/schriter.h>
 #include <unicode/coll.h>
+#include <unicode/translit.h>
 #include <utility>
 #include <tuple>
 #include <string>
@@ -159,6 +160,50 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
        return static_cast<enum rspamd_utf8_normalise_result>(ret);
 }
 
+gchar*
+rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+
+       static const icu::Transliterator *transliterator = nullptr;
+
+       if (transliterator == nullptr) {
+               UParseError parse_err;
+               static const auto rules = icu::UnicodeString{":: Any-Latin;"
+                                                                                                        ":: [:Nonspacing Mark:] Remove;"
+                                                                                                        ":: [:Punctuation:] Remove;"
+                                                                                                        ":: [:Symbol:] Remove;"
+                                                                                                        ":: [:Format:] Remove;"
+                                                                                                        ":: Latin-ASCII;"
+                                                                                                        ":: Lower();"
+                                                                                                        ":: NULL;"
+                                                                                                        "[:Space Separator:] > ' '"
+               };
+               transliterator = icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err);
+
+               if (U_FAILURE(uc_err) || transliterator == nullptr) {
+                       auto context = icu::UnicodeString(parse_err.postContext, sizeof(parse_err.preContext) / sizeof(UChar));
+                       g_error ("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
+                                       u_errorName(uc_err), parse_err.line, parse_err.offset);
+                       abort();
+               }
+       }
+
+       auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
+       transliterator->transliterate(uc_string);
+
+       // We assume that all characters are now ascii
+       auto dest_len = uc_string.length();
+       gchar *dest = (gchar *)g_malloc(dest_len + 1);
+       auto sink = icu::CheckedArrayByteSink(dest, dest_len);
+       uc_string.toUTF8(sink);
+
+       *target_len = sink.NumberOfBytesWritten();
+       dest[*target_len] = '\0';
+
+       return dest;
+}
+
 struct rspamd_icu_collate_storage {
        icu::Collator* collator = nullptr;
        rspamd_icu_collate_storage() {
@@ -310,4 +355,39 @@ TEST_CASE("utf8 strcmp") {
                }
        }
 }
+
+TEST_CASE("transliterate") {
+       using namespace std::literals;
+       std::tuple<std::string_view, const char *> cases[] = {
+               {"abc"sv, "abc"},
+               {""sv,  ""},
+               {"тест"sv,  "test"},
+               // Diacritic to ascii
+               {"Ύ"sv, "y"},
+               // Chinese to pinyin
+               {"你好"sv, "ni hao"},
+               // Japanese to romaji
+               {"こんにちは"sv, "konnichiha"},
+               // Devanagari to latin
+               {"नमस्ते"sv, "namaste"},
+               // Arabic to latin
+               {"مرحبا"sv, "mrhba"},
+               // Remove of punctuation
+               {"a.b.c"sv, "abc"},
+               // Lowercase
+               {"ABC"sv, "abc"},
+               // Remove zero-width spaces
+               {"\xE2\x80\x8B""abc\xE2\x80\x8B""def"sv, "abcdef"},
+       };
+
+       for (const auto &c : cases) {
+               auto [s1, s2] = c;
+               SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str()) {
+                       gsize tlen;
+                       auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
+                       CHECK(tlen == strlen(s2));
+                       CHECK(strcmp(s2, ret) == 0);
+               }
+       }
+}
 }
\ No newline at end of file
index da4ebdb2482112031dd86a9ea3aad5e0c8006ffb..7f28ea45e67dd87b17d7dffa048f589f5c0bb820 100644 (file)
@@ -51,6 +51,15 @@ enum rspamd_utf8_normalise_result {
  */
 enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
 
+/**
+ * Transliterate a string to ASCII: the input is converted to Latin script,
+ * marks, punctuation, symbols and format characters are removed, and the
+ * result is lowercased
+ * @param start pointer to the input bytes, interpreted as UTF-8
+ * @param len length of the input in bytes
+ * @param target_len output: set to the length in bytes of the returned string,
+ * not counting the terminating NUL
+ * @return a new NUL-terminated string that should be freed with g_free
+ */
+gchar* rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len);
+
 /**
  * Compare two strings using libicu collator
  * @param s1