git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Add UText wrapper for stripped content
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 6 Sep 2018 15:24:28 +0000 (16:24 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 6 Sep 2018 18:50:18 +0000 (19:50 +0100)
src/libmime/message.c
src/libmime/message.h

index 388ab0aa3750afaaed115a180717ae018ad3b9db..e59d34b259b6b9228f96c93112b97fd5febbc6bf 100644 (file)
@@ -495,11 +495,11 @@ static void
 rspamd_normalize_text_part (struct rspamd_task *task,
                struct rspamd_mime_text_part *part)
 {
-
        const gchar *p, *end;
        guint i;
        goffset off;
        struct rspamd_process_exception *ex;
+       UErrorCode uc_err = U_ZERO_ERROR;
 
        part->newlines = g_ptr_array_sized_new (128);
 
@@ -526,6 +526,18 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                }
        }
 
+       if (IS_PART_UTF (part)) {
+               utext_openUTF8 (&part->utf_stripped_text,
+                               part->utf_stripped_content->data,
+                               part->utf_stripped_content->len,
+                               &uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       msg_warn_task ("cannot open text from utf content");
+                       /* Probably, should be an assertion */
+               }
+       }
+
        rspamd_mempool_add_destructor (task->task_pool,
                        (rspamd_mempool_destruct_t) free_byte_array_callback,
                        part->utf_stripped_content);
@@ -833,6 +845,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
        text_part->raw.len = mime_part->raw_data.len;
        text_part->parsed.begin = mime_part->parsed_data.begin;
        text_part->parsed.len = mime_part->parsed_data.len;
+       text_part->utf_stripped_text = (UText)UTEXT_INITIALIZER;
 
        if (found_html) {
                if (!rspamd_message_process_html_text_part (task, text_part)) {
index e4b5a3d4b78d47c1e8f20539f78da05858553f9b..f4dbdaa72de1801c6f6b2ddf8e7f6eb83b3caac7 100644 (file)
@@ -14,6 +14,7 @@
 #include "content_type.h"
 
 #include <unicode/uchar.h>
+#include <unicode/utext.h>
 
 struct rspamd_task;
 struct controller_session;
@@ -97,6 +98,7 @@ struct rspamd_mime_text_part {
        GByteArray *utf_stripped_content; /* utf content with no newlines */
        GArray *normalized_hashes;
        GArray *utf_words;
+       UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
 
        /* Unicode content, used by libicu */
        GArray *unicode_raw_content; /* unicode raw content (of UChar) */