]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Core: Detect charset in archived files
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 28 Dec 2018 07:53:42 +0000 (07:53 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 28 Dec 2018 07:53:42 +0000 (07:53 +0000)
src/libmime/archives.c
src/libmime/mime_encoding.c
src/libmime/mime_encoding.h

index 183232e6f6d957e1750467d03e8d384ffc3a6950..8497fdf70791606e737e8affe6d1a061d95fdc45 100644 (file)
 #include "message.h"
 #include "task.h"
 #include "archives.h"
+#include "libmime/mime_encoding.h"
 #include <unicode/uchar.h>
 #include <unicode/utf8.h>
 #include <unicode/utf16.h>
+#include <unicode/ucnv.h>
+
 
 static void
 rspamd_archive_dtor (gpointer p)
@@ -42,6 +45,79 @@ rspamd_archive_dtor (gpointer p)
        g_ptr_array_free (arch->files, TRUE);
 }
 
+/*
+ * Builds a printable file name from the raw bytes stored in an archive.
+ * When a charset is detected in the input, the bytes are converted to
+ * UTF-16 and then to UTF-8; otherwise every byte that is not a graphic
+ * ASCII character is replaced with '?'.
+ * Returns a newly allocated GString owned by the caller, or NULL when the
+ * converter cannot be obtained or one of the conversion steps fails.
+ */
+static GString *
+rspamd_archive_file_try_utf (const gchar *in, gsize inlen)
+{
+       const gchar *charset = NULL, *p, *end;
+       GString *res;
+
+       charset = rspamd_mime_charset_find_by_content (in, inlen);
+
+       if (charset) {
+               UChar *tmp;
+               UErrorCode uc_err = U_ZERO_ERROR;
+               gint32 r, clen, dlen;
+               struct rspamd_charset_converter *conv;
+               UConverter *utf8_converter;
+
+               conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+               utf8_converter = rspamd_get_utf8_converter ();
+
+               if (conv == NULL) {
+                       msg_err ("cannot open converter for %s: %s",
+                                       charset, u_errorName (uc_err));
+
+                       return NULL;
+               }
+
+               /* Step 1: source charset -> UTF-16 scratch buffer */
+               tmp = g_malloc (sizeof (*tmp) * (inlen + 1));
+               r = rspamd_converter_to_uchars (conv, tmp, inlen + 1,
+                               in, inlen, &uc_err);
+               if (!U_SUCCESS (uc_err)) {
+                       msg_err ("cannot convert data to unicode from %s: %s",
+                                       charset, u_errorName (uc_err));
+                       g_free (tmp);
+
+                       return NULL;
+               }
+
+               /* Step 2: UTF-16 -> UTF-8, written directly into the GString buffer */
+               clen = ucnv_getMaxCharSize (utf8_converter);
+               dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+               res = g_string_sized_new (dlen);
+               r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       msg_err ("cannot convert data from unicode from %s: %s",
+                                       charset, u_errorName (uc_err));
+                       g_free (tmp);
+                       g_string_free (res, TRUE);
+
+                       return NULL;
+               }
+
+               /* The scratch buffer is no longer needed; it used to leak here */
+               g_free (tmp);
+               /* The string was filled behind GString's back, so fix up its length */
+               res->len = r;
+       }
+       else {
+               /* Convert unsafe characters to '?' */
+               res = g_string_sized_new (inlen);
+               p = in;
+               end = in + inlen;
+
+               while (p < end) {
+                       if (g_ascii_isgraph (*p)) {
+                               g_string_append_c (res, *p);
+                       }
+                       else {
+                               g_string_append_c (res, '?');
+                       }
+
+                       p ++;
+               }
+       }
+
+       return res;
+}
+
 static void
 rspamd_archive_process_zip (struct rspamd_task *task,
                struct rspamd_mime_part *part)
@@ -147,11 +223,17 @@ rspamd_archive_process_zip (struct rspamd_task *task,
                }
 
                f = g_malloc0 (sizeof (*f));
-               f->fname = g_string_new_len (cd + cd_basic_len, fname_len);
+               f->fname = rspamd_archive_file_try_utf (cd + cd_basic_len, fname_len);
                f->compressed_size = comp_size;
                f->uncompressed_size = uncomp_size;
-               g_ptr_array_add (arch->files, f);
-               msg_debug_task ("found file in zip archive: %v", f->fname);
+
+               if (f->fname) {
+                       g_ptr_array_add (arch->files, f);
+                       msg_debug_task ("found file in zip archive: %v", f->fname);
+               }
+               else {
+                       g_free (f);
+               }
 
                cd += fname_len + comment_len + extra_len + cd_basic_len;
        }
@@ -1227,7 +1309,10 @@ rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p,
 
        while (src_pos < len) {
                U16_NEXT (up, src_pos, len, wc);
-               U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+
+               if (wc > 0) {
+                       U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+               }
 
                if (is_error) {
                        g_string_free (res, TRUE);
@@ -1519,19 +1604,14 @@ rspamd_archive_process_gzip (struct rspamd_task *task,
                                        struct rspamd_archive_file *f;
 
                                        f = g_malloc0 (sizeof (*f));
-                                       f->fname = g_string_new (fname_start);
+                                       f->fname = rspamd_archive_file_try_utf (fname_start,
+                                                       p - fname_start);
 
                                        g_ptr_array_add (arch->files, f);
 
                                        goto set;
                                }
                        }
-                       else if (!g_ascii_isgraph (*p)) {
-                               msg_debug_task ("gzip archive is invalid, bad filename at pos %d",
-                                               (int)(p - start));
-
-                               return;
-                       }
 
                        p ++;
                }
index d7ac5d416fba36754c31a2302e7d4a12fc9866db..213817747beb45bbdecb35b1d558f209d182538b 100644 (file)
@@ -98,7 +98,7 @@ rspamd_converter_dtor (gpointer p)
        g_free (c);
 }
 
-static int32_t
+int32_t
 rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
                                                        UChar *dest,
                                                        int32_t destCapacity,
@@ -132,7 +132,7 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
 }
 
 
-static struct rspamd_charset_converter *
+struct rspamd_charset_converter *
 rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
 {
        const gchar *canon_name;
@@ -497,8 +497,8 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
        }
 }
 
-static const char *
-rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
+const char *
+rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
 {
        static UCharsetDetector *csd;
        const UCharsetMatch **csm, *sel = NULL;
@@ -524,7 +524,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
 detect:
 
        ucsdet_setText (csd, in, inlen, &uc_err);
-       csm = ucsdet_detectAll(csd, &matches, &uc_err);
+       csm = ucsdet_detectAll (csd, &matches, &uc_err);
 
        for (i = 0; i < matches; i ++) {
                if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
index 5f436d99dd7a9513a74fd1736821aeba6c47b8e2..1a61339ca22328567148f2ba7980e7cc661323d3 100644 (file)
 #include "config.h"
 #include "mem_pool.h"
 #include "fstring.h"
+#include <unicode/uchar.h>
 
 struct rspamd_task;
 struct rspamd_mime_part;
 struct rspamd_mime_text_part;
+struct rspamd_charset_converter;
 
 /**
  * Convert charset to a valid iconv charset
@@ -87,5 +89,41 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
 
+/**
+ * Gets cached converter for the specified charset
+ * @param enc name of the charset to get a converter for
+ * @param err ICU error code storage; callers print it with u_errorName ()
+ * when no converter is returned (NOTE(review): confirm it is always set on
+ * failure)
+ * @return converter for `enc`, or NULL on failure (callers check for NULL)
+ */
+struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
+               const gchar *enc,
+               UErrorCode *err);
+
+/**
+ * Performs charset->utf16 conversion
+ * @param cnv converter, as returned by rspamd_mime_get_converter_cached
+ * @param dest output buffer that receives the UChar (utf16) data
+ * @param destCapacity capacity of `dest`, counted in UChar elements
+ * @param src input bytes in the converter's charset
+ * @param srcLength length of `src` in bytes
+ * @param pErrorCode ICU error code; callers test it with U_SUCCESS ()
+ * @return presumably the number of UChar elements produced, since callers
+ * pass it on as the source length of the next conversion step
+ * (NOTE(review): confirm against the implementation)
+ */
+gint32
+rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
+                                                       UChar *dest,
+                                                       gint32 destCapacity,
+                                                       const char *src,
+                                                       gint32 srcLength,
+                                                       UErrorCode *pErrorCode);
+
+/**
+ * Detect charset in text
+ * @param in input bytes to analyse (not modified)
+ * @param inlen length of `in` in bytes
+ * @return detected charset name or NULL
+ */
+const char *rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen);
+
 
 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */