git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Use new HTML API in message.c
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 08:36:06 +0000 (09:36 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 08:36:06 +0000 (09:36 +0100)
src/libmime/message.c
src/libserver/html.c
src/libserver/html.h

index 70885a36d7979e8e858dba2e5e3f0a2b32d59fb2..f48151d059fa149761cb2cd11e99e80ea52d47fe 100644 (file)
@@ -53,261 +53,6 @@ rspamd_message_quark (void)
        return g_quark_from_static_string ("mime-error");
 }
 
-GByteArray *
-strip_html_tags (struct rspamd_task *task,
-       rspamd_mempool_t * pool,
-       struct mime_text_part *part,
-       GByteArray * src,
-       gint *stateptr)
-{
-       uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart = NULL;
-       gint br, i = 0, depth = 0, in_q = 0;
-       gint state = 0;
-       guint dlen;
-       GByteArray *buf;
-       GNode *level_ptr = NULL;
-       gboolean erase = FALSE, html_decode = FALSE;
-
-       if (stateptr)
-               state = *stateptr;
-
-       buf = g_byte_array_sized_new (src->len);
-       g_byte_array_append (buf, src->data, src->len);
-
-       c = *src->data;
-       lc = '\0';
-       p = src->data;
-       rp = buf->data;
-       end = src->data + src->len;
-       br = 0;
-
-       while (i < (gint)src->len) {
-               switch (c) {
-               case '\0':
-                       break;
-               case '<':
-                       if (g_ascii_isspace (*(p + 1))) {
-                               goto reg_char;
-                       }
-                       if (state == 0) {
-                               lc = '<';
-                               tbegin = p + 1;
-                               state = 1;
-                       }
-                       else if (state == 1) {
-                               /* Opening bracket without closing one */
-                               p--;
-                               while (g_ascii_isspace (*p) && p > src->data) {
-                                       p--;
-                               }
-                               p++;
-                               goto unbreak_tag;
-                       }
-                       break;
-
-               case '(':
-                       if (state == 2) {
-                               if (lc != '"' && lc != '\'') {
-                                       lc = '(';
-                                       br++;
-                               }
-                       }
-                       else if (state == 0 && !erase) {
-                               *(rp++) = c;
-                       }
-                       break;
-
-               case ')':
-                       if (state == 2) {
-                               if (lc != '"' && lc != '\'') {
-                                       lc = ')';
-                                       br--;
-                               }
-                       }
-                       else if (state == 0 && !erase) {
-                               *(rp++) = c;
-                       }
-                       break;
-
-               case '>':
-                       if (depth) {
-                               depth--;
-                               break;
-                       }
-
-                       if (in_q) {
-                               break;
-                       }
-unbreak_tag:
-                       switch (state) {
-                       case 1:         /* HTML/XML */
-                               lc = '>';
-                               in_q = state = 0;
-                               erase = !add_html_node (task,
-                                               pool,
-                                               part,
-                                               tbegin,
-                                               p - tbegin,
-                                               end - tbegin,
-                                               &level_ptr);
-                               break;
-
-                       case 2:         /* PHP */
-                               if (!br && lc != '\"' && *(p - 1) == '?') {
-                                       in_q = state = 0;
-                               }
-                               break;
-
-                       case 3:
-                               in_q = state = 0;
-                               break;
-
-                       case 4:         /* JavaScript/CSS/etc... */
-                               if (p >= src->data + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
-                                       in_q = state = 0;
-                               }
-                               break;
-
-                       default:
-                               if (!erase) {
-                                       *(rp++) = c;
-                               }
-                               break;
-                       }
-                       break;
-
-               case '"':
-               case '\'':
-                       if (state == 2 && *(p - 1) != '\\') {
-                               if (lc == c) {
-                                       lc = '\0';
-                               }
-                               else if (lc != '\\') {
-                                       lc = c;
-                               }
-                       }
-                       else if (state == 0 && !erase) {
-                               *(rp++) = c;
-                       }
-                       if (state && p != src->data && *(p - 1) != '\\' &&
-                               (!in_q || *p == in_q)) {
-                               if (in_q) {
-                                       in_q = 0;
-                               }
-                               else {
-                                       in_q = *p;
-                               }
-                       }
-                       break;
-
-               case '!':
-                       /* JavaScript & Other HTML scripting languages */
-                       if (state == 1 && *(p - 1) == '<') {
-                               state = 3;
-                               lc = c;
-                       }
-                       else {
-                               if (state == 0 && !erase) {
-                                       *(rp++) = c;
-                               }
-                       }
-                       break;
-
-               case '-':
-                       if (state == 3 && p >= src->data + 2 && *(p - 1) == '-' &&
-                               *(p - 2) == '!') {
-                               state = 4;
-                       }
-                       else {
-                               goto reg_char;
-                       }
-                       break;
-
-               case '&':
-                       /* Decode entitle */
-                       html_decode = TRUE;
-                       estart = rp;
-                       goto reg_char;
-                       break;
-
-               case ';':
-                       if (html_decode) {
-                               html_decode = FALSE;
-                               *rp = ';';
-                               if (rp - estart > 0) {
-                                       dlen = rp - estart + 1;
-                                       rspamd_html_decode_entitles_inplace (estart, &dlen);
-                                       rp = estart + dlen;
-                               }
-                       }
-                       break;
-
-               case '?':
-
-                       if (state == 1 && *(p - 1) == '<') {
-                               br = 0;
-                               state = 2;
-                               break;
-                       }
-               case 'E':
-               case 'e':
-                       /* !DOCTYPE exception */
-                       if (state == 3 && p > src->data + 6
-                               && g_ascii_tolower (*(p - 1)) == 'p'
-                               && g_ascii_tolower (*(p - 2)) == 'y'
-                               && g_ascii_tolower (*(p - 3)) == 't' &&
-                               g_ascii_tolower (*(p - 4)) == 'c' &&
-                               g_ascii_tolower (*(p - 5)) == 'o' &&
-                               g_ascii_tolower (*(p - 6)) == 'd') {
-                               state = 1;
-                               break;
-                       }
-               /* fall-through */
-               case 'l':
-
-                       /* swm: If we encounter '<?xml' then we shouldn't be in
-                        * state == 2 (PHP). Switch back to HTML.
-                        */
-
-                       if (state == 2 && p > src->data + 2 && *(p - 1) == 'm' &&
-                               *(p - 2) == 'x') {
-                               state = 1;
-                               break;
-                       }
-
-               /* fall-through */
-               default:
-reg_char:
-                       if (state == 0 && !erase) {
-                               *(rp++) = c;
-                       }
-                       break;
-               }
-               i++;
-               if (i < (gint)src->len) {
-                       c = *(++p);
-               }
-       }
-       if (rp < buf->data + src->len) {
-               *rp = '\0';
-               g_byte_array_set_size (buf, rp - buf->data);
-       }
-
-       /* Check tag balancing */
-       if (level_ptr && level_ptr->data != NULL) {
-               part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
-       }
-       else {
-               part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
-       }
-
-       if (stateptr) {
-               *stateptr = state;
-       }
-
-       return buf;
-}
-
 static void
 parse_qmail_recv (rspamd_mempool_t * pool,
        gchar *line,
@@ -1386,21 +1131,17 @@ process_text_part (struct rspamd_task *task,
                                text_part->orig,
                                type,
                                text_part);
-               text_part->html_nodes = NULL;
+               text_part->html = rspamd_mempool_alloc (task->task_pool,
+                               sizeof (*text_part->html));
                text_part->parent = parent;
                text_part->mime_part = mime_part;
 
                text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
-               text_part->content = strip_html_tags (task,
+               text_part->content = rspamd_html_process_part (
                                task->task_pool,
-                               text_part,
-                               part_content,
-                               NULL);
+                               text_part->html,
+                               part_content);
 
-               if (text_part->html_nodes != NULL) {
-                       rspamd_html_decode_entitles_inplace (text_part->content->data,
-                               &text_part->content->len);
-               }
                rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
 
                rspamd_mempool_add_destructor (task->task_pool,
index 6af61a1996df5a198586fbc33fedc715d5cd4b77..5bf042153acf7fdf7eb1bfeb61a70a810c3f9cf0 100644 (file)
@@ -1365,7 +1365,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
        *statep = state;
 }
 
-gboolean
+GByteArray*
 rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                GByteArray *in)
 {
@@ -1635,5 +1635,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                }
        }
 
-       return TRUE;
+       return dest;
 }
index 0f1a95dc78f128a76235432804293bdde3b6c5b8..3095813c2601fd6f4d176b6acd234c664f51ae78 100644 (file)
@@ -63,7 +63,7 @@ struct html_tag * get_tag_by_name (const gchar *name);
  */
 guint rspamd_html_decode_entitles_inplace (gchar *s, guint len);
 
-gboolean rspamd_html_process_part (rspamd_mempool_t *pool,
+GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool,
                struct html_content *hc,
                GByteArray *in);