git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Exclude HTML urls content from texts
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 16:19:03 +0000 (17:19 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 16:19:03 +0000 (17:19 +0100)
src/libmime/message.c
src/libserver/html.c
src/libserver/html.h

index 6c1bad06c52b5499a643b37bb3a6d0c15bdedbe9..cdd532f5a32aef97560a80cfa97cf40730ef6b8a 100644 (file)
@@ -1137,10 +1137,12 @@ process_text_part (struct rspamd_task *task,
                text_part->mime_part = mime_part;
 
                text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
-               text_part->content = rspamd_html_process_part (
+               text_part->content = rspamd_html_process_part_full (
                                task->task_pool,
                                text_part->html,
-                               part_content);
+                               part_content,
+                               &text_part->urls_offset,
+                               task->urls);
 
                rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
 
index c23b228b6875e12b68104f5ed5e6589883a341af..f8220eabdf9b1d338ca7e1ce7592e32b84c84f8d 100644 (file)
@@ -1382,8 +1382,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
 }
 
 GByteArray*
-rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
-               GByteArray *in)
+rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
+               GByteArray *in, GList **exceptions, GHashTable *urls)
 {
        const guchar *p, *c, *end, *tag_start = NULL, *savep = NULL;
        guchar t;
@@ -1391,8 +1391,9 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
        GByteArray *dest;
        guint obrace = 0, ebrace = 0;
        GNode *cur_level = NULL;
-       gint substate, len;
+       gint substate, len, href_offset = -1;
        struct html_tag *cur_tag = NULL;
+       struct process_exception *ex;
        enum {
                parse_start = 0,
                tag_begin,
@@ -1696,6 +1697,25 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                                        g_byte_array_append (dest, "\r\n", 2);
                                        save_space = FALSE;
                                }
+
+                               if (cur_tag->id == Tag_A) {
+                                       if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
+                                               href_offset = dest->len;
+                                       }
+                                       else if (cur_tag->flags & FL_CLOSING) {
+                                               /* Insert exception */
+                                               if (exceptions && href_offset != -1
+                                                               && (gint)dest->len > href_offset) {
+                                                       ex = rspamd_mempool_alloc (pool, sizeof (*ex));
+                                                       ex->pos = href_offset;
+                                                       ex->len = dest->len - href_offset;
+
+                                                       *exceptions = g_list_prepend (*exceptions, ex);
+                                               }
+
+                                               href_offset = -1;
+                                       }
+                               }
                        }
                        else {
                                state = content_write;
@@ -1711,3 +1731,11 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
 
        return dest;
 }
+
+GByteArray*
+rspamd_html_process_part (rspamd_mempool_t *pool,
+               struct html_content *hc,
+               GByteArray *in)
+{
+       return rspamd_html_process_part_full (pool, hc, in, NULL, NULL);
+}
index 83c58c9f1563b50169b8871ba7f8eef28683a65d..1a98a3e9a886f954fa3e29dc694826ab99901378 100644 (file)
@@ -57,4 +57,8 @@ GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool,
                struct html_content *hc,
                GByteArray *in);
 
+GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
+               struct html_content *hc,
+               GByteArray *in, GList **exceptions, GHashTable *urls);
+
 #endif