Allow processing images urls for SURBL

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)
diff --git a/doc/markdown/modules/surbl.md b/doc/markdown/modules/surbl.md

index 30655b7945d138dcacb5e8bdae8b44e506e6e788..84f43b8c0555a9bd4b0dbbd5404db19015d55ae6 100644 (file)
--- a/doc/markdown/modules/surbl.md
+++ b/doc/markdown/modules/surbl.md
@@ -46,6 +46,8 @@ surbl {
      }
      rule {
          suffix = "uribl.rambler.ru";
+        # Also check images
+        images = true;
          symbol = "RAMBLER_URIBL";
      }
      rule {
@@ -77,6 +79,28 @@ In general, the configuration of `surbl` module is definition of DNS lists. Each
  list must have suffix that defines the list itself and optionally for some lists
  it is possible to specify either `bit` or `ips` sections.
  
+Since some URL lists do not accept IP addresses, it is also possible to disable sending URLs with an IP address in the host part to such lists. This can be done by specifying the `noip = true` option:
+
+~~~nginx
+    rule {
+        suffix = "dbl.spamhaus.org";
+        symbol = "DBL";
+        # Do not check numeric URL's
+        noip = true;
+    }
+~~~
+
+It is also possible to check HTML image URLs using URL blacklists. Just specify `images = true` for such a list and you are done:
+
+~~~nginx
+    rule {
+        suffix = "uribl.rambler.ru";
+        # Also check images
+        images = true;
+        symbol = "RAMBLER_URIBL";
+    }
+~~~
+
  ## Principles of operation
  
  In this section, we define how `surbl` module performs its checks.
diff --git a/src/libserver/html.c b/src/libserver/html.c

index 29922b1333bb44c6607458fc97c6a2e586d3e9cd..5c55d6f30f7f7c6fdb3f909ae9de4ae5a1619bf7 100644 (file)
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
         *statep = state;
  }
  
-static struct rspamd_url *
-rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
+struct rspamd_url *
+rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
+               struct html_tag_component *comp)
  {
-       struct html_tag_component *comp;
         struct rspamd_url *url;
-       GList *cur;
-       const guchar *p;
         gchar *decoded;
         gint rc;
         gsize decoded_len;
         gboolean has_spaces = FALSE;
+       const gchar *p;
  
-       cur = tag->params->head;
+       p = start;
  
-       while (cur) {
-               comp = cur->data;
+       /* Strip spaces from the url */
+       /* Head spaces */
+       while (g_ascii_isspace (*p) && p < start + len) {
+               p ++;
+               start ++;
+               len --;
+               has_spaces = TRUE;
+       }
  
-               if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
-                       /* Strip spaces from the url component */
-                       p = comp->start;
+       if (comp) {
+               comp->start = p;
+               comp->len = len;
+       }
  
-                       while (g_ascii_isspace (*p) && p < comp->start + comp->len) {
-                               p ++;
-                               has_spaces = TRUE;
-                       }
+       /* Trailing spaces */
+       p = start + len - 1;
  
-                       comp->start = p;
-                       comp->len -= p - comp->start;
+       while (g_ascii_isspace (*p) && p >= start) {
+               p --;
+               len --;
  
-                       p = comp->start + comp->len - 1;
+               if (comp) {
+                       comp->len --;
+               }
+               has_spaces = TRUE;
+       }
  
-                       while (g_ascii_isspace (*p) && p >= comp->start) {
-                               p --;
-                               comp->len --;
-                               has_spaces = TRUE;
-                       }
+       /* Also we need to perform url decode */
+       decoded = rspamd_mempool_alloc (pool, len + 1);
+       rspamd_strlcpy (decoded, start, len + 1);
+       decoded_len = rspamd_decode_url (decoded, start, len);
  
-                       /* Also we need to perform url decode */
-                       decoded = rspamd_mempool_alloc (pool, comp->len + 1);
-                       rspamd_strlcpy (decoded, comp->start, comp->len + 1);
-                       decoded_len = rspamd_decode_url (decoded, comp->start, comp->len);
+       if (comp) {
+               comp->start = decoded;
+               comp->len = decoded_len;
+       }
  
-                       url = rspamd_mempool_alloc (pool, sizeof (*url));
-                       rc = rspamd_url_parse (url, decoded, decoded_len, pool);
+       url = rspamd_mempool_alloc (pool, sizeof (*url));
+       rc = rspamd_url_parse (url, decoded, decoded_len, pool);
  
-                       if (rc == URI_ERRNO_OK) {
+       if (rc == URI_ERRNO_OK) {
  
-                               /* Spaces in href usually mean an attempt to obfusicate URL */
-                               if (has_spaces) {
-                                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
-                               }
+               /* Spaces in href usually mean an attempt to obfuscate URL */
+               if (has_spaces) {
+                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+               }
  
-                               return url;
-                       }
+               return url;
+       }
+
+       return NULL;
+}
+
+static struct rspamd_url *
+rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
+{
+       struct html_tag_component *comp;
+       GList *cur;
+
+       cur = tag->params->head;
+
+       while (cur) {
+               comp = cur->data;
+
+               if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
+                       return rspamd_html_process_url (pool, comp->start, comp->len, comp);
                 }
  
                 cur = g_list_next (cur);
@@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                         save_space = FALSE;
                                 }
  
-                               if (cur_tag->id == Tag_A) {
+                               if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
                                         if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
                                                 url = rspamd_html_process_url_tag (pool, cur_tag);
  
@@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                         href_offset = dest->len;
                                                 }
                                         }
-                                       else if (cur_tag->flags & FL_CLOSING) {
+                                       else if (cur_tag->id == Tag_A &&
+                                                       (cur_tag->flags & FL_CLOSING)) {
                                                 /* Insert exception */
                                                 if (url != NULL && (gint)dest->len > href_offset) {
                                                         rspamd_html_url_is_phished (pool, url,
@@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                 url = NULL;
                                         }
                                 }
-                               else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
+
+                               if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
                                         rspamd_html_process_img_tag (pool, cur_tag, hc);
                                 }
                                 else if (!(cur_tag->flags & FL_CLOSING) &&
diff --git a/src/libserver/html.h b/src/libserver/html.h

index 3fe16696175c6a3c3587c04573681ba49cd75b6b..c16e7b040a40fa47abde218cbcb8f8dd6fb8c03b 100644 (file)
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
   */
  gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
  
+/**
+ * Extracts a URL from an HTML tag component and sets the component's elements if needed
+ * @param pool
+ * @param start
+ * @param len
+ * @param comp
+ * @return
+ */
+struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
+               const gchar *start, guint len,
+               struct html_tag_component *comp);
+
  #endif
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c

index 7572a0df8a8031b3e6bc1d488f31897e165be269..8942f9ec535491bf71a77ca1881b2a6c1d15e865 100644 (file)
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -48,6 +48,7 @@
  #include "rspamd.h"
  #include "surbl.h"
  #include "utlist.h"
+#include "libserver/html.h"
  #include "unix-std.h"
  
  static struct surbl_ctx *surbl_module_ctx = NULL;
@@ -410,6 +411,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
                         0,
                         NULL,
                         0);
+       rspamd_rcl_add_doc_by_path (cfg,
+                       "surbl.rule",
+                       "Check images URLs with this URL list",
+                       "images",
+                       UCL_BOOLEAN,
+                       NULL,
+                       0,
+                       NULL,
+                       0);
         rspamd_rcl_add_doc_by_path (cfg,
                         "surbl.rule",
                         "Parse IP bits in DNS reply, the content is 'symbol = <bit>'",
@@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg)
                                         new_suffix->options |= SURBL_OPTION_NOIP;
                                 }
                         }
+
                         cur = ucl_obj_get_key (cur_rule, "resolve_ip");
                         if (cur != NULL && cur->type == UCL_BOOLEAN) {
                                 if (ucl_object_toboolean (cur)) {
@@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg)
                                 }
                         }
  
+                       cur = ucl_obj_get_key (cur_rule, "images");
+                       if (cur != NULL && cur->type == UCL_BOOLEAN) {
+                               if (ucl_object_toboolean (cur)) {
+                                       new_suffix->options |= SURBL_OPTION_CHECKIMAGES;
+                               }
+                       }
+
                         if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) ==
                                         (SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) {
                                 /* Mutually exclusive options */
@@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
  {
         struct redirector_param param;
         struct suffix_item *suffix = user_data;
+       guint i, j;
+       struct mime_text_part *part;
+       struct html_image *img;
+       struct rspamd_url *url;
  
         param.task = task;
         param.suffix = suffix;
@@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
                 (rspamd_mempool_destruct_t)g_hash_table_unref,
                 param.tree);
         g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param);
+
+       /* We also need to check and process img URLs */
+       if (suffix->options & SURBL_OPTION_CHECKIMAGES) {
+               for (i = 0; i < task->text_parts->len; i ++) {
+                       part = g_ptr_array_index (task->text_parts, i);
+
+                       if (part->html && part->html->images) {
+                               for (j = 0; j < part->html->images->len; j ++) {
+                                       img = g_ptr_array_index (part->html->images, j);
+
+                                       if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL)
+                                                       && img->src) {
+                                               url = rspamd_html_process_url (task->task_pool,
+                                                               img->src, strlen (img->src), NULL);
+
+                                               if (url) {
+                                                       surbl_tree_url_callback (url, url, &param);
+                                                       msg_debug_task ("checked image url %s over %s",
+                                                                       img->src, suffix->suffix);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
  }
diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h

index 2477032b91708db99e9fd1a6e3deedccfc620937..68b27c3f08ea119f14301319ae5323b02c5f6d68 100644 (file)
--- a/src/plugins/surbl.h
+++ b/src/plugins/surbl.h
@@ -14,6 +14,7 @@
  #define DEFAULT_SURBL_SUFFIX "multi.surbl.org"
  #define SURBL_OPTION_NOIP (1 << 0)
  #define SURBL_OPTION_RESOLVEIP (1 << 1)
+#define SURBL_OPTION_CHECKIMAGES (1 << 2)
  #define MAX_LEVELS 10
  
  struct surbl_ctx {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 14 Jan 2016 13:30:55 +0000 (13:30 +0000)
doc/markdown/modules/surbl.md		patch \| blob \| blame \| history
src/libserver/html.c		patch \| blob \| blame \| history
src/libserver/html.h		patch \| blob \| blame \| history
src/plugins/surbl.c		patch \| blob \| blame \| history
src/plugins/surbl.h		patch \| blob \| blame \| history