[Rework] Html: Further html urls rework

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 25 May 2021 11:15:30 +0000 (12:15 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 25 May 2021 11:15:30 +0000 (12:15 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index c167b004f4bbd3196f17bf4b03e4407bd1b1fea4..c384a9023f328f6dd0f9d8179141a6a377fa87fa 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -30,6 +30,7 @@
  #include "html_tag_defs.hxx"
  #include "html_entities.hxx"
  #include "html_tag.hxx"
+#include "html_url.hxx"
  
  #include <vector>
  #include <frozen/unordered_map.h>
@@ -633,273 +634,76 @@ parse_tag_content(rspamd_mempool_t *pool,
         parser_env.cur_state = state;
  }
  
-}
-
-/* Unconverted C part */
-
-static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
-                                                                                                 const gchar *start, guint len,
-                                                                                                 struct html_tag_component *comp);
-
-
-
-
-struct rspamd_url *
-rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len,
-                                               struct html_tag_component *comp) {
-       struct rspamd_url *url;
-       guint saved_flags = 0;
-       gchar *decoded;
-       gint rc;
-       gsize decoded_len;
-       const gchar *p, *s, *prefix = "http://";
-       gchar *d;
-       guint i;
-       gsize dlen;
-       gboolean has_bad_chars = FALSE, no_prefix = FALSE;
-       static const gchar hexdigests[] = "0123456789abcdef";
-
-       p = start;
-
-       /* Strip spaces from the url */
-       /* Head spaces */
-       while (p < start + len && g_ascii_isspace (*p)) {
-               p++;
-               start++;
-               len--;
-       }
-
-       if (comp) {
-               comp->start = (guchar *)p;
-               comp->len = len;
-       }
-
-       /* Trailing spaces */
-       p = start + len - 1;
-
-       while (p >= start && g_ascii_isspace (*p)) {
-               p--;
-               len--;
-
-               if (comp) {
-                       comp->len--;
-               }
-       }
+/**
+ * Extracts and processes the url stored in the href parameter of a tag.
+ *
+ * When a base url is known for the content and the href is longer than two
+ * characters, the href is first rewritten in a pool allocated buffer:
+ *  - href without `://` that starts with `data:` is rejected;
+ *  - any other href without `://` is appended to the base url (a slash is
+ *    inserted if the base url has no data part);
+ *  - href that has `://` but starts with a single slash is attached to the
+ *    protocol and the host of the base url.
+ * The result is passed to `html_process_url`, and the parsed url is stored in
+ * `tag->extra` if that field is still empty.
+ *
+ * @param pool memory pool used for the temporary url buffers
+ * @param tag tag whose parameters are searched for the href component
+ * @param hc html content that may carry the base url (can be nullptr)
+ * @return parsed url, or std::nullopt if there is no href or it is not usable
+ */
+static auto
+html_process_url_tag(rspamd_mempool_t *pool,
+                                        struct html_tag *tag,
+                                        struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+       auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
  
-       s = start;
-       dlen = 0;
+       if (found_href_it != tag->parameters.end()) {
+               /* Check base url */
+               auto &href_value = found_href_it->second;
  
-       for (i = 0; i < len; i++) {
-               if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
-                       dlen += 3;
-               }
-               else {
-                       dlen++;
-               }
-       }
+               if (hc && hc->base_url && href_value.size() > 2) {
+                       /*
+                        * Relative url cannot start from the following:
+                        * schema://
+                        * data:
+                        * slash
+                        */
  
-       if (rspamd_substring_search(start, len, "://", 3) == -1) {
-               if (len >= sizeof("mailto:") &&
-                       (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
-                        memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
-                        memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
-                       /* Exclusion, has valid but 'strange' prefix */
-               }
-               else {
-                       for (i = 0; i < len; i++) {
-                               if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
-                                       if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
-                                               prefix = "http:";
-                                               dlen += sizeof("http:") - 1;
-                                               no_prefix = TRUE;
-                                       }
-                                       else if (s[i] == '@') {
-                                               /* Likely email prefix */
-                                               prefix = "mailto://";
-                                               dlen += sizeof("mailto://") - 1;
-                                               no_prefix = TRUE;
-                                       }
-                                       else if (s[i] == ':' && i != 0) {
-                                               /* Special case */
-                                               no_prefix = FALSE;
-                                       }
-                                       else {
-                                               if (i == 0) {
-                                                       /* No valid data */
-                                                       return NULL;
-                                               }
-                                               else {
-                                                       no_prefix = TRUE;
-                                                       dlen += strlen(prefix);
-                                               }
-                                       }
+                       if (rspamd_substring_search(href_value.data(), href_value.size(), "://", 3) == -1) {
  
-                                       break;
+                               if (href_value.size() >= sizeof("data:") &&
+                                       g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+                                       /* Image data url, never insert as url */
+                                       return std::nullopt;
                                 }
-                       }
-               }
-       }
-
-       decoded = (char *)rspamd_mempool_alloc (pool, dlen + 1);
-       d = decoded;
-
-       if (no_prefix) {
-               gsize plen = strlen(prefix);
-               memcpy(d, prefix, plen);
-               d += plen;
-       }
-
-       /*
-        * We also need to remove all internal newlines, spaces
-        * and encode unsafe characters
-        */
-       for (i = 0; i < len; i++) {
-               if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
-                       continue;
-               }
-               else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
-                       /* URL encode */
-                       *d++ = '%';
-                       *d++ = hexdigests[(s[i] >> 4) & 0xf];
-                       *d++ = hexdigests[s[i] & 0xf];
-                       has_bad_chars = TRUE;
-               }
-               else {
-                       *d++ = s[i];
-               }
-       }
-
-       *d = '\0';
-       dlen = d - decoded;
  
-       url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+                               /* Assume relative url */
+                               auto need_slash = false;
  
-       rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
+                               auto orig_len = href_value.size();
+                               auto len = orig_len + hc->base_url->urllen;
  
-       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
-       /* Filter some completely damaged urls */
-       if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
-               !((url->protocol & PROTOCOL_UNKNOWN))) {
-               url->flags |= saved_flags;
-
-               if (has_bad_chars) {
-                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
-               }
-
-               if (no_prefix) {
-                       url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+                               if (hc->base_url->datalen == 0) {
+                                       need_slash = true;
+                                       len++;
+                               }
  
-                       if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
-                               /* Ignore urls with both no schema and no tld */
-                               return NULL;
+                               auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+                               /* Each `%*s` consumes a length followed by the string pointer */
+                               auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1,
+                                               "%*s%s%*s",
+                                               hc->base_url->urllen, hc->base_url->string,
+                                               need_slash ? "/" : "",
+                                               (gint) orig_len, href_value.data());
+                               href_value = {buf, nlen};
+                       }
+                       else if (href_value[0] == '/' && href_value[1] != '/') {
+                               /* Relative to the hostname */
+                               auto orig_len = href_value.size();
+                               auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+                                          3 /* for :// */;
+                               auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+                               /*
+                                * The href already starts with '/', so no extra separator is
+                                * printed after the host: the output is exactly `len` bytes
+                                * long and the tail of the href is not cut by the size limit
+                                */
+                               auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s://%*s%*s",
+                                               hc->base_url->protocollen, hc->base_url->string,
+                                               hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
+                                               (gint)orig_len, href_value.data());
+                               href_value = {buf, nlen};
                         }
                 }
  
-               decoded = url->string;
-               decoded_len = url->urllen;
+               auto url = html_process_url(pool, href_value);
  
-               if (comp) {
-                       comp->start = (guchar *)decoded;
-                       comp->len = decoded_len;
-               }
-               /* Spaces in href usually mean an attempt to obfuscate URL */
-               /* See https://github.com/vstakhov/rspamd/issues/593 */
-#if 0
-               if (has_spaces) {
-                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+               if (url && tag->extra == nullptr) {
+                       tag->extra = url.value();
                 }
-#endif
  
                 return url;
         }
  
-       return NULL;
-}
-
-static struct rspamd_url *
-rspamd_html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag,
-                                                       struct html_content *hc) {
-       struct html_tag_component *comp;
-       GList *cur;
-       struct rspamd_url *url;
-       const gchar *start;
-       gsize len;
-
-       cur = tag->params->head;
-
-       while (cur) {
-               comp = (struct html_tag_component *)cur->data;
-
-               if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
-                       start = (char *)comp->start;
-                       len = comp->len;
-
-                       /* Check base url */
-                       if (hc && hc->base_url && comp->len > 2) {
-                               /*
-                                * Relative url cannot start from the following:
-                                * schema://
-                                * data:
-                                * slash
-                                */
-                               gchar *buf;
-                               gsize orig_len;
-
-                               if (rspamd_substring_search(start, len, "://", 3) == -1) {
-
-                                       if (len >= sizeof("data:") &&
-                                               g_ascii_strncasecmp(start, "data:", sizeof("data:") - 1) == 0) {
-                                               /* Image data url, never insert as url */
-                                               return NULL;
-                                       }
-
-                                       /* Assume relative url */
-
-                                       gboolean need_slash = FALSE;
-
-                                       orig_len = len;
-                                       len += hc->base_url->urllen;
-
-                                       if (hc->base_url->datalen == 0) {
-                                               need_slash = TRUE;
-                                               len++;
-                                       }
-
-                                       buf = (char *)rspamd_mempool_alloc (pool, len + 1);
-                                       rspamd_snprintf(buf, len + 1, "%*s%s%*s",
-                                                       hc->base_url->urllen, hc->base_url->string,
-                                                       need_slash ? "/" : "",
-                                                       (gint) orig_len, start);
-                                       start = buf;
-                               }
-                               else if (start[0] == '/' && start[1] != '/') {
-                                       /* Relative to the hostname */
-                                       orig_len = len;
-                                       len += hc->base_url->hostlen + hc->base_url->protocollen +
-                                                  3 /* for :// */;
-                                       buf = (char *)rspamd_mempool_alloc (pool, len + 1);
-                                       rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
-                                                       hc->base_url->protocollen, hc->base_url->string,
-                                                       hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
-                                                       (gint) orig_len, start);
-                                       start = buf;
-                               }
-                       }
-
-                       url = rspamd_html_process_url(pool, start, len, comp);
-
-                       if (url && tag->extra == NULL) {
-                               tag->extra = url;
-                       }
-
-                       return url;
-               }
-
-               cur = g_list_next (cur);
-       }
-
-       return NULL;
+       return std::nullopt;
  }
  
  struct rspamd_html_url_query_cbd {
@@ -910,8 +714,9 @@ struct rspamd_html_url_query_cbd {
  };
  
  static gboolean
-rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
-                                                          gsize end_offset, gpointer ud) {
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+                                                          gsize end_offset, gpointer ud)
+{
         struct rspamd_html_url_query_cbd *cbd =
                         (struct rspamd_html_url_query_cbd *) ud;
         rspamd_mempool_t *pool;
@@ -939,9 +744,10 @@ rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
  }
  
  static void
-rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
-                                               khash_t (rspamd_url_hash) *url_set,
-                                               GPtrArray *part_urls) {
+process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+                                          khash_t (rspamd_url_hash) *url_set,
+                                          GPtrArray *part_urls)
+{
         if (url->querylen > 0) {
                 struct rspamd_html_url_query_cbd qcbd;
  
@@ -953,7 +759,7 @@ rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                 rspamd_url_find_multiple(pool,
                                 rspamd_url_query_unsafe (url), url->querylen,
                                 RSPAMD_URL_FIND_ALL, NULL,
-                               rspamd_html_url_query_callback, &qcbd);
+                               html_url_query_callback, &qcbd);
         }
  
         if (part_urls) {
@@ -1013,10 +819,12 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
  }
  
  static void
-rspamd_html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
-                                                       struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
-                                                       GPtrArray *part_urls,
-                                                       GByteArray *dest) {
+html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+                                        struct html_content *hc,
+                                        khash_t (rspamd_url_hash) *url_set,
+                                        GPtrArray *part_urls,
+                                        GByteArray *dest)
+{
         struct html_tag_component *comp;
         struct html_image *img;
         rspamd_ftok_t fstr;
@@ -1205,6 +1013,10 @@ rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
         }
  }
  
+}
+
+/* Unconverted C part */
+
  static void
  rspamd_html_process_color(const gchar *line, guint len, struct html_color *cl)
  {
@@ -1764,80 +1576,7 @@ rspamd_html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
         tag->extra = bl;
  }
  
-static void
-rspamd_html_check_displayed_url(rspamd_mempool_t *pool,
-                                                               GList **exceptions,
-                                                               khash_t (rspamd_url_hash) *url_set,
-                                                               GByteArray *dest,
-                                                               gint href_offset,
-                                                               struct rspamd_url *url) {
-       struct rspamd_url *displayed_url = NULL;
-       struct rspamd_url *turl;
-       gboolean url_found = FALSE;
-       struct rspamd_process_exception *ex;
-       guint saved_flags = 0;
-       gsize dlen;
-
-       if (href_offset < 0) {
-               /* No dispalyed url, just some text within <a> tag */
-               return;
-       }
-
-       url->visible_part = (gchar *)rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
-       rspamd_strlcpy(url->visible_part,
-                       reinterpret_cast<const gchar *>(dest->data + href_offset),
-                       dest->len - href_offset + 1);
-       dlen = dest->len - href_offset;
-
-       /* Strip unicode spaces from the start and the end */
-       url->visible_part = rspamd_string_unicode_trim_inplace(url->visible_part,
-                       &dlen);
-       rspamd_html_url_is_phished(pool, url,
-                       reinterpret_cast<const guchar *>(url->visible_part),
-                       dlen,
-                       &url_found, &displayed_url);
-
-       if (url_found) {
-               url->flags |= saved_flags | RSPAMD_URL_FLAG_DISPLAY_URL;
-       }
-
-       if (exceptions && url_found) {
-               ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
-               ex->pos = href_offset;
-               ex->len = dest->len - href_offset;
-               ex->type = RSPAMD_EXCEPTION_URL;
-               ex->ptr = url;
-
-               *exceptions = g_list_prepend(*exceptions,
-                               ex);
-       }
-
-       if (displayed_url && url_set) {
-               turl = rspamd_url_set_add_or_return(url_set,
-                               displayed_url);
  
-               if (turl != NULL) {
-                       /* Here, we assume the following:
-                        * if we have a URL in the text part which
-                        * is the same as displayed URL in the
-                        * HTML part, we assume that it is also
-                        * hint only.
-                        */
-                       if (turl->flags &
-                               RSPAMD_URL_FLAG_FROM_TEXT) {
-                               turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
-                               turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
-                       }
-
-                       turl->count++;
-               }
-               else {
-                       /* Already inserted by `rspamd_url_set_add_or_return` */
-               }
-       }
-
-       rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
-}
  
  static gboolean
  rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h

index 14217b2c9112ebbcedb40fed90f8ef06bf6ea35a..afa46eb06330711898380578c82d46fdff2efbd0 100644 (file)
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -46,7 +46,6 @@ extern "C" {
  
  
  struct rspamd_image;
-struct html_tag;
  
  struct html_image {
         guint height;
@@ -55,7 +54,7 @@ struct html_image {
         gchar *src;
         struct rspamd_url *url;
         struct rspamd_image *embedded_image;
-       struct html_tag *tag;
+       void *tag;
  };
  
  struct html_color {
@@ -79,7 +78,7 @@ struct html_color {
  };
  
  struct html_block {
-       struct html_tag *tag;
+       void *tag;
         struct html_color font_color;
         struct html_color background_color;
         //struct html_tag_component style;
@@ -101,8 +100,6 @@ struct html_block {
  #define FL_HREF         (1 << 29)
  #define FL_IMAGE        (1 << 30)
  
-
-
  /* Forwarded declaration */
  struct rspamd_task;
  
@@ -122,13 +119,13 @@ struct html_content {
  /*
   * Decode HTML entitles in text. Text is modified in place.
   */
-guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
  
-GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part(rspamd_mempool_t *pool,
                                                                           struct html_content *hc,
                                                                           GByteArray *in);
  
-GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part_full(rspamd_mempool_t *pool,
                                                                                    struct html_content *hc,
                                                                                    GByteArray *in, GList **exceptions,
                                                                                    khash_t (rspamd_url_hash) *url_set,
@@ -138,21 +135,21 @@ GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
  /*
   * Returns true if a specified tag has been seen in a part
   */
-gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
+gboolean rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname);
  
  /**
   * Returns name for the specified tag id
   * @param id
   * @return
   */
-const gchar *rspamd_html_tag_by_id (gint id);
+const gchar *rspamd_html_tag_by_id(gint id);
  
  /**
   * Returns HTML tag id by name
   * @param name
   * @return
   */
-gint rspamd_html_tag_by_name (const gchar *name);
+gint rspamd_html_tag_by_name(const gchar *name);
  
  /**
   * Extract URL from HTML tag component and sets component elements if needed
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx

index 93728119ba36457de1c661d5be4d51e6d51d0000..5c4fb8d56f51e705960423203930126bd95fd28d 100644 (file)
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -18,6 +18,7 @@
  #include "libutil/str_util.h"
  #include "libserver/url.h"
  #include "libserver/logger.h"
+#include "rspamd.h"
  
  #include <unicode/idna.h>
  
@@ -137,7 +138,7 @@ html_url_is_phished(rspamd_mempool_t *pool,
         if (text_data.size() > 4 &&
                 rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
                                 RSPAMD_URL_FIND_ALL,
-                               &url_pos, NULL) && url_str != NULL) {
+                               &url_pos, NULL) && url_str != nullptr) {
  
                 text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
                 auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
@@ -197,4 +198,221 @@ html_url_is_phished(rspamd_mempool_t *pool,
         return std::nullopt;
  }
  
+/**
+ * Compares the text shown for a link with the url from its href.
+ *
+ * The visible text is copied into the pool, trimmed from unicode spaces and
+ * attached to `url->visible_part`. If `html_url_is_phished` finds a url in
+ * that text, the href url is marked with RSPAMD_URL_FLAG_DISPLAY_URL, an
+ * url exception is prepended to `exceptions` (if provided) and the displayed
+ * url is added to `url_set` (if provided).
+ *
+ * @param pool memory pool for the visible part copy and the exception
+ * @param exceptions list of processing exceptions to extend, may be nullptr
+ * @param url_set opaque pointer to khash_t(rspamd_url_hash), may be nullptr
+ * @param visible_part text displayed for the link; nothing is done if empty
+ * @param href_offset stored as the position of the created exception
+ * @param url url from the href, its flags and visible part are updated
+ */
+void
+html_check_displayed_url(rspamd_mempool_t *pool,
+                                                GList **exceptions,
+                                                void *url_set,
+                                                std::string_view visible_part,
+                                                goffset href_offset,
+                                                struct rspamd_url *url)
+{
+       if (visible_part.empty()) {
+               /* No displayed url, just some text within <a> tag */
+               return;
+       }
+
+       /*
+        * The size passed to strlcpy counts the trailing zero byte as well, so
+        * the whole buffer size is used here: with `size()` alone the last
+        * visible character would be dropped
+        */
+       url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+       rspamd_strlcpy(url->visible_part,
+                       visible_part.data(),
+                       visible_part.size() + 1);
+       gsize dlen = visible_part.size();
+
+       /* Strip unicode spaces from the start and the end */
+       url->visible_part = const_cast<char *>(
+                       rspamd_string_unicode_trim_inplace(url->visible_part,
+                       &dlen));
+       const auto maybe_url = html_url_is_phished(pool, url,
+                       {url->visible_part, dlen});
+       struct rspamd_url *displayed_url = nullptr;
+
+       if (maybe_url) {
+               url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+               displayed_url = maybe_url.value();
+       }
+
+       if (exceptions && displayed_url != nullptr) {
+               auto *ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
+               ex->pos = href_offset;
+               ex->len = dlen;
+               ex->type = RSPAMD_EXCEPTION_URL;
+               ex->ptr = url;
+
+               *exceptions = g_list_prepend(*exceptions, ex);
+       }
+
+       if (displayed_url && url_set) {
+               auto *turl = rspamd_url_set_add_or_return(
+                               static_cast<khash_t (rspamd_url_hash) *>(url_set), displayed_url);
+
+               if (turl != nullptr) {
+                       /* Here, we assume the following:
+                        * if we have a URL in the text part which
+                        * is the same as displayed URL in the
+                        * HTML part, we assume that it is also
+                        * hint only.
+                        */
+                       if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+                               turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+                               turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+                       }
+
+                       turl->count++;
+               }
+               /* Otherwise: already inserted by `rspamd_url_set_add_or_return` */
+       }
+
+       rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
+}
+
+/**
+ * Builds a url object from a raw string taken from HTML (e.g. href value).
+ *
+ * The input is trimmed from unicode spaces, a schema prefix is prepended when
+ * the string has none (`http://`, `http:` for `//host` form, `mailto://` when
+ * `@` is met first), ASCII whitespace is removed and other non-graphic ASCII
+ * bytes are percent-encoded. The result is normalised and parsed with
+ * RSPAMD_URL_PARSE_HREF.
+ *
+ * @param pool memory pool used for the decoded string and the url object
+ * @param input source string; it is replaced by the trimmed view and, on
+ * success, by the string of the parsed url
+ * @return parsed url, or std::nullopt if the string does not start with valid
+ * data, cannot be parsed, has no host, has unknown protocol, or has neither
+ * schema nor tld
+ */
+auto
+html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+       -> std::optional<struct rspamd_url *>
+{
+       static const gchar hexdigests[] = "0123456789abcdef";
+       const gchar *prefix = "http://";
+       guint saved_flags = 0;
+       bool has_bad_chars = false, no_prefix = false;
+
+       auto sz = input.length();
+       const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+       input = {trimmed, sz};
+
+       const gchar *s = input.data();
+       gsize dlen = 0;
+
+       /* Upper bound of the output: non-graphic ASCII byte takes 3 bytes (%XX) */
+       for (gsize i = 0; i < sz; i++) {
+               if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+                       dlen += 3;
+               }
+               else {
+                       dlen++;
+               }
+       }
+
+       if (rspamd_substring_search(s, sz, "://", 3) == -1) {
+               if (sz >= sizeof("mailto:") &&
+                       (memcmp(s, "mailto:", sizeof("mailto:") - 1) == 0 ||
+                        memcmp(s, "tel:", sizeof("tel:") - 1) == 0 ||
+                        memcmp(s, "callto:", sizeof("callto:") - 1) == 0)) {
+                       /* Exclusion, has valid but 'strange' prefix */
+               }
+               else {
+                       /* The first byte that is neither alnum nor 8 bit defines the prefix */
+                       for (gsize i = 0; i < sz; i++) {
+                               if ((s[i] & 0x80) || g_ascii_isalnum (s[i])) {
+                                       continue;
+                               }
+
+                               if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
+                                       prefix = "http:";
+                                       dlen += sizeof("http:") - 1;
+                                       no_prefix = true;
+                               }
+                               else if (s[i] == '@') {
+                                       /* Likely email prefix */
+                                       prefix = "mailto://";
+                                       dlen += sizeof("mailto://") - 1;
+                                       no_prefix = true;
+                               }
+                               else if (s[i] == ':' && i != 0) {
+                                       /* Special case */
+                                       no_prefix = false;
+                               }
+                               else if (i == 0) {
+                                       /* No valid data */
+                                       return std::nullopt;
+                               }
+                               else {
+                                       no_prefix = true;
+                                       dlen += strlen(prefix);
+                               }
+
+                               break;
+                       }
+               }
+       }
+
+       auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
+       gchar *d = decoded;
+
+       if (no_prefix) {
+               gsize plen = strlen(prefix);
+               memcpy(d, prefix, plen);
+               d += plen;
+       }
+
+       /*
+        * We also need to remove all internal newlines, spaces
+        * and encode unsafe characters
+        */
+       for (gsize i = 0; i < sz; i++) {
+               if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
+                       continue;
+               }
+               else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+                       /* URL encode */
+                       *d++ = '%';
+                       *d++ = hexdigests[(s[i] >> 4) & 0xf];
+                       *d++ = hexdigests[s[i] & 0xf];
+                       has_bad_chars = true;
+               }
+               else {
+                       *d++ = s[i];
+               }
+       }
+
+       *d = '\0';
+       dlen = d - decoded;
+
+       auto *url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+       rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
+       const auto rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
+
+       /* Filter some completely damaged urls */
+       if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
+               !((url->protocol & PROTOCOL_UNKNOWN))) {
+               url->flags |= saved_flags;
+
+               if (has_bad_chars) {
+                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+               }
+
+               if (no_prefix) {
+                       url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+                       if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+                               /* Ignore urls with both no schema and no tld */
+                               return std::nullopt;
+                       }
+               }
+
+               /* Give the parsed form of the url back to the caller */
+               input = {url->string, url->urllen};
+
+               /* Spaces in href usually mean an attempt to obfuscate URL */
+               /* See https://github.com/vstakhov/rspamd/issues/593 */
+#if 0
+               if (has_spaces) {
+                       url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+               }
+#endif
+
+               return url;
+       }
+
+       return std::nullopt;
+}
+
  }
 \ No newline at end of file
diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx

index 7bf81b7d7b9ec89d69120a4f567500f61b610148..6c2f5a71d732c5caa86bc1cdffd5735174f7215e 100644 (file)
--- a/src/libserver/html/html_url.hxx
+++ b/src/libserver/html/html_url.hxx
@@ -19,6 +19,7 @@
  #pragma once
  
  #include "libutil/mem_pool.h"
+
  #include <string_view>
  #include <optional>
  
@@ -38,7 +39,30 @@ auto html_url_is_phished(rspamd_mempool_t *pool,
                                         struct rspamd_url *href_url,
                                         std::string_view text_data) -> std::optional<rspamd_url *>;
  
+/**
+ * Check displayed part of the url at specified offset: the visible text of
+ * a link is attached to the href url and searched for another url
+ * (see `html_url_is_phished`)
+ * @param pool memory pool used for the copy of the visible part and for the
+ * exception that may be created
+ * @param exceptions list that receives an url exception if a displayed url
+ * has been found (can be NULL)
+ * @param url_set opaque pointer to the hash set of urls where the displayed
+ * url is added (can be NULL)
+ * @param visible_part text displayed for the link, nothing is done if empty
+ * @param href_offset position that is stored in the created exception
+ * @param url url from the href, its visible part and flags are updated
+ */
+auto html_check_displayed_url(rspamd_mempool_t *pool,
+                                                GList **exceptions,
+                                                void *url_set,
+                                                std::string_view visible_part,
+                                                goffset href_offset,
+                                                struct rspamd_url *url) -> void;
  
+/**
+ * Process HTML url (e.g. for href component): trims the input, adds a schema
+ * prefix if it is missing, encodes unsafe characters and parses the result
+ * @param pool memory pool used for the decoded string and the url object
+ * @param input may be modified during the process: it is trimmed and, on
+ * success, replaced by the string of the parsed url
+ * @return parsed url or std::nullopt if the input cannot be used as an url
+ */
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+       -> std::optional<struct rspamd_url *>;
  }
  
  #endif //RSPAMD_HTML_URL_HXX
 \ No newline at end of file
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 25 May 2021 11:15:30 +0000 (12:15 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
src/libserver/html/html.cxx		patch \| blob \| blame \| history
src/libserver/html/html.h		patch \| blob \| blame \| history
src/libserver/html/html_url.cxx		patch \| blob \| blame \| history
src/libserver/html/html_url.hxx		patch \| blob \| blame \| history