git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Rework URL structure: host field
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 6 Mar 2020 12:01:37 +0000 (12:01 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Mar 2020 10:46:11 +0000 (10:46 +0000)
src/libserver/html.c
src/libserver/protocol.c
src/libserver/url.c
src/libserver/url.h
src/lua/lua_url.c

index b7e78e57b7dd175d60021c0b9156402fd1b8adcf..7dca724536695a3d8e73e0391716b74dbaf069e2 100644 (file)
@@ -658,14 +658,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 
                if (rc == URI_ERRNO_OK) {
                        disp_tok.len = text_url->hostlen;
-                       disp_tok.begin = text_url->host;
+                       disp_tok.begin = rspamd_url_host_unsafe (text_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-                       if (rspamd_substring_search_caseless (text_url->host,
+                       if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
                                        text_url->hostlen, "xn--", 4) != -1) {
                                idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
                                /* We need to convert it to the normal value first */
                                disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
-                                               text_url->host, text_url->hostlen,
+                                               rspamd_url_host_unsafe (text_url), text_url->hostlen,
                                                idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
                                if (uc_err != U_ZERO_ERROR) {
@@ -679,14 +679,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
                        }
 #endif
                        href_tok.len = href_url->hostlen;
-                       href_tok.begin = href_url->host;
+                       href_tok.begin = rspamd_url_host_unsafe (href_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-                       if (rspamd_substring_search_caseless (href_url->host,
+                       if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
                                        href_url->hostlen, "xn--", 4) != -1) {
                                idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
                                /* We need to convert it to the normal value first */
                                href_tok.len = uidna_nameToUnicodeUTF8 (udn,
-                                               href_url->host, href_url->hostlen,
+                                               rspamd_url_host_unsafe (href_url), href_url->hostlen,
                                                idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
                                if (uc_err != U_ZERO_ERROR) {
@@ -1594,7 +1594,7 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
                                        buf = rspamd_mempool_alloc (pool, len + 1);
                                        rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
                                                        hc->base_url->protocollen, hc->base_url->string,
-                                                       hc->base_url->hostlen, hc->base_url->host,
+                                                       hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
                                                        (gint)orig_len, start);
                                        start = buf;
                                }
index 4c1a94d9947c823b8c07f5d7dfedeffcbb3dc902..16dc05491daba17612b93e4310ac20dfc63cc89b 100644 (file)
@@ -882,7 +882,7 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
                ucl_object_insert_key (obj, elt, "tld", 0, false);
        }
        if (url->hostlen > 0) {
-               elt = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+               elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url), url->hostlen, 0);
                ucl_object_insert_key (obj, elt, "host", 0, false);
        }
 
@@ -925,11 +925,14 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 
                        goffset err_offset;
 
-                       if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) {
-                               obj = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+                       if ((err_offset = rspamd_fast_utf8_validate (rspamd_url_host_unsafe (url),
+                                       url->hostlen)) == 0) {
+                               obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+                                               url->hostlen, 0);
                        }
                        else {
-                               obj = ucl_object_fromstring_common (url->host, err_offset - 1, 0);
+                               obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+                                               err_offset - 1, 0);
                        }
                }
                else {
index a2a9d852fe98295cbd3dffef166d83e199031247..ac4c119168f4e62a29bdcf4fc101df286f79a745 100644 (file)
@@ -1526,12 +1526,12 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 
        pos = text + match_start;
        p = pos - 1;
-       start = url->host;
+       start = rspamd_url_host_unsafe (url);
 
        if (*pos != '.' || match_pos != (gint) url->hostlen) {
                /* Something weird has been found */
                if (match_pos == (gint) url->hostlen - 1) {
-                       pos = url->host + match_pos;
+                       pos = rspamd_url_host_unsafe (url) + match_pos;
                        if (*pos == '.') {
                                /* This is dot at the end of domain */
                                url->hostlen--;
@@ -1560,9 +1560,9 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
        }
 
        if ((ndots == 0 || p == start - 1) &&
-                       url->tldlen < url->host + url->hostlen - pos) {
+                       url->tldlen < rspamd_url_host_unsafe (url) + url->hostlen - pos) {
                url->tld = (gchar *) pos;
-               url->tldlen = url->host + url->hostlen - pos;
+               url->tldlen = rspamd_url_host_unsafe (url) + url->hostlen - pos;
        }
 
        return 0;
@@ -1586,13 +1586,13 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
        /* Allocate new string to build it from IP */
        strbuf = rspamd_mempool_alloc (pool, slen + 1);
        r += rspamd_snprintf (strbuf + r, slen - r, "%*s",
-                       (gint)(uri->host - uri->string),
+                       (gint)(uri->hostshift),
                        uri->string);
-       uri->host = strbuf + r;
+       uri->hostshift = r;
        inet_ntop (af, addr, strbuf + r, slen - r + 1);
-       uri->hostlen = strlen (uri->host);
+       uri->hostlen = strlen (rspamd_url_host_unsafe (uri));
        r += uri->hostlen;
-       uri->tld = uri->host;
+       uri->tld = rspamd_url_host_unsafe (uri);
        uri->tldlen = uri->hostlen;
        uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
 
@@ -1638,7 +1638,7 @@ rspamd_url_is_ip (struct rspamd_url *uri, rspamd_mempool_t *pool)
        gboolean ret = FALSE, check_num = TRUE;
        guint32 n, dots, t = 0, i = 0, shift, nshift;
 
-       p = uri->host;
+       p = rspamd_url_host_unsafe (uri);
        end = p + uri->hostlen;
 
        if (*p == '[' && *(end - 1) == ']') {
@@ -1814,9 +1814,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 
                old_shift = uri->hostlen;
                uri->hostlen -= shift;
-               remain = (uri->urllen - (uri->host - uri->string)) - old_shift;
+               remain = (uri->urllen - (uri->hostshift)) - old_shift;
                g_assert (remain >= 0);
-               memmove (uri->host + uri->hostlen, uri->host + old_shift,
+               memmove (rspamd_url_host_unsafe (uri) + uri->hostlen,
+                               rspamd_url_host_unsafe (uri) + old_shift,
                                remain);
                uri->urllen -= shift;
                uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED;
@@ -1877,7 +1878,7 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
                        uri->usershift -= shift;
                }
                if (uri->hostlen > 0) {
-                       uri->host -= shift;
+                       uri->hostshift -= shift;
                }
                /* Go forward */
        case UF_HOST:
@@ -1908,9 +1909,9 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
        gint i = 0, w, orig_len;
        UChar32 uc;
 
-       t = uri->host;
+       t = rspamd_url_host_unsafe (uri);
        h = t;
-       end = uri->host + uri->hostlen;
+       end = t + uri->hostlen;
        orig_len = uri->hostlen;
 
        if (*h == '+') {
@@ -1931,7 +1932,7 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
                h += i;
        }
 
-       uri->hostlen = t - uri->host;
+       uri->hostlen = t - rspamd_url_host_unsafe (uri);
        uri->urllen -= (orig_len - uri->hostlen);
 }
 
@@ -2022,7 +2023,7 @@ rspamd_url_parse (struct rspamd_url *uri,
                                uri->protocollen = u.field_data[i].len;
                                break;
                        case UF_HOST:
-                               uri->host = comp;
+                               uri->hostshift = u.field_data[i].off;
                                uri->hostlen = complen;
                                break;
                        case UF_PATH:
@@ -2059,16 +2060,20 @@ rspamd_url_parse (struct rspamd_url *uri,
                        uri->string,
                        uri->protocollen);
        rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
-       unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+       unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri),
+                       rspamd_url_host_unsafe (uri), uri->hostlen);
 
-       if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+       if (rspamd_normalise_unicode_inplace (pool,
+                       rspamd_url_host_unsafe (uri), &unquoted_len)) {
                uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
        }
 
 
        if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
                /* Ensure that hostname starts with something sane (exclude numeric urls) */
-               if (!(is_domain_start (uri->host[0]) || uri->host[0] == ':')) {
+               const gchar* host = rspamd_url_host_unsafe (uri);
+
+               if (!(is_domain_start (host[0]) || host[0] == ':')) {
                        return URI_ERRNO_BAD_FORMAT;
                }
        }
@@ -2093,7 +2098,7 @@ rspamd_url_parse (struct rspamd_url *uri,
        struct UConverter *utf8_conv = rspamd_get_utf8_converter ();
 
        utf16_len = ucnv_toUChars (utf8_conv, utf16_hostname, uri->hostlen,
-                       uri->host, uri->hostlen, &uc_err);
+                       rspamd_url_host_unsafe (uri), uri->hostlen, &uc_err);
 
        if (!U_SUCCESS (uc_err)) {
 
@@ -2110,7 +2115,8 @@ rspamd_url_parse (struct rspamd_url *uri,
        }
 
        /* Convert back to utf8, sigh... */
-       norm_utf8_len = ucnv_fromUChars (utf8_conv, uri->host, uri->hostlen,
+       norm_utf8_len = ucnv_fromUChars (utf8_conv,
+                       rspamd_url_host_unsafe (uri), uri->hostlen,
                        norm_utf16, norm_utf16_len, &uc_err);
 
        if (!U_SUCCESS (uc_err)) {
@@ -2154,7 +2160,7 @@ rspamd_url_parse (struct rspamd_url *uri,
        }
 
        rspamd_str_lc (uri->string, uri->protocollen);
-       unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+       unquoted_len = rspamd_str_lc_utf8 (rspamd_url_host_unsafe (uri), uri->hostlen);
        rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
        if (uri->protocol == PROTOCOL_UNKNOWN) {
@@ -2172,7 +2178,7 @@ rspamd_url_parse (struct rspamd_url *uri,
        if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
                /* Find TLD part */
                rspamd_multipattern_lookup (url_scanner->search_trie,
-                               uri->host, uri->hostlen,
+                               rspamd_url_host_unsafe (uri), uri->hostlen,
                                rspamd_tld_trie_callback, uri, NULL);
 
                if (uri->tldlen == 0) {
@@ -2184,7 +2190,7 @@ rspamd_url_parse (struct rspamd_url *uri,
                        } else {
                                if (!rspamd_url_is_ip (uri, pool)) {
                                        /* Assume tld equal to host */
-                                       uri->tld = uri->host;
+                                       uri->tld = rspamd_url_host_unsafe (uri);
                                        uri->tldlen = uri->hostlen;
                                }
                        }
@@ -2194,7 +2200,8 @@ rspamd_url_parse (struct rspamd_url *uri,
                if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP) &&
                        uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) {
 
-                       gchar *pos = &uri->string[uri->protocollen], *host_start = uri->host;
+                       gchar *pos = &uri->string[uri->protocollen],
+                                       *host_start = rspamd_url_host_unsafe (uri);
 
                        while (pos < host_start) {
                                if (*pos == '\\') {
@@ -2209,12 +2216,12 @@ rspamd_url_parse (struct rspamd_url *uri,
                /* We need to normalise phone number: remove all spaces and braces */
                rspamd_telephone_normalise_inplace (uri);
 
-               if (uri->host[0] == '+') {
-                       uri->tld = uri->host + 1;
+               if (rspamd_url_host_unsafe (uri)[0] == '+') {
+                       uri->tld = rspamd_url_host_unsafe (uri) + 1;
                        uri->tldlen = uri->hostlen - 1;
                }
                else {
-                       uri->tld = uri->host;
+                       uri->tld = rspamd_url_host_unsafe (uri);
                        uri->tldlen = uri->hostlen;
                }
        }
@@ -3362,7 +3369,8 @@ rspamd_url_host_hash (gconstpointer u)
        const struct rspamd_url *url = u;
 
        if (url->hostlen > 0) {
-               return (guint)rspamd_cryptobox_fast_hash (url->host, url->hostlen,
+               return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
+                               url->hostlen,
                                rspamd_hash_seed ());
        }
 
@@ -3378,7 +3386,7 @@ rspamd_email_hash (gconstpointer u)
        rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
 
        if (url->hostlen > 0) {
-               rspamd_cryptobox_fast_hash_update (&st, url->host, url->hostlen);
+               rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen);
        }
 
        if (url->userlen > 0) {
@@ -3399,7 +3407,8 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
                return FALSE;
        }
        else {
-               if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) {
+               if ((r = rspamd_lc_cmp (rspamd_url_host_unsafe (u1),
+                               rspamd_url_host_unsafe (u2), u1->hostlen)) == 0) {
                        if (u1->userlen != u2->userlen || u1->userlen == 0) {
                                return FALSE;
                        }
@@ -3443,7 +3452,8 @@ rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
                return FALSE;
        }
        else {
-               r = memcmp (u1->host, u2->host, u1->hostlen);
+               r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2),
+                               u1->hostlen);
        }
 
        return r == 0;
@@ -3637,7 +3647,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
 
        g_assert (pdlen != NULL && url != NULL && pool != NULL);
 
-       CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+       CHECK_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
                        RSPAMD_URL_FLAGS_HOSTSAFE);
        CHECK_URL_COMPONENT (rspamd_url_user_unsafe(url), url->userlen,
                        RSPAMD_URL_FLAGS_USERSAFE);
@@ -3683,7 +3693,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
                *d++ = ':';
        }
 
-       ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+       ENCODE_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
                        RSPAMD_URL_FLAGS_HOSTSAFE);
 
        if (url->datalen > 0) {
index 78330d8143df9cae82d44d14f2016791cc1a6174..080f005c3be54b2593013fecdcea21bc0f74779e 100644 (file)
@@ -50,7 +50,9 @@ struct rspamd_url {
        guint usershift;
        guint userlen;
 
-       gchar *host;
+       guint hostshift;
+       guint hostlen;
+
        gchar *data;
        gchar *query;
        gchar *fragment;
@@ -60,7 +62,6 @@ struct rspamd_url {
        struct rspamd_url *phished_url;
 
        guint protocollen;
-       guint hostlen;
        guint datalen;
        guint querylen;
        guint fragmentlen;
@@ -75,6 +76,9 @@ struct rspamd_url {
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
 #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift)
 
+#define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
+#define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
+
 enum uri_errno {
        URI_ERRNO_OK = 0,           /* Parsing went well */
        URI_ERRNO_EMPTY,        /* The URI string was empty */
index 7b0dee89b84ac20d6a1b94e3174e4b36a4d874c1..bd94120e2f8166a24d5e2c5a0e594fee04e264b1 100644 (file)
@@ -158,8 +158,8 @@ lua_url_get_host (lua_State *L)
        LUA_TRACE_POINT;
        struct rspamd_lua_url *url = lua_check_url (L, 1);
 
-       if (url != NULL) {
-               lua_pushlstring (L, url->url->host, url->url->hostlen);
+       if (url != NULL && url->url && url->url->hostlen > 0) {
+               lua_pushlstring (L, rspamd_url_host (url->url), url->url->hostlen);
        }
        else {
                lua_pushnil (L);
@@ -312,7 +312,7 @@ lua_url_tostring (lua_State *L)
                        }
 
                        tmp[url->url->userlen] = '@';
-                       memcpy (tmp + url->url->userlen + 1, url->url->host,
+                       memcpy (tmp + url->url->userlen + 1, rspamd_url_host_unsafe (url->url),
                                        url->url->hostlen);
 
                        lua_pushlstring (L, tmp, url->url->userlen + 1 + url->url->hostlen);
@@ -660,7 +660,7 @@ lua_url_to_table (lua_State *L)
 
                if (u->hostlen > 0) {
                        lua_pushstring (L, "host");
-                       lua_pushlstring (L, u->host, u->hostlen);
+                       lua_pushlstring (L, rspamd_url_host_unsafe (u), u->hostlen);
                        lua_settable (L, -3);
                }