git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Rework URL structure: adjust tld part
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 6 Mar 2020 14:03:20 +0000 (14:03 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Mar 2020 10:46:11 +0000 (10:46 +0000)
src/libserver/html.c
src/libserver/protocol.c
src/libserver/url.c
src/libserver/url.h
src/libstat/tokenizers/tokenizers.c
src/lua/lua_url.c

index e1a211d2ce4e4997f954060028cf7caf1b8b8af4..981141ad8dddf39fe8a658921ec5935e9fc354c6 100644 (file)
@@ -704,14 +704,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 
                                /* Apply the same logic for TLD */
                                disp_tok.len = text_url->tldlen;
-                               disp_tok.begin = text_url->tld;
+                               disp_tok.begin = rspamd_url_tld_unsafe (text_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-                               if (rspamd_substring_search_caseless (text_url->tld,
+                               if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (text_url),
                                                text_url->tldlen, "xn--", 4) != -1) {
                                        idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
                                        /* We need to convert it to the normal value first */
                                        disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
-                                                       text_url->tld, text_url->tldlen,
+                                                       rspamd_url_tld_unsafe (text_url), text_url->tldlen,
                                                        idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
 
                                        if (uc_err != U_ZERO_ERROR) {
@@ -725,14 +725,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
                                }
 #endif
                                href_tok.len = href_url->tldlen;
-                               href_tok.begin = href_url->tld;
+                               href_tok.begin = rspamd_url_tld_unsafe (href_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-                               if (rspamd_substring_search_caseless (href_url->tld,
+                               if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (href_url),
                                                href_url->tldlen, "xn--", 4) != -1) {
                                        idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
                                        /* We need to convert it to the normal value first */
                                        href_tok.len = uidna_nameToUnicodeUTF8 (udn,
-                                                       href_url->tld, href_url->tldlen,
+                                                       rspamd_url_tld_unsafe (href_url), href_url->tldlen,
                                                        idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
 
                                        if (uc_err != U_ZERO_ERROR) {
index 16dc05491daba17612b93e4310ac20dfc63cc89b..739d3b950448ade345a8a6e1a2c3033d798bc5fa 100644 (file)
@@ -878,11 +878,13 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
        ucl_object_insert_key (obj, elt, "url", 0, false);
 
        if (url->tldlen > 0) {
-               elt = ucl_object_fromstring_common (url->tld, url->tldlen, 0);
+               elt = ucl_object_fromstring_common (rspamd_url_tld_unsafe (url),
+                               url->tldlen, 0);
                ucl_object_insert_key (obj, elt, "tld", 0, false);
        }
        if (url->hostlen > 0) {
-               elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url), url->hostlen, 0);
+               elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+                               url->hostlen, 0);
                ucl_object_insert_key (obj, elt, "host", 0, false);
        }
 
index 7e85a460e6cd3dba52b0564e84d964a55f591ebb..043f523f0650213e41f09863843c52e0a3b2411c 100644 (file)
@@ -1561,7 +1561,7 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 
        if ((ndots == 0 || p == start - 1) &&
                        url->tldlen < rspamd_url_host_unsafe (url) + url->hostlen - pos) {
-               url->tld = (gchar *) pos;
+               url->tldshift = (pos - url->string);
                url->tldlen = rspamd_url_host_unsafe (url) + url->hostlen - pos;
        }
 
@@ -1590,11 +1590,11 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
                        (gint)(uri->hostshift),
                        uri->string);
        uri->hostshift = r;
+       uri->tldshift = r;
        start_offset = strbuf + r;
        inet_ntop (af, addr, strbuf + r, slen - r + 1);
        uri->hostlen = strlen (start_offset);
        r += uri->hostlen;
-       uri->tld = (const gchar *)start_offset;
        uri->tldlen = uri->hostlen;
        uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
 
@@ -2214,7 +2214,7 @@ rspamd_url_parse (struct rspamd_url *uri,
                        } else {
                                if (!rspamd_url_is_ip (uri, pool)) {
                                        /* Assume tld equal to host */
-                                       uri->tld = rspamd_url_host_unsafe (uri);
+                                       uri->tldshift = uri->hostshift;
                                        uri->tldlen = uri->hostlen;
                                }
                        }
@@ -2241,11 +2241,11 @@ rspamd_url_parse (struct rspamd_url *uri,
                rspamd_telephone_normalise_inplace (uri);
 
                if (rspamd_url_host_unsafe (uri)[0] == '+') {
-                       uri->tld = rspamd_url_host_unsafe (uri) + 1;
+                       uri->tldshift = uri->hostshift + 1;
                        uri->tldlen = uri->hostlen - 1;
                }
                else {
-                       uri->tld = rspamd_url_host_unsafe (uri);
+                       uri->tldshift = uri->hostshift;
                        uri->tldlen = uri->hostlen;
                }
        }
index 87766c4e638ed7aea0b8d838498531ce076fef7d..00f09ac30362f625cd5c1b6f70c9e838b549de6f 100644 (file)
@@ -53,15 +53,7 @@ struct rspamd_url {
        guint datashift;
        guint queryshift;
        guint fragmentshift;
-
-       gchar *tld;
-       gchar *visible_part;
-
-       struct rspamd_url *phished_url;
-
-       guint urllen;
-       guint rawlen;
-       guint32 flags;
+       guint tldshift;
 
        guint16 protocollen;
        guint16 userlen;
@@ -70,8 +62,14 @@ struct rspamd_url {
        guint16 querylen;
        guint16 fragmentlen;
        guint16 tldlen;
-
        guint16 count;
+
+       guint urllen;
+       guint rawlen;
+       guint32 flags;
+
+       gchar *visible_part;
+       struct rspamd_url *phished_url;
 };
 
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
@@ -79,6 +77,7 @@ struct rspamd_url {
 
 #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
 #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
+#define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift)
 
 #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift)
 #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift)
index 77a924f418c1b0afc7296a82e44cddd0732cf52a..9f1b14daf738372a026e7c3b7978c8a72afb1909 100644 (file)
@@ -262,7 +262,7 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
                uri = ex->ptr;
 
                if (uri && uri->tldlen > 0) {
-                       token.original.begin = uri->tld;
+                       token.original.begin = rspamd_url_tld_unsafe (uri);
                        token.original.len = uri->tldlen;
 
                }
index cb54a694c090dde57f57a183604c418bd76d7688..efd34dc6c74231f73e40b6ba76ea2d55b1eb7a21 100644 (file)
@@ -560,7 +560,7 @@ lua_url_get_tld (lua_State *L)
        struct rspamd_lua_url *url = lua_check_url (L, 1);
 
        if (url != NULL && url->url->tldlen > 0) {
-               lua_pushlstring (L, url->url->tld, url->url->tldlen);
+               lua_pushlstring (L, rspamd_url_tld_unsafe (url->url), url->url->tldlen);
        }
        else {
                lua_pushnil (L);
@@ -672,7 +672,7 @@ lua_url_to_table (lua_State *L)
 
                if (u->tldlen > 0) {
                        lua_pushstring (L, "tld");
-                       lua_pushlstring (L, u->tld, u->tldlen);
+                       lua_pushlstring (L, rspamd_url_tld_unsafe (u), u->tldlen);
                        lua_settable (L, -3);
                }