]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Rework URL structure: more structure optimisations
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 6 Mar 2020 13:14:41 +0000 (13:14 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Mar 2020 10:46:11 +0000 (10:46 +0000)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h
src/lua/lua_url.c

index 7dca724536695a3d8e73e0391716b74dbaf069e2..e1a211d2ce4e4997f954060028cf7caf1b8b8af4 100644 (file)
@@ -1631,7 +1631,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
 
        if (url->querylen > 0) {
 
-               if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
+               if (rspamd_url_find (pool, rspamd_url_query_unsafe (url), url->querylen, &url_str,
                                RSPAMD_URL_FIND_ALL,
                                NULL, &prefix_added)) {
                        query_url = rspamd_mempool_alloc0 (pool,
@@ -1646,7 +1646,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
                        if (rc == URI_ERRNO_OK &&
                                        query_url->hostlen > 0) {
                                msg_debug_html ("found url %s in query of url"
-                                               " %*s", url_str, url->querylen, url->query);
+                                               " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
 
                                if (query_url->protocol == PROTOCOL_MAILTO) {
                                        target_tbl = tbl_emails;
index ac4c119168f4e62a29bdcf4fc101df286f79a745..7e85a460e6cd3dba52b0564e84d964a55f591ebb 100644 (file)
@@ -1573,6 +1573,7 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
                rspamd_mempool_t *pool)
 {
        gchar *strbuf, *p;
+       const gchar *start_offset;
        gsize slen = uri->urllen - uri->hostlen;
        goffset r = 0;
 
@@ -1589,39 +1590,46 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
                        (gint)(uri->hostshift),
                        uri->string);
        uri->hostshift = r;
+       start_offset = strbuf + r;
        inet_ntop (af, addr, strbuf + r, slen - r + 1);
-       uri->hostlen = strlen (rspamd_url_host_unsafe (uri));
+       uri->hostlen = strlen (start_offset);
        r += uri->hostlen;
-       uri->tld = rspamd_url_host_unsafe (uri);
+       uri->tld = (const gchar *)start_offset;
        uri->tldlen = uri->hostlen;
        uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
 
        /* Reconstruct URL */
        if (uri->datalen > 0) {
-               p = strbuf + r + 1;
+               p = strbuf + r;
+               start_offset = p + 1;
                r += rspamd_snprintf (strbuf + r, slen - r, "/%*s",
                                (gint)uri->datalen,
-                               uri->data);
-               uri->data = p;
+                               rspamd_url_data_unsafe (uri));
+               uri->datashift = start_offset - strbuf;
        }
        else {
                /* Add trailing slash if needed */
-               r += rspamd_snprintf (strbuf + r, slen - r, "/");
+               if (uri->hostlen + uri->hostshift < uri->urllen &&
+                       *(rspamd_url_host_unsafe (uri) + uri->hostlen) == '/') {
+                       r += rspamd_snprintf (strbuf + r, slen - r, "/");
+               }
        }
 
        if (uri->querylen > 0) {
-               p = strbuf + r + 1;
+               p = strbuf + r;
+               start_offset = p + 1;
                r += rspamd_snprintf (strbuf + r, slen - r, "?%*s",
                                (gint)uri->querylen,
-                               uri->query);
-               uri->query = p;
+                               rspamd_url_query_unsafe (uri));
+               uri->queryshift = start_offset - strbuf;
        }
        if (uri->fragmentlen > 0) {
-               p = strbuf + r + 1;
+               p = strbuf + r;
+               start_offset = p + 1;
                r += rspamd_snprintf (strbuf + r, slen - r, "#%*s",
                                (gint)uri->fragmentlen,
-                               uri->fragment);
-               uri->fragment = p;
+                               rspamd_url_fragment_unsafe (uri));
+               uri->fragmentshift = start_offset - strbuf;
        }
 
        uri->string = strbuf;
@@ -1832,9 +1840,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 
                old_shift = uri->datalen;
                uri->datalen -= shift;
-               remain = (uri->urllen - (uri->data - uri->string)) - old_shift;
+               remain = (uri->urllen - (uri->datashift)) - old_shift;
                g_assert (remain >= 0);
-               memmove (uri->data + uri->datalen, uri->data + old_shift,
+               memmove (rspamd_url_data_unsafe (uri) + uri->datalen,
+                               rspamd_url_data_unsafe (uri) + old_shift,
                                remain);
                uri->urllen -= shift;
                uri->flags |= RSPAMD_URL_FLAG_PATHENCODED;
@@ -1849,9 +1858,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 
                old_shift = uri->querylen;
                uri->querylen -= shift;
-               remain = (uri->urllen - (uri->query - uri->string)) - old_shift;
+               remain = (uri->urllen - (uri->queryshift)) - old_shift;
                g_assert (remain >= 0);
-               memmove (uri->query + uri->querylen, uri->query + old_shift,
+               memmove (rspamd_url_query_unsafe (uri) + uri->querylen,
+                               rspamd_url_query_unsafe (uri) + old_shift,
                                remain);
                uri->urllen -= shift;
                uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED;
@@ -1881,21 +1891,25 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
                        uri->hostshift -= shift;
                }
                /* Go forward */
+               /* FALLTHRU */
        case UF_HOST:
                if (uri->datalen > 0) {
-                       uri->data -= shift;
+                       uri->datashift -= shift;
                }
                /* Go forward */
+               /* FALLTHRU */
        case UF_PATH:
                if (uri->querylen > 0) {
-                       uri->query -= shift;
+                       uri->queryshift -= shift;
                }
                /* Go forward */
+               /* FALLTHRU */
        case UF_QUERY:
                if (uri->fragmentlen > 0) {
-                       uri->fragment -= shift;
+                       uri->fragmentshift -= shift;
                }
                /* Go forward */
+               /* FALLTHRU */
        case UF_FRAGMENT:
        default:
                break;
@@ -1943,7 +1957,7 @@ rspamd_url_parse (struct rspamd_url *uri,
                                  enum rspamd_url_parse_flags parse_flags)
 {
        struct http_parser_url u;
-       gchar *p, *comp;
+       gchar *p;
        const gchar *end;
        guint i, complen, ret, flags = 0;
        guint unquoted_len = 0;
@@ -2015,31 +2029,36 @@ rspamd_url_parse (struct rspamd_url *uri,
 
        for (i = 0; i < UF_MAX; i++) {
                if (u.field_set & (1 << i)) {
-                       comp = uri->string + u.field_data[i].off;
+                       guint shift = u.field_data[i].off;
                        complen = u.field_data[i].len;
 
+                       if (complen >= G_MAXUINT16) {
+                               /* Too large component length */
+                               return URI_ERRNO_BAD_FORMAT;
+                       }
+
                        switch (i) {
                        case UF_SCHEMA:
                                uri->protocollen = u.field_data[i].len;
                                break;
                        case UF_HOST:
-                               uri->hostshift = u.field_data[i].off;
+                               uri->hostshift = shift;
                                uri->hostlen = complen;
                                break;
                        case UF_PATH:
-                               uri->data = comp;
+                               uri->datashift = shift;
                                uri->datalen = complen;
                                break;
                        case UF_QUERY:
-                               uri->query = comp;
+                               uri->queryshift = shift;
                                uri->querylen = complen;
                                break;
                        case UF_FRAGMENT:
-                               uri->fragment = comp;
+                               uri->fragmentshift = shift;
                                uri->fragmentlen = complen;
                                break;
                        case UF_USERINFO:
-                               uri->usershift = u.field_data[i].off;
+                               uri->usershift = shift;
                                uri->userlen = complen;
                                break;
                        default:
@@ -2129,31 +2148,36 @@ rspamd_url_parse (struct rspamd_url *uri,
 
        /* Process data part */
        if (uri->datalen) {
-               unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
-               if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+               unquoted_len = rspamd_url_decode (rspamd_url_data_unsafe (uri),
+                               rspamd_url_data_unsafe (uri), uri->datalen);
+               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_data_unsafe (uri),
+                               &unquoted_len)) {
                        uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
                }
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
                /* We now normalize path */
-               rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
+               rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri),
+                               uri->datalen, &unquoted_len);
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
        }
 
        if (uri->querylen) {
-               unquoted_len = rspamd_url_decode (uri->query,
-                               uri->query,
+               unquoted_len = rspamd_url_decode (rspamd_url_query_unsafe (uri),
+                               rspamd_url_query_unsafe (uri),
                                uri->querylen);
-               if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_query_unsafe (uri),
+                               &unquoted_len)) {
                        uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
                }
                rspamd_url_shift (uri, unquoted_len, UF_QUERY);
        }
 
        if (uri->fragmentlen) {
-               unquoted_len = rspamd_url_decode (uri->fragment,
-                               uri->fragment,
+               unquoted_len = rspamd_url_decode (rspamd_url_fragment_unsafe (uri),
+                               rspamd_url_fragment_unsafe (uri),
                                uri->fragmentlen);
-               if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_fragment_unsafe (uri),
+                               &unquoted_len)) {
                        uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
                }
                rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
@@ -3148,7 +3172,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
-               if (rspamd_url_find (task->task_pool, url->query, url->querylen,
+               if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
                                &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
                        query_url = rspamd_mempool_alloc0 (task->task_pool,
                                        sizeof (struct rspamd_url));
@@ -3161,7 +3185,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                        if (rc == URI_ERRNO_OK &&
                                        query_url->hostlen > 0) {
                                msg_debug_task ("found url %s in query of url"
-                                               " %*s", url_str, url->querylen, url->query);
+                                               " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
 
                                if (prefix_added) {
                                        query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
@@ -3314,7 +3338,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
 
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
-               if (rspamd_url_find (task->task_pool, url->query, url->querylen,
+               if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
                                &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
 
                        query_url = rspamd_mempool_alloc0 (task->task_pool,
@@ -3328,7 +3352,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
                        if (rc == URI_ERRNO_OK &&
                                        url->hostlen > 0) {
                                msg_debug_task ("found url %s in query of url"
-                                               " %*s", url_str, url->querylen, url->query);
+                                               " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
 
                                if (prefix_added) {
                                        query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
@@ -3651,11 +3675,11 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
                        RSPAMD_URL_FLAGS_HOSTSAFE);
        CHECK_URL_COMPONENT (rspamd_url_user_unsafe(url), url->userlen,
                        RSPAMD_URL_FLAGS_USERSAFE);
-       CHECK_URL_COMPONENT ((guchar *)url->data, url->datalen,
+       CHECK_URL_COMPONENT (rspamd_url_data_unsafe (url), url->datalen,
                        RSPAMD_URL_FLAGS_PATHSAFE);
-       CHECK_URL_COMPONENT ((guchar *)url->query, url->querylen,
+       CHECK_URL_COMPONENT (rspamd_url_query_unsafe (url), url->querylen,
                        RSPAMD_URL_FLAGS_QUERYSAFE);
-       CHECK_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen,
+       CHECK_URL_COMPONENT (rspamd_url_fragment_unsafe (url), url->fragmentlen,
                        RSPAMD_URL_FLAGS_FRAGMENTSAFE);
 
        if (dlen == 0) {
@@ -3698,19 +3722,19 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
 
        if (url->datalen > 0) {
                *d++ = '/';
-               ENCODE_URL_COMPONENT ((guchar *)url->data, url->datalen,
+               ENCODE_URL_COMPONENT (rspamd_url_data_unsafe (url), url->datalen,
                                RSPAMD_URL_FLAGS_PATHSAFE);
        }
 
        if (url->querylen > 0) {
                *d++ = '?';
-               ENCODE_URL_COMPONENT ((guchar *)url->query, url->querylen,
+               ENCODE_URL_COMPONENT (rspamd_url_query_unsafe (url), url->querylen,
                                RSPAMD_URL_FLAGS_QUERYSAFE);
        }
 
        if (url->fragmentlen > 0) {
                *d++ = '#';
-               ENCODE_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen,
+               ENCODE_URL_COMPONENT (rspamd_url_fragment_unsafe (url), url->fragmentlen,
                                RSPAMD_URL_FLAGS_FRAGMENTSAFE);
        }
 
index 080f005c3be54b2593013fecdcea21bc0f74779e..87766c4e638ed7aea0b8d838498531ce076fef7d 100644 (file)
@@ -44,33 +44,34 @@ struct rspamd_url_tag {
 struct rspamd_url {
        gchar *raw;
        gchar *string;
-       guint protocol;
-       guint port;
 
-       guint usershift;
-       guint userlen;
+       guint16 protocol;
+       guint16 port;
 
+       guint usershift;
        guint hostshift;
-       guint hostlen;
+       guint datashift;
+       guint queryshift;
+       guint fragmentshift;
 
-       gchar *data;
-       gchar *query;
-       gchar *fragment;
        gchar *tld;
        gchar *visible_part;
 
        struct rspamd_url *phished_url;
 
-       guint protocollen;
-       guint datalen;
-       guint querylen;
-       guint fragmentlen;
-       guint tldlen;
        guint urllen;
        guint rawlen;
+       guint32 flags;
 
-       enum rspamd_url_flags flags;
-       guint count;
+       guint16 protocollen;
+       guint16 userlen;
+       guint16 hostlen;
+       guint16 datalen;
+       guint16 querylen;
+       guint16 fragmentlen;
+       guint16 tldlen;
+
+       guint16 count;
 };
 
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
@@ -79,6 +80,10 @@ struct rspamd_url {
 #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
 #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
 
+#define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift)
+#define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift)
+#define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift)
+
 enum uri_errno {
        URI_ERRNO_OK = 0,           /* Parsing went well */
        URI_ERRNO_EMPTY,        /* The URI string was empty */
@@ -97,7 +102,7 @@ enum rspamd_url_protocol {
        PROTOCOL_HTTPS = 1u << 3u,
        PROTOCOL_MAILTO = 1u << 4u,
        PROTOCOL_TELEPHONE = 1u << 5u,
-       PROTOCOL_UNKNOWN = 1u << 31u,
+       PROTOCOL_UNKNOWN = 1u << 15u,
 };
 
 enum rspamd_url_parse_flags {
index bd94120e2f8166a24d5e2c5a0e594fee04e264b1..cb54a694c090dde57f57a183604c418bd76d7688 100644 (file)
@@ -220,7 +220,7 @@ lua_url_get_path (lua_State *L)
        struct rspamd_lua_url *url = lua_check_url (L, 1);
 
        if (url != NULL && url->url->datalen > 0) {
-               lua_pushlstring (L, url->url->data, url->url->datalen);
+               lua_pushlstring (L, rspamd_url_data_unsafe (url->url), url->url->datalen);
        }
        else {
                lua_pushnil (L);
@@ -241,7 +241,7 @@ lua_url_get_query (lua_State *L)
        struct rspamd_lua_url *url = lua_check_url (L, 1);
 
        if (url != NULL && url->url->querylen > 0) {
-               lua_pushlstring (L, url->url->query, url->url->querylen);
+               lua_pushlstring (L, rspamd_url_query_unsafe (url->url), url->url->querylen);
        }
        else {
                lua_pushnil (L);
@@ -262,7 +262,7 @@ lua_url_get_fragment (lua_State *L)
        struct rspamd_lua_url *url = lua_check_url (L, 1);
 
        if (url != NULL && url->url->fragmentlen > 0) {
-               lua_pushlstring (L, url->url->fragment, url->url->fragmentlen);
+               lua_pushlstring (L, rspamd_url_fragment_unsafe (url->url), url->url->fragmentlen);
        }
        else {
                lua_pushnil (L);
@@ -684,19 +684,19 @@ lua_url_to_table (lua_State *L)
 
                if (u->datalen > 0) {
                        lua_pushstring (L, "path");
-                       lua_pushlstring (L, u->data, u->datalen);
+                       lua_pushlstring (L, rspamd_url_data_unsafe (u), u->datalen);
                        lua_settable (L, -3);
                }
 
                if (u->querylen > 0) {
                        lua_pushstring (L, "query");
-                       lua_pushlstring (L, u->query, u->querylen);
+                       lua_pushlstring (L, rspamd_url_query_unsafe (u), u->querylen);
                        lua_settable (L, -3);
                }
 
                if (u->fragmentlen > 0) {
                        lua_pushstring (L, "fragment");
-                       lua_pushlstring (L, u->fragment, u->fragmentlen);
+                       lua_pushlstring (L, rspamd_url_fragment_unsafe (u), u->fragmentlen);
                        lua_settable (L, -3);
                }