]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add url encoding function
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Jan 2017 13:27:45 +0000 (13:27 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Jan 2017 13:27:45 +0000 (13:27 +0000)
src/libserver/task.c
src/libserver/url.c
src/libserver/url.h
src/libutil/http.c
src/libutil/str_util.c
src/libutil/str_util.h
src/lua/lua_util.c

index 75c44f21afbed3b80aeeea0fcff3975e6245c6c5..f02665afd33d22b39bad1fee674bae3a06a81fd0 100644 (file)
@@ -326,7 +326,7 @@ rspamd_task_load_message (struct rspamd_task *task,
                r = rspamd_strlcpy (filepath, tok->begin,
                                MIN (sizeof (filepath), tok->len + 1));
 
-               rspamd_decode_url (filepath, filepath, r + 1);
+               rspamd_url_decode (filepath, filepath, r + 1);
                flen = strlen (filepath);
 
                if (filepath[0] == '"' && flen > 2) {
@@ -424,7 +424,7 @@ rspamd_task_load_message (struct rspamd_task *task,
                r = rspamd_strlcpy (filepath, tok->begin,
                                MIN (sizeof (filepath), tok->len + 1));
 
-               rspamd_decode_url (filepath, filepath, r + 1);
+               rspamd_url_decode (filepath, filepath, r + 1);
                flen = strlen (filepath);
 
                if (filepath[0] == '"' && flen > 2) {
index 4252d5ac1b80f4e7959a9dacfcb7e7de12bb89b5..4c7e643e798834b177d304ca909228a80cbdede2 100644 (file)
@@ -1569,28 +1569,28 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        uri->string = p;
        uri->urllen = len;
 
-       unquoted_len = rspamd_decode_url (uri->string,
+       unquoted_len = rspamd_url_decode (uri->string,
                        uri->string,
                        uri->protocollen);
        rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
-       unquoted_len = rspamd_decode_url (uri->host, uri->host, uri->hostlen);
+       unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
        rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
        if (uri->datalen) {
-               unquoted_len = rspamd_decode_url (uri->data, uri->data, uri->datalen);
+               unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
                /* We now normalize path */
                rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
        }
        if (uri->querylen) {
-               unquoted_len = rspamd_decode_url (uri->query,
+               unquoted_len = rspamd_url_decode (uri->query,
                                uri->query,
                                uri->querylen);
                rspamd_url_shift (uri, unquoted_len, UF_QUERY);
        }
        if (uri->fragmentlen) {
-               unquoted_len = rspamd_decode_url (uri->fragment,
+               unquoted_len = rspamd_url_decode (uri->fragment,
                                uri->fragment,
                                uri->fragmentlen);
                rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
@@ -2569,3 +2569,233 @@ rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
 
        DL_APPEND (found, ntag);
 }
+
+/**
+ * Hash callback for `struct rspamd_url` objects.
+ *
+ * Feeds the raw URL text (`url->string`, `url->urllen` bytes) and then the
+ * `url->flags` field into the fast hash seeded by rspamd_hash_seed ().
+ * The flags take part in the hash, matching rspamd_urls_cmp (), which treats
+ * identical text with different flags as distinct entries.
+ *
+ * @param u pointer to a `struct rspamd_url`
+ * @return value returned by rspamd_cryptobox_fast_hash_final ()
+ */
+guint
+rspamd_url_hash (gconstpointer u)
+{
+       const struct rspamd_url *url = u;
+       rspamd_cryptobox_fast_hash_state_t st;
+
+       rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
+
+       /* Empty URLs contribute no text; the flags are still hashed below */
+       if (url->urllen > 0) {
+               rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen);
+       }
+
+       rspamd_cryptobox_fast_hash_update (&st, &url->flags, sizeof (url->flags));
+
+       return rspamd_cryptobox_fast_hash_final (&st);
+}
+
+/**
+ * Equality callback for email addresses stored as `struct rspamd_url`.
+ *
+ * Two entries are equal only when both the host and the user parts are
+ * non-empty, have equal lengths, and rspamd_lc_cmp () returns 0 for each pair
+ * (presumably a case-insensitive comparison -- confirm in str_util).
+ *
+ * @param a first `struct rspamd_url`
+ * @param b second `struct rspamd_url`
+ * @return TRUE if both addresses match, FALSE otherwise
+ */
+gboolean
+rspamd_emails_cmp (gconstpointer a, gconstpointer b)
+{
+       const struct rspamd_url *u1 = a, *u2 = b;
+       gint r;
+
+       /* Entries without a host never compare equal, even to themselves */
+       if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
+               return FALSE;
+       }
+       else {
+               if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) {
+                       /* Hosts match: the user parts decide the result */
+                       if (u1->userlen != u2->userlen || u1->userlen == 0) {
+                               return FALSE;
+                       }
+                       else {
+                               return rspamd_lc_cmp (u1->user, u2->user, u1->userlen) ==
+                                               0;
+                       }
+               }
+               else {
+                       /* r != 0 on this path, so this always yields FALSE */
+                       return r == 0;
+               }
+       }
+
+       /* Not reached: every branch above returns */
+       return FALSE;
+}
+
+/**
+ * Equality callback for URLs stored as `struct rspamd_url`.
+ *
+ * URLs are equal when their text has the same length, the bytes are
+ * identical (memcmp, so the comparison is case-sensitive) and the `flags`
+ * fields match.
+ *
+ * @param a first `struct rspamd_url`
+ * @param b second `struct rspamd_url`
+ * @return TRUE if the URLs are equal, FALSE otherwise
+ */
+gboolean
+rspamd_urls_cmp (gconstpointer a, gconstpointer b)
+{
+       const struct rspamd_url *u1 = a, *u2 = b;
+       int r;
+
+       if (u1->urllen != u2->urllen) {
+               return FALSE;
+       }
+       else {
+               r = memcmp (u1->string, u2->string, u1->urllen);
+               if (r == 0 && u1->flags != u2->flags) {
+                       /*
+                        * Same text but different flags: report the entries as
+                        * distinct so that phished urls are always inserted
+                        */
+                       return FALSE;
+               }
+       }
+
+       return r == 0;
+}
+
+/**
+ * Decode a URL-encoded (percent-encoded) buffer.
+ *
+ * Reads exactly `size` bytes from `src` and writes the decoded bytes to `dst`:
+ *  - "%XX" (two hex digits, either case) becomes the byte 0xXX;
+ *  - '+' becomes a space;
+ *  - any other byte is copied unchanged.
+ *
+ * Malformed escapes are handled leniently: if the byte after '%' is not a hex
+ * digit, the '%' is dropped and that byte is copied as is; if the second byte
+ * is not a hex digit, the whole three-byte sequence is dropped. An escape cut
+ * short by the end of input produces no output.
+ *
+ * Each input byte yields at most one output byte, so `dst` may be the same
+ * buffer as `src` (rspamd_url_parse () decodes in place this way). No
+ * terminating NUL is appended.
+ *
+ * @param dst output buffer with room for at least `size` bytes
+ * @param src input bytes
+ * @param size number of bytes to read from `src`
+ * @return number of bytes written to `dst`
+ */
+gsize
+rspamd_url_decode (gchar *dst, const gchar *src, gsize size)
+{
+       gchar *d, ch, c, decoded;
+       const gchar *s;
+       enum {
+               sw_usual = 0,
+               sw_quoted,
+               sw_quoted_second
+       } state;
+
+       d = dst;
+       s = src;
+
+       /* 0 is sw_usual */
+       state = 0;
+       decoded = 0;
+
+       while (size--) {
+
+               ch = *s++;
+
+               switch (state) {
+               case sw_usual:
+                       /* Outside of an escape sequence */
+
+                       if (ch == '%') {
+                               state = sw_quoted;
+                               break;
+                       }
+                       else if (ch == '+') {
+                               *d++ = ' ';
+                       }
+                       else {
+                               *d++ = ch;
+                       }
+                       break;
+
+               case sw_quoted:
+                       /* First hex digit after '%': keep its value in `decoded` */
+
+                       if (ch >= '0' && ch <= '9') {
+                               decoded = (ch - '0');
+                               state = sw_quoted_second;
+                               break;
+                       }
+
+                       /* Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' */
+                       c = (ch | 0x20);
+                       if (c >= 'a' && c <= 'f') {
+                               decoded = (c - 'a' + 10);
+                               state = sw_quoted_second;
+                               break;
+                       }
+
+                       /* the invalid quoted character */
+
+                       state = sw_usual;
+
+                       /* The '%' is dropped; the offending byte is copied literally */
+                       *d++ = ch;
+
+                       break;
+
+               case sw_quoted_second:
+                       /* Second hex digit: combine with `decoded` into one byte */
+
+                       state = sw_usual;
+
+                       if (ch >= '0' && ch <= '9') {
+                               ch = ((decoded << 4) + ch - '0');
+                               *d++ = ch;
+
+                               break;
+                       }
+
+                       c = (u_char) (ch | 0x20);
+                       if (c >= 'a' && c <= 'f') {
+                               ch = ((decoded << 4) + c - 'a' + 10);
+
+                               *d++ = ch;
+                               break;
+                       }
+
+                       /* the invalid quoted character */
+                       /* Nothing is written: '%', the first digit and this byte are lost */
+                       break;
+               }
+       }
+
+       return (d - dst);
+}
+
+/*
+ * Helpers for rspamd_url_encode (). Both macros rely on locals of that
+ * function: `i` (loop index), `dlen` (number of extra bytes required),
+ * `d`/`dend` (output cursor and end of the output buffer) and `hexdigests`.
+ */
+
+/* Count two extra bytes for every byte that has to be written as %XX */
+#define CHECK_URL_COMPONENT(beg, len) do { \
+       for (i = 0; i < (len); i ++) { \
+               if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { \
+                       dlen += 2; \
+               } \
+       } \
+} while (0)
+
+/*
+ * Copy a component to the output, percent-encoding unsafe bytes.
+ * Stops early instead of writing past `dend`: an escape needs 3 bytes of room.
+ */
+#define ENCODE_URL_COMPONENT(beg, len) do { \
+       for (i = 0; i < (len) && dend > d; i ++) { \
+               if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { \
+                       if (dend - d < 3) { \
+                               break; \
+                       } \
+                       *d++ = '%'; \
+                       *d++ = hexdigests[((beg)[i] >> 4) & 0xf]; \
+                       *d++ = hexdigests[(beg)[i] & 0xf]; \
+               } \
+               else { \
+                       *d++ = (beg)[i]; \
+               } \
+       } \
+} while (0)
+
+/**
+ * Return a percent-encoded representation of a parsed URL.
+ *
+ * The host, user, path (`data`), query and fragment components are scanned
+ * for bytes that are non-ASCII or rejected by is_urlsafe (). If there are
+ * none, `url->string` is returned unchanged and nothing is allocated.
+ * Otherwise a new buffer is allocated from `pool` and the URL is rebuilt as
+ * scheme "://" [user "@"] host ["/" path] ["?" query] ["#" fragment]
+ * with every unsafe byte written as %XX (lowercase hex digits).
+ *
+ * The buffer size is `url->urllen` plus two bytes per escaped byte; this
+ * assumes `urllen` already accounts for the scheme and the delimiters
+ * (TODO confirm against rspamd_url_parse). Output that would not fit is
+ * truncated rather than written out of bounds.
+ *
+ * The encoded result is not explicitly NUL-terminated; use `*pdlen`.
+ *
+ * @param url parsed url, must not be NULL
+ * @param pdlen output: length of the returned string, must not be NULL
+ * @param pool memory pool used when encoding is required, must not be NULL
+ * @return `url->string` if nothing needs encoding, otherwise the new buffer
+ */
+const gchar *
+rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
+               rspamd_mempool_t *pool)
+{
+       guchar *dest, *d, *dend;
+       static const gchar hexdigests[16] = "0123456789abcdef";
+       guint i;
+       gsize dlen = 0;
+
+       g_assert (pdlen != NULL && url != NULL && pool != NULL);
+
+       /* First pass: count the extra bytes required by the escapes */
+       CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen);
+       CHECK_URL_COMPONENT ((guchar *)url->user, url->userlen);
+       CHECK_URL_COMPONENT ((guchar *)url->data, url->datalen);
+       CHECK_URL_COMPONENT ((guchar *)url->query, url->querylen);
+       CHECK_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen);
+
+       if (dlen == 0) {
+               /* Nothing to escape: hand back the original string */
+               *pdlen = url->urllen;
+
+               return url->string;
+       }
+
+       /* Need to encode */
+       dlen += url->urllen;
+       dest = rspamd_mempool_alloc (pool, dlen + 1);
+       d = dest;
+       dend = d + dlen;
+       d += rspamd_snprintf ((gchar *)d, dend - d,
+                       "%*s://", url->protocollen, url->protocol);
+
+       if (url->userlen > 0) {
+               ENCODE_URL_COMPONENT ((guchar *)url->user, url->userlen);
+               /* Userinfo is separated from the host by '@' (RFC 3986, 3.2) */
+               if (d < dend) {
+                       *d++ = '@';
+               }
+       }
+
+       ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen);
+
+       if (url->datalen > 0) {
+               if (d < dend) {
+                       *d++ = '/';
+               }
+               ENCODE_URL_COMPONENT ((guchar *)url->data, url->datalen);
+       }
+
+       if (url->querylen > 0) {
+               /* The query starts with '?' (RFC 3986, 3.4) */
+               if (d < dend) {
+                       *d++ = '?';
+               }
+               ENCODE_URL_COMPONENT ((guchar *)url->query, url->querylen);
+       }
+
+       if (url->fragmentlen > 0) {
+               /* The fragment starts with '#' (RFC 3986, 3.5) */
+               if (d < dend) {
+                       *d++ = '#';
+               }
+               ENCODE_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen);
+       }
+
+       *pdlen = (d - dest);
+
+       return (const gchar *)dest;
+}
index dbe3eb00b21a45350ef662ce8f52a9896cc0bf30..f56649558b45b33c7acb3dddb3d0de1fe9bb42a3 100644 (file)
@@ -177,4 +177,31 @@ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
                const gchar *value,
                rspamd_mempool_t *pool);
 
+/**
+ * Hash callback for `struct rspamd_url` objects: hashes the url text and flags
+ * @param u pointer to a `struct rspamd_url`
+ * @return hash value
+ */
+guint rspamd_url_hash (gconstpointer u);
+
+/* Compare two emails for building emails hash */
+gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
+
+/* Compare two urls for building urls hash */
+gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
+
+/**
+ * Decode a URL-encoded (percent-encoded) buffer and return the decoded length.
+ * `dst` may be the same buffer as `src` (in-place decoding). Exactly `size`
+ * bytes of `src` are processed and no terminating NUL is appended to `dst`.
+ * @param dst output buffer with room for at least `size` bytes
+ * @param src input bytes
+ * @param size number of bytes to read from `src`
+ * @return number of bytes written to `dst`
+ */
+gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
+
+/**
+ * Encode url if needed. In this case, memory is allocated from the specific pool.
+ * Returns pointer to begin and encoded length in `dlen`. If nothing has to be
+ * encoded, the original url string is returned and no memory is allocated.
+ * @param url parsed url
+ * @param dlen output: length of the returned string
+ * @param pool memory pool used for the encoded copy
+ * @return pointer to the (possibly newly allocated) url string
+ */
+const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
+               rspamd_mempool_t *pool);
+
+
 #endif
index eec53b515b2e76e51baf2ceb47b500bb919e9f96..9a33b1a90b5c6ad938a2196c4db0b5219ac6cc74 100644 (file)
@@ -27,6 +27,7 @@
 #include "unix-std.h"
 #include "libutil/ssl_util.h"
 #include "libutil/regexp.h"
+#include "libserver/url.h"
 
 #define ENCRYPTED_VERSION " HTTP/1.0"
 
@@ -3376,7 +3377,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
                                                /* We have a single parameter without a value */
                                                key = rspamd_fstring_new_init (c, p - c);
                                                key_tok = rspamd_ftok_map (key);
-                                               key_tok->len = rspamd_decode_url (key->str, key->str,
+                                               key_tok->len = rspamd_url_decode (key->str, key->str,
                                                                key->len);
 
                                                value = rspamd_fstring_new_init ("", 0);
@@ -3389,7 +3390,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
                                                /* We have something like key=value */
                                                key = rspamd_fstring_new_init (c, p - c);
                                                key_tok = rspamd_ftok_map (key);
-                                               key_tok->len = rspamd_decode_url (key->str, key->str,
+                                               key_tok->len = rspamd_url_decode (key->str, key->str,
                                                                key->len);
 
                                                state = parse_eqsign;
@@ -3415,7 +3416,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
                                                if (p > c) {
                                                        value = rspamd_fstring_new_init (c, p - c);
                                                        value_tok = rspamd_ftok_map (value);
-                                                       value_tok->len = rspamd_decode_url (value->str,
+                                                       value_tok->len = rspamd_url_decode (value->str,
                                                                        value->str,
                                                                        value->len);
                                                        /* Detect quotes for value */
index 3b3dc06b700b2bc16f9fda18e424c2253777de45..10f5d54e39b0ea119ae8842dcd5f238f274490d2 100644 (file)
@@ -897,91 +897,7 @@ rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
        return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how);
 }
 
-gsize
-rspamd_decode_url (gchar *dst, const gchar *src, gsize size)
-{
-       gchar *d, ch, c, decoded;
-       const gchar *s;
-       enum {
-               sw_usual = 0,
-               sw_quoted,
-               sw_quoted_second
-       } state;
-
-       d = dst;
-       s = src;
-
-       state = 0;
-       decoded = 0;
-
-       while (size--) {
-
-               ch = *s++;
-
-               switch (state) {
-               case sw_usual:
-
-                       if (ch == '%') {
-                               state = sw_quoted;
-                               break;
-                       }
-                       else if (ch == '+') {
-                               *d++ = ' ';
-                       }
-                       else {
-                               *d++ = ch;
-                       }
-                       break;
-
-               case sw_quoted:
-
-                       if (ch >= '0' && ch <= '9') {
-                               decoded = (ch - '0');
-                               state = sw_quoted_second;
-                               break;
-                       }
-
-                       c = (ch | 0x20);
-                       if (c >= 'a' && c <= 'f') {
-                               decoded = (c - 'a' + 10);
-                               state = sw_quoted_second;
-                               break;
-                       }
-
-                       /* the invalid quoted character */
-
-                       state = sw_usual;
-
-                       *d++ = ch;
-
-                       break;
-
-               case sw_quoted_second:
-
-                       state = sw_usual;
-
-                       if (ch >= '0' && ch <= '9') {
-                               ch = ((decoded << 4) + ch - '0');
-                               *d++ = ch;
-
-                               break;
-                       }
-
-                       c = (u_char) (ch | 0x20);
-                       if (c >= 'a' && c <= 'f') {
-                               ch = ((decoded << 4) + c - 'a' + 10);
-
-                               *d++ = ch;
-                               break;
-                       }
-
-                       /* the invalid quoted character */
-                       break;
-               }
-       }
 
-       return (d - dst);
-}
 #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
 
 gint
@@ -2143,71 +2059,6 @@ rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
        ucl_object_emit_full (obj, emit_type, &func, comments);
 }
 
-guint
-rspamd_url_hash (gconstpointer u)
-{
-       const struct rspamd_url *url = u;
-       rspamd_cryptobox_fast_hash_state_t st;
-
-       rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
-
-       if (url->urllen > 0) {
-               rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen);
-       }
-
-       rspamd_cryptobox_fast_hash_update (&st, &url->flags, sizeof (url->flags));
-
-       return rspamd_cryptobox_fast_hash_final (&st);
-}
-
-/* Compare two emails for building emails tree */
-gboolean
-rspamd_emails_cmp (gconstpointer a, gconstpointer b)
-{
-       const struct rspamd_url *u1 = a, *u2 = b;
-       gint r;
-
-       if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
-               return FALSE;
-       }
-       else {
-               if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) {
-                       if (u1->userlen != u2->userlen || u1->userlen == 0) {
-                               return FALSE;
-                       }
-                       else {
-                               return rspamd_lc_cmp (u1->user, u2->user, u1->userlen) ==
-                                               0;
-                       }
-               }
-               else {
-                       return r == 0;
-               }
-       }
-
-       return FALSE;
-}
-
-gboolean
-rspamd_urls_cmp (gconstpointer a, gconstpointer b)
-{
-       const struct rspamd_url *u1 = a, *u2 = b;
-       int r;
-
-       if (u1->urllen != u2->urllen) {
-               return FALSE;
-       }
-       else {
-               r = memcmp (u1->string, u2->string, u1->urllen);
-               if (r == 0 && u1->flags != u2->flags) {
-                       /* Always insert phished urls to the tree */
-                       return FALSE;
-               }
-       }
-
-       return r == 0;
-}
-
 const void *
 rspamd_memrchr (const void *m, gint c, gsize len)
 {
index 941d141b4c3f5e946b91db7dc28cfc0cd876e776..ea3d97278f594f542293e3edcab7791836b511f3 100644 (file)
@@ -204,15 +204,6 @@ gchar * rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
 gchar * rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
                gsize *outlen, enum rspamd_newlines_type how);
 
-/**
- * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
- * @param dst
- * @param src
- * @param size
- * @return
- */
-gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size);
-
 /**
  * Decode quoted-printable encoded buffer, input and output must not overlap
  * @param in input
@@ -343,14 +334,6 @@ void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
                rspamd_fstring_t **target,
                const ucl_object_t *comments);
 
-guint rspamd_url_hash (gconstpointer u);
-
-/* Compare two emails for building emails hash */
-gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
-
-/* Compare two urls for building emails hash */
-gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
-
 extern const guchar lc_map[256];
 
 /**
index 95471601bdefded48869ad39ed5581ffcedb7c51..b1bfdce2833535af2e39c25a714e2b0f5b773bcf 100644 (file)
@@ -857,7 +857,7 @@ lua_util_decode_url (lua_State *L)
                rspamd_lua_setclass (L, "rspamd{text}", -1);
                t->start = g_malloc (inlen);
                memcpy ((char *)t->start, s, inlen);
-               t->len = rspamd_decode_url ((char *)t->start, s, inlen);
+               t->len = rspamd_url_decode ((char *)t->start, s, inlen);
                t->flags = RSPAMD_TEXT_FLAG_OWN;
        }
        else {