git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Fix normalisation flags propagation
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 11 May 2021 14:13:15 +0000 (15:13 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 11 May 2021 14:13:15 +0000 (15:13 +0100)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h

index 4cb46445f85892cfac21bb257fe121e6893739c2..c373bb115c9bbecc3c9d59c813121d5fe54a9852 100644 (file)
@@ -1593,21 +1593,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 
        url = rspamd_mempool_alloc0 (pool, sizeof (*url));
 
-       enum rspamd_normalise_result norm_res;
-
-       norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
-
-       if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
-               saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-       }
-
-       if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
-               saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
-
-               if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
-                       saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
-               }
-       }
+       rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
 
        rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
 
@@ -2644,6 +2630,9 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
        if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
                saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
        }
+       if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
+               saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
+       }
 
        rspamd_html_url_is_phished (pool, url,
                        url->visible_part,
index d36704e73e0be40d373de1ebfa2541e719e73c07..eb663519df5fedf74d8be9dafb587cb29cc90737 100644 (file)
@@ -1339,7 +1339,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                                if (!u_isalnum (uc)) {
                                                        /* Bad symbol */
                                                        if (IS_ZERO_WIDTH_SPACE (uc)) {
-                                                               (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
+                                                               (*flags) |= RSPAMD_URL_FLAG_OBSCURED|RSPAMD_URL_FLAG_ZW_SPACES;
                                                        }
                                                        else {
                                                                if (!u_isgraph (uc)) {
@@ -2308,10 +2308,8 @@ rspamd_url_parse (struct rspamd_url *uri,
        unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri),
                        rspamd_url_host_unsafe (uri), uri->hostlen);
 
-       if (rspamd_normalise_unicode_inplace (pool,
-                       rspamd_url_host_unsafe (uri), &unquoted_len)) {
-               uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-       }
+       rspamd_url_normalise_propagate_flags (pool, rspamd_url_host_unsafe (uri),
+                       &unquoted_len, uri->flags);
 
        rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
@@ -2380,10 +2378,10 @@ rspamd_url_parse (struct rspamd_url *uri,
        if (uri->datalen) {
                unquoted_len = rspamd_url_decode (rspamd_url_data_unsafe (uri),
                                rspamd_url_data_unsafe (uri), uri->datalen);
-               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_data_unsafe (uri),
-                               &unquoted_len)) {
-                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-               }
+
+               rspamd_url_normalise_propagate_flags (pool, rspamd_url_data_unsafe (uri),
+                               &unquoted_len, uri->flags);
+
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
                /* We now normalize path */
                rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri),
@@ -2395,10 +2393,9 @@ rspamd_url_parse (struct rspamd_url *uri,
                unquoted_len = rspamd_url_decode (rspamd_url_query_unsafe (uri),
                                rspamd_url_query_unsafe (uri),
                                uri->querylen);
-               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_query_unsafe (uri),
-                               &unquoted_len)) {
-                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-               }
+
+               rspamd_url_normalise_propagate_flags (pool, rspamd_url_query_unsafe (uri),
+                               &unquoted_len, uri->flags);
                rspamd_url_shift (uri, unquoted_len, UF_QUERY);
        }
 
@@ -2406,10 +2403,9 @@ rspamd_url_parse (struct rspamd_url *uri,
                unquoted_len = rspamd_url_decode (rspamd_url_fragment_unsafe (uri),
                                rspamd_url_fragment_unsafe (uri),
                                uri->fragmentlen);
-               if (rspamd_normalise_unicode_inplace (pool, rspamd_url_fragment_unsafe (uri),
-                               &unquoted_len)) {
-                       uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-               }
+
+               rspamd_url_normalise_propagate_flags (pool, rspamd_url_fragment_unsafe (uri),
+                               &unquoted_len, uri->flags);
                rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
        }
 
index 249c316e4240f7525443055d2e57b841c1c9b2c7..72fce5f9ea9c44ee2783fa922de7d3373532d9a2 100644 (file)
@@ -127,9 +127,9 @@ enum rspamd_url_find_type {
  * Initialize url library
  * @param cfg
  */
-void rspamd_url_init (const gchar *tld_file);
+void rspamd_url_init(const gchar *tld_file);
 
-void rspamd_url_deinit (void);
+void rspamd_url_deinit(void);
 
 /*
  * Parse urls inside text
@@ -138,10 +138,10 @@ void rspamd_url_deinit (void);
  * @param part current text part
  * @param is_html turn on html euristic
  */
-void rspamd_url_text_extract (rspamd_mempool_t *pool,
-                                                         struct rspamd_task *task,
-                                                         struct rspamd_mime_text_part *part,
-                                                         enum rspamd_url_find_type how);
+void rspamd_url_text_extract(rspamd_mempool_t *pool,
+                                                        struct rspamd_task *task,
+                                                        struct rspamd_mime_text_part *part,
+                                                        enum rspamd_url_find_type how);
 
 /*
  * Parse a single url into an uri structure
@@ -149,11 +149,11 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
  * @param uristring text form of url
  * @param uri url object, must be pre allocated
  */
-enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
-                                                                gchar *uristring,
-                                                                gsize len,
-                                                                rspamd_mempool_t *pool,
-                                                                enum rspamd_url_parse_flags flags);
+enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
+                                                               gchar *uristring,
+                                                               gsize len,
+                                                               rspamd_mempool_t *pool,
+                                                               enum rspamd_url_parse_flags flags);
 
 /*
  * Try to extract url from a text
@@ -165,17 +165,17 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
  * @param url_str storage for url string(or NULL)
  * @return TRUE if url is found in specified text
  */
-gboolean rspamd_url_find (rspamd_mempool_t *pool,
-                                                 const gchar *begin, gsize len,
-                                                 gchar **url_str,
-                                                 enum rspamd_url_find_type how,
-                                                 goffset *url_pos,
-                                                 gboolean *prefix_added);
+gboolean rspamd_url_find(rspamd_mempool_t *pool,
+                                                const gchar *begin, gsize len,
+                                                gchar **url_str,
+                                                enum rspamd_url_find_type how,
+                                                goffset *url_pos,
+                                                gboolean *prefix_added);
 
 /*
  * Return text representation of url parsing error
  */
-const gchar *rspamd_url_strerror (int err);
+const gchar *rspamd_url_strerror(int err);
 
 
 /**
@@ -185,10 +185,10 @@ const gchar *rspamd_url_strerror (int err);
  * @param out output rspamd_ftok_t with tld position
  * @return TRUE if tld has been found
  */
-gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
+gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out);
 
-typedef gboolean (*url_insert_function) (struct rspamd_url *url,
-                                                                        gsize start_offset, gsize end_offset, void *ud);
+typedef gboolean (*url_insert_function)(struct rspamd_url *url,
+                                                                               gsize start_offset, gsize end_offset, void *ud);
 
 /**
  * Search for multiple urls in text and call `func` for each url found
@@ -199,12 +199,12 @@ typedef gboolean (*url_insert_function) (struct rspamd_url *url,
  * @param func
  * @param ud
  */
-void rspamd_url_find_multiple (rspamd_mempool_t *pool,
-                                                          const gchar *in, gsize inlen,
-                                                          enum rspamd_url_find_type how,
-                                                          GPtrArray *nlines,
-                                                          url_insert_function func,
-                                                          gpointer ud);
+void rspamd_url_find_multiple(rspamd_mempool_t *pool,
+                                                         const gchar *in, gsize inlen,
+                                                         enum rspamd_url_find_type how,
+                                                         GPtrArray *nlines,
+                                                         url_insert_function func,
+                                                         gpointer ud);
 
 /**
  * Search for a single url in text and call `func` for each url found
@@ -215,11 +215,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool,
  * @param func
  * @param ud
  */
-void rspamd_url_find_single (rspamd_mempool_t *pool,
-                                                        const gchar *in, gsize inlen,
-                                                        enum rspamd_url_find_type how,
-                                                        url_insert_function func,
-                                                        gpointer ud);
+void rspamd_url_find_single(rspamd_mempool_t *pool,
+                                                       const gchar *in, gsize inlen,
+                                                       enum rspamd_url_find_type how,
+                                                       url_insert_function func,
+                                                       gpointer ud);
 
 /**
  * Generic callback to insert URLs into rspamd_task
@@ -228,9 +228,9 @@ void rspamd_url_find_single (rspamd_mempool_t *pool,
  * @param end_offset
  * @param ud
  */
-gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
-                                                                          gsize start_offset,
-                                                                          gsize end_offset, gpointer ud);
+gboolean rspamd_url_task_subject_callback(struct rspamd_url *url,
+                                                                                 gsize start_offset,
+                                                                                 gsize end_offset, gpointer ud);
 
 /**
  * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
@@ -239,7 +239,7 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
  * @param size
  * @return
  */
-gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
+gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size);
 
 /**
  * Encode url if needed. In this case, memory is allocated from the specific pool.
@@ -248,8 +248,8 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
  * @param pool
  * @return
  */
-const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
-                                                               rspamd_mempool_t *pool);
+const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen,
+                                                          rspamd_mempool_t *pool);
 
 
 /**
@@ -257,14 +257,14 @@ const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
  * @param c
  * @return
  */
-gboolean rspamd_url_is_domain (int c);
+gboolean rspamd_url_is_domain(int c);
 
 /**
  * Returns symbolic name for protocol
  * @param proto
  * @return
  */
-const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
+const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto);
 
 
 /**
@@ -272,7 +272,7 @@ const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
  * @param str
  * @return
  */
-enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
+enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str);
 
 /**
  * Converts string to a url flag
@@ -280,14 +280,14 @@ enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
  * @param flag
  * @return
  */
-bool rspamd_url_flag_from_string (const gchar *str, gint *flag);
+bool rspamd_url_flag_from_string(const gchar *str, gint *flag);
 
 /**
  * Converts url flag to a string
  * @param flag
  * @return
  */
-const gchar * rspamd_url_flag_to_string (int flag);
+const gchar *rspamd_url_flag_to_string(int flag);
 
 /* Defines sets of urls indexed by url as is */
 KHASH_DECLARE (rspamd_url_hash, struct rspamd_url *, char);
@@ -310,24 +310,25 @@ bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
  * @param u
  * @return
  */
-struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
-                                                                                                 struct rspamd_url *u);
+struct rspamd_url *rspamd_url_set_add_or_return(khash_t (rspamd_url_hash) *set,
+                                                                                               struct rspamd_url *u);
 /**
  * Helper for url host set
  * @param set
  * @param u
  * @return
  */
-bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
-                                                                        struct rspamd_url *u);
+bool rspamd_url_host_set_add(khash_t (rspamd_url_host_hash) *set,
+                                                        struct rspamd_url *u);
 /**
  * Checks if a url is in set
  * @param set
  * @param u
  * @return
  */
-bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
-bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
+bool rspamd_url_set_has(khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
+
+bool rspamd_url_host_set_has(khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
 
 /**
  * Compares two urls (similar to C comparison functions) lexicographically
@@ -335,15 +336,37 @@ bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd
  * @param u2
  * @return
  */
-int rspamd_url_cmp (const struct rspamd_url *u1, const struct rspamd_url *u2);
+int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2);
+
 /**
  * Same but used for qsort to sort `struct rspamd_url *[]` array
  * @param u1
  * @param u2
  * @return
  */
-int rspamd_url_cmp_qsort (const void *u1, const void *u2);
+int rspamd_url_cmp_qsort(const void *u1, const void *u2);
 
+/**
+ * Normalize unicode input and set out url flags as appropriate
+ * @param pool
+ * @param input
+ * @param len_out (must be &var)
+ * @param url_flags_out (must be just a var with no dereference)
+ */
+#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
+  do {                                                                            \
+     enum rspamd_normalise_result norm_res;                                       \
+     norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out));     \
+     if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {                               \
+       url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED;                             \
+     }                                                                            \
+     if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {                            \
+       url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES;                                \
+     }                                                                            \
+     if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) {                                \
+       url_flags_out |= RSPAMD_URL_FLAG_OBSCURED;                                 \
+     }                                                                            \
+  } while(0)
 #ifdef  __cplusplus
 }
 #endif