#include "html_colors.h"
#include "url.h"
#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 46
#include <unicode/uidna.h>
#endif
*statep = state;
}
+
+
struct rspamd_url *
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
struct html_tag_component *comp)
}
*d = '\0';
+ dlen = d - decoded;
url = rspamd_mempool_alloc0 (pool, sizeof (*url));
- rc = rspamd_url_parse (url, decoded, d - decoded, pool);
+
+ if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
+ url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
+
+ rc = rspamd_url_parse (url, decoded, dlen, pool);
if (rc == URI_ERRNO_OK) {
if (has_bad_chars) {
gchar *p, *comp;
const gchar *end;
guint i, complen, ret, flags = 0;
- gsize unquoted_len = 0;
+ guint unquoted_len = 0;
memset (uri, 0, sizeof (*uri));
memset (&u, 0, sizeof (u));
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->datalen) {
unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_PATH);
/* We now normalize path */
rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
unquoted_len = rspamd_url_decode (uri->query,
uri->query,
uri->querylen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
if (uri->fragmentlen) {
unquoted_len = rspamd_url_decode (uri->fragment,
uri->fragment,
uri->fragmentlen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
}
RSPAMD_URL_FLAG_HAS_PORT = 1 << 13,
RSPAMD_URL_FLAG_HAS_USER = 1 << 14,
RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15,
+ RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16,
};
struct rspamd_url_tag {
http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
if (u.field_set & (1 << UF_PATH)) {
+ guint unnorm_len;
lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
lookup.len = u.field_data[UF_PATH].len;
rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
lookup.len,
- &lookup.len);
+ &unnorm_len);
+ lookup.len = unnorm_len;
}
else {
lookup.begin = msg->url->str;
void
-rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
{
const gchar *p, *end, *slash = NULL, *dot = NULL;
gchar *o;
* @param len
* @param nlen
*/
-void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
#endif /* HTTP_H_ */
#include "cryptobox.h"
#include "url.h"
#include "str_util.h"
+#include "logger.h"
#include "contrib/t1ha/t1ha.h"
+#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
#include <math.h>
const guchar lc_map[256] = {
return NULL;
}
+
+/**
+ * Normalises a UTF-8 string to NFKC form, rewriting it in place.
+ *
+ * The input is converted to UTF-16, checked with the ICU quick-check and,
+ * if needed, normalised and converted back into the caller's buffer. The
+ * buffer is only overwritten when the normalised UTF-8 form fits into the
+ * original *len bytes; otherwise the string and *len are left untouched.
+ *
+ * @param pool memory pool, referenced by the logging macros
+ * @param start string to normalise (modified in place on success)
+ * @param len in: length of the string in bytes; out: length of the
+ * normalised string when it has been rewritten
+ * @return TRUE if the input was not already in NFKC form (even when the
+ * subsequent rewrite failed), FALSE otherwise
+ */
+gboolean
+rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
+		guint *len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	static UConverter *utf8_conv = NULL;
+	static const UNormalizer2 *norm = NULL;
+	gint32 nsym, end, dest_len, out_len;
+	UChar *src = NULL, *dest = NULL;
+	gboolean ret = FALSE;
+
+	if (utf8_conv == NULL) {
+		utf8_conv = ucnv_open ("UTF-8", &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+		norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+	}
+
+	/*
+	 * ICU takes lengths as int32_t: an empty string is trivially normalised
+	 * and an oversized one cannot be represented (a negative length would be
+	 * read as "NUL-terminated").
+	 */
+	if (*len == 0 || *len >= (guint)G_MAXINT32) {
+		return FALSE;
+	}
+
+	/* We first need to convert data to UChars :( */
+	src = g_malloc (((gsize)*len + 1) * sizeof (*src));
+	nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
+			start, *len, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/* Find the length of the prefix that is already normalised */
+	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	if (end == nsym) {
+		/* No normalisation needed */
+		goto out;
+	}
+
+	/* We copy sub(src, 0, end) to dest and normalise the rest */
+	ret = TRUE;
+	dest = g_malloc ((gsize)nsym * sizeof (*dest));
+	memcpy (dest, src, (gsize)end * sizeof (*dest));
+	dest_len = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+			src + end, nsym - end, &uc_err);
+
+	if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+		/*
+		 * NFKC can expand the UTF-16 form (e.g. compatibility ligatures)
+		 * without expanding the UTF-8 form. ICU has returned the required
+		 * capacity, so retry once from a fresh copy of the prefix.
+		 */
+		gint32 needed = dest_len;
+
+		uc_err = U_ZERO_ERROR;
+		g_free (dest);
+		dest = g_malloc ((gsize)needed * sizeof (*dest));
+		memcpy (dest, src, (gsize)end * sizeof (*dest));
+		dest_len = unorm2_normalizeSecondAndAppend (norm, dest, end, needed,
+				src + end, nsym - end, &uc_err);
+	}
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/*
+	 * Preflight the UTF-8 length first: converting straight into `start`
+	 * would leave it filled with truncated output if the result is longer
+	 * than the original buffer.
+	 */
+	out_len = ucnv_fromUChars (utf8_conv, NULL, 0, dest, dest_len, &uc_err);
+
+	if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+		/* Expected outcome of a zero-capacity preflight call */
+		uc_err = U_ZERO_ERROR;
+	}
+
+	if (U_SUCCESS (uc_err) && (guint)out_len > *len) {
+		/* Normalised form does not fit in place; keep the input intact */
+		uc_err = U_BUFFER_OVERFLOW_ERROR;
+	}
+
+	if (U_SUCCESS (uc_err)) {
+		/* We now convert it back to utf */
+		out_len = ucnv_fromUChars (utf8_conv, start, *len, dest, dest_len,
+				&uc_err);
+	}
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	*len = out_len;
+out:
+	/* g_free accepts NULL, so no guards are needed */
+	g_free (src);
+	g_free (dest);
+
+	return ret;
+}
return FALSE;
}
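+/**
+ * Gets a string in UTF8 and normalises it in place to NFKC form
+ * @param pool optional memory pool used for logging purposes
+ * @param start string to normalise; it is rewritten in place
+ * @param len in: length of the string in bytes; out: length after normalisation
+ * @return TRUE if the string was not already in normalised form
+ */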
+gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+ gchar *start, guint *len);
+
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */