From: Vsevolod Stakhov
Date: Tue, 26 Nov 2013 16:10:37 +0000 (+0000)
Subject: Reduce false positive rate in urls detection.
X-Git-Tag: 0.6.0~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cbfa72954e882cfa8ad233d2b2d791526bc85f73;p=thirdparty%2Frspamd.git

Reduce false positive rate in urls detection.
---

diff --git a/src/url.c b/src/url.c
index e858a20d5e..927b618892 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1201,7 +1201,7 @@ url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match
 
 	/* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */
 	while (p >= begin) {
-		if ((!is_domain (*p) && *p != '.') || g_ascii_isspace (*p)) {
+		if ((!is_domain (*p) && *p != '.' && *p != '/') || g_ascii_isspace (*p)) {
 			p ++;
 			if (!g_ascii_isalnum (*p)) {
 				/* Urls cannot start with strange symbols */
@@ -1224,6 +1224,10 @@ url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match
 				return FALSE;
 			}
 		}
+		else if (*p == '/') {
+			/* Urls cannot contain '/' in their body */
+			return FALSE;
+		}
 		p --;
 	}
 
@@ -1235,9 +1239,9 @@ url_tld_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t
 {
 	const gchar *p;
 
-	/* A url must be finished by tld, so it must be followed by punctuation or by space character */
+	/* A url must be finished by tld, so it must be followed by space character */
 	p = pos + strlen (match->pattern);
-	if (p == end || g_ascii_isspace (*p) || g_ascii_ispunct (*p)) {
+	if (p == end || g_ascii_isspace (*p) || *p == ',') {
 		match->m_len = p - match->m_begin;
 		return TRUE;
 	}
@@ -1356,10 +1360,13 @@ domain:
 	}
 
 	if (!passwd && (port >= 65536 || *p == '@')) {
-		if (p < end) {
+		if (p < end && *p == '@') {
 			/* this must be a password? */
 			goto passwd;
 		}
+		else if (p < end) {
+			return FALSE;
+		}
 		p--;
 	}
 