git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Fix parsing of some bogus urls
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
src/libserver/url.c
test/lua/unit/url.lua

index eb663519df5fedf74d8be9dafb587cb29cc90737..8a33b49157799f52c74fe55dc9c2823df52b40a7 100644 (file)
@@ -1113,10 +1113,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
                        if (t != '/' && t != '\\') {
                                c = p;
-                               st = parse_domain_start;
                                slash = p;
+                               st = parse_domain_start;
+
+                               /*
+                                * Unfortunately, due to the way RFC 3986 defines the authority
+                                * component, we have to distinguish two possibilities here:
+                                *   authority = [ userinfo "@" ] host [ ":" port ]
+                                * If there is an '@' somewhere before the hostname, we must
+                                * continue with the username state; otherwise, we must continue
+                                * with the hostname state. There is no way to tell the two apart
+                                * other than running an NFA, running two DFAs, or performing a
+                                * lookahead. The lookahead approach is the easiest to implement.
+                                *
+                                * NOTE(review): the scan below only stops early on '@'; it does
+                                * not stop at the first '/', '#' or '?', so an '@' that appears
+                                * later (up to `last`) also selects the username state.
+                                */
+
+                               const char *tp = p;
+                               while (tp < last) {
+                                       if (*tp == '@') {
+                                               user_seen = TRUE;
+                                               st = parse_user;
+                                               break;
+                                       }
+                                       else if (*tp == '/' || *tp == '#' || *tp == '?') {
+                                               st = parse_domain_start;
+                                       }
+
+                                       tp ++;
+                               }
 
-                               if (*p == '[') {
+                               if (st == parse_domain_start && *p == '[') {
                                        st = parse_ipv6;
                                        p++;
                                        c = p;
index 97eda91c6ad129691e2b9d62b35bd8588cb223a0..40d684bfc37638d9a41bc291951768e68b4e3aca 100644 (file)
@@ -133,6 +133,9 @@ context("URL check functions", function()
     {"http://hehe。example。com#test", true, {
       host = 'hehe.example.com', fragment = 'test'
     }},
+    {"http:////$%^&****((@example.org//#f@f", true, {
+      user = '$%^&****((', host = 'example.org', fragment = 'f@f'
+    }},
   }
 
   -- Some cases from https://code.google.com/p/google-url/source/browse/trunk/src/url_canon_unittest.cc