[Minor] Fix parsing of some bogus urls

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
diff --git a/src/libserver/url.c b/src/libserver/url.c

index eb663519df5fedf74d8be9dafb587cb29cc90737..8a33b49157799f52c74fe55dc9c2823df52b40a7 100644 (file)
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1113,10 +1113,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
  
                         if (t != '/' && t != '\\') {
                                 c = p;
-                               st = parse_domain_start;
                                 slash = p;
+                               st = parse_domain_start;
+
+                               /*
+                                * Unfortunately, due to brain damage of the RFC 3986 authors,
+                                * we have to distinguish two possibilities here:
+                                * authority = [ userinfo "@" ] host [ ":" port ]
+                                * So if there is an @ somewhere before the hostname, we must parse
+                                * via the username state. Otherwise, we have to parse via
+                                * the hostname state. Unfortunately, there is no way to distinguish
+                                * them aside from running an NFA, two DFAs, or performing a lookahead.
+                                * The lookahead approach looks easier to implement.
+                                */
+
+                               const char *tp = p;
+                               while (tp < last) {
+                                       if (*tp == '@') {
+                                               user_seen = TRUE;
+                                               st = parse_user;
+                                               break;
+                                       }
+                                       else if (*tp == '/' || *tp == '#' || *tp == '?') {
+                                               st = parse_domain_start;
+                                       }
+
+                                       tp ++;
+                               }
  
-                               if (*p == '[') {
+                               if (st == parse_domain_start && *p == '[') {
                                         st = parse_ipv6;
                                         p++;
                                         c = p;
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua

index 97eda91c6ad129691e2b9d62b35bd8588cb223a0..40d684bfc37638d9a41bc291951768e68b4e3aca 100644 (file)
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -133,6 +133,9 @@ context("URL check functions", function()
      {"http://hehe｡example。com#test", true, {
        host = 'hehe.example.com', fragment = 'test'
      }},
+    {"http:////$%^&****((@example.org//#f@f", true, {
+      user = '$%^&****((', host = 'example.org', fragment = 'f@f'
+    }},
    }
  
    -- Some cases from https://code.google.com/p/google-url/source/browse/trunk/src/url_canon_unittest.cc
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 12 May 2021 13:39:09 +0000 (14:39 +0100)
src/libserver/url.c		patch \| blob \| blame \| history
test/lua/unit/url.lua		patch \| blob \| blame \| history