src/anyp/Uri.cc

   1 /*
   2  * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "anyp/Host.h"
  13 #include "anyp/Uri.h"
  14 #include "base/Raw.h"
  15 #include "globals.h"
  16 #include "HttpRequest.h"
  17 #include "parser/Tokenizer.h"
  18 #include "rfc1738.h"
  19 #include "SquidConfig.h"
  20 #include "SquidMath.h"
  21
  22 static const char valid_hostname_chars_u[] =
  23     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  24     "abcdefghijklmnopqrstuvwxyz"
  25     "0123456789-._"
  26     "[:]"
  27     ;
  28 static const char valid_hostname_chars[] =
  29     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  30     "abcdefghijklmnopqrstuvwxyz"
  31     "0123456789-."
  32     "[:]"
  33     ;
  34
  35 /// Characters which are valid within a URI userinfo section
  36 static const CharacterSet &
  37 UserInfoChars()
  38 {
  39     /*
  40      * RFC 3986 section 3.2.1
  41      *
  42      *  userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
  43      *  unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
  44      *  pct-encoded   = "%" HEXDIG HEXDIG
  45      *  sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
  46      */
  47     static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
  48                                       CharacterSet::ALPHA +
  49                                       CharacterSet::DIGIT;
  50     return userInfoValid;
  51 }
  52
  53 /**
  54  * Governed by RFC 3986 section 2.1
  55  */
  56 SBuf
  57 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
  58 {
  59     if (buf.isEmpty())
  60         return buf;
  61
  62     Parser::Tokenizer tk(buf);
  63     SBuf goodSection;
  64     // optimization for the arguably common "no encoding necessary" case
  65     if (tk.prefix(goodSection, ignore) && tk.atEnd())
  66         return buf;
  67
  68     SBuf output;
  69     output.reserveSpace(buf.length() * 3); // worst case: encode all chars
  70     output.append(goodSection); // may be empty
  71
  72     while (!tk.atEnd()) {
  73         // TODO: Add Tokenizer::parseOne(void).
  74         const auto ch = tk.remaining()[0];
  75         output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
  76         (void)tk.skip(ch);
  77
  78         if (tk.prefix(goodSection, ignore))
  79             output.append(goodSection);
  80     }
  81
  82     return output;
  83 }
  84
  85 SBuf
  86 AnyP::Uri::Decode(const SBuf &buf)
  87 {
  88     SBuf output;
  89     Parser::Tokenizer tok(buf);
  90     while (!tok.atEnd()) {
  91         SBuf token;
  92         static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
  93         if (tok.prefix(token, unencodedChars))
  94             output.append(token);
  95
  96         // we are either at '%' or at end of input
  97         if (tok.skip('%')) {
  98             int64_t hex1 = 0, hex2 = 0;
  99             if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
 100                 output.append(static_cast<char>((hex1 << 4) | hex2));
 101             else
 102                 throw TextException("invalid pct-encoded triplet", Here());
 103         }
 104     }
 105     return output;
 106 }
 107
 108 const SBuf &
 109 AnyP::Uri::Asterisk()
 110 {
 111     static SBuf star("*");
 112     return star;
 113 }
 114
 115 const SBuf &
 116 AnyP::Uri::SlashPath()
 117 {
 118     static SBuf slash("/");
 119     return slash;
 120 }
 121
 122 void
 123 AnyP::Uri::host(const char *src)
 124 {
 125     hostAddr_.fromHost(src);
 126     if (hostAddr_.isAnyAddr()) {
 127         xstrncpy(host_, src, sizeof(host_));
 128         hostIsNumeric_ = false;
 129     } else {
 130         hostAddr_.toHostStr(host_, sizeof(host_));
 131         debugs(23, 3, "given IP: " << hostAddr_);
 132         hostIsNumeric_ = 1;
 133     }
 134     touch();
 135 }
 136
 137 // TODO: Replace with ToSBuf(parsedHost()) or similar.
 138 SBuf
 139 AnyP::Uri::hostOrIp() const
 140 {
 141     if (hostIsNumeric()) {
 142         static char ip[MAX_IPSTRLEN];
 143         const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
 144         return SBuf(ip, hostStrLen);
 145     } else
 146         return SBuf(host());
 147 }
 148
 149 std::optional<AnyP::Host>
 150 AnyP::Uri::parsedHost() const
 151 {
 152     if (hostIsNumeric())
 153         return Host::ParseIp(hostIP());
 154
 155     // XXX: Interpret host subcomponent as reg-name representing a DNS name. It
 156     // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but
 157     // current Squid APIs do not support adequate representation of those cases.
 158     const SBuf regName(host());
 159
 160     if (regName.find('%') != SBuf::npos) {
 161         debugs(23, 3, "rejecting percent-encoded reg-name: " << regName);
 162         return std::nullopt; // TODO: Decode() instead
 163     }
 164
 165     return Host::ParseSimpleDomainName(regName);
 166 }
 167
 168 const SBuf &
 169 AnyP::Uri::path() const
 170 {
 171     // RFC 3986 section 3.3 says path can be empty (path-abempty).
 172     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
 173     // at least when sending and using. We must still accept path-abempty as input.
 174     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
 175         return SlashPath();
 176
 177     return path_;
 178 }
 179
/// Runs assert()-based sanity checks of the protocol name table and of
/// matchDomainName(). Takes no input and, besides a debugging message,
/// produces no output; a failed expectation trips assert().
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    /* default flags: argument pairs expected to yield zero */
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
    /* default flags: argument pairs expected to yield a non-zero result */
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    /* the sign of a non-zero result and letter case handling */
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));

    /* expectations with the mdnRejectSubsubDomains flag */
    assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));

    /* expectations with the mdnHonorWildcards flag */
    assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
    assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));

    /* an empty second argument must not yield zero, whatever the flags */
    assert(0 != matchDomainName("foo.com", ""));
    assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
    assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));

    /* more cases? */
}
 227
 228 /**
 229  * Extract the URI scheme and ':' delimiter from the given input buffer.
 230  *
 231  * Schemes up to 16 characters are accepted.
 232  *
 233  * Governed by RFC 3986 section 3.1
 234  */
 235 static AnyP::UriScheme
 236 uriParseScheme(Parser::Tokenizer &tok)
 237 {
 238     /*
 239      * RFC 3986 section 3.1 paragraph 2:
 240      *
 241      * Scheme names consist of a sequence of characters beginning with a
 242      * letter and followed by any combination of letters, digits, plus
 243      * ("+"), period ("."), or hyphen ("-").
 244      */
 245     static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 246
 247     SBuf str;
 248     if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
 249         const auto protocol = AnyP::UriScheme::FindProtocolType(str);
 250         if (protocol == AnyP::PROTO_UNKNOWN)
 251             return AnyP::UriScheme(protocol, str.c_str());
 252         return AnyP::UriScheme(protocol, nullptr);
 253     }
 254
 255     throw TextException("invalid URI scheme", Here());
 256 }
 257
 258 /**
 259  * Appends configured append_domain to hostname, assuming
 260  * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
 261  * and that the host FQDN is not a 'dotless' TLD.
 262  *
 263  * \returns false if and only if there is not enough space to append
 264  */
 265 bool
 266 urlAppendDomain(char *host)
 267 {
 268     /* For IPv4 addresses check for a dot */
 269     /* For IPv6 addresses also check for a colon */
 270     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
 271         const uint64_t dlen = strlen(host);
 272         const uint64_t want = dlen + Config.appendDomainLen;
 273         if (want > SQUIDHOSTNAMELEN - 1) {
 274             debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
 275             return false;
 276         }
 277         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
 278     }
 279     return true;
 280 }
 281
/*
 * Parse a URI/URL.
 *
 * It is assumed that the URL is complete -
 * i.e., the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (i.e., it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * Returns true after storing the parsed scheme, path, host, user info, and
 * port in this object. Returns false when the input is rejected; exceptions
 * thrown while parsing are caught here, logged, and also reported as false.
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        // scratch buffers for the URI components extracted below
        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = nullptr;
        char *q = nullptr;
        int foundPort;
        int l; // rawUrl length at first; reused for other string lengths later
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // leave room for append_domain so that the scratch buffers cannot overflow
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // asterisk-form request target: "OPTIONS *" and "TRACE *"
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
            // port number of the tunnel destination, separated by a colon".

            const auto rawHost = parseHost(tok);
            Assure(rawHost.length() < sizeof(foundHost));
            SBufToCstring(foundHost, rawHost);

            if (!tok.skip(':'))
                throw TextException("missing required :port in CONNECT target", Here());
            foundPort = parsePort(tok);

            if (!tok.remaining().isEmpty())
                throw TextException("garbage after host:port in CONNECT target", Here());
        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            // from here on, the rest of the input is scanned as a C string
            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above stops incrementing i once i reaches
            // l, so this condition appears to be unreachable.
            if (i > l)
                return false;
            *dst = '\0';

            // We are looking at path-abempty.
            if (*src != '/') {
                // path-empty, including the end of the `src` c-string cases
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            // NOTE(review): same as above; i cannot exceed l after the loop.
            if (i > l)
                return false;
            *dst = '\0';

            // If the parsed scheme has no (known) default port, and there is no
            // explicit port, then we will reject the zero port during foundPort
            // validation, often resulting in a misleading 400/ERR_INVALID_URL.
            // TODO: Remove this hack when switching to Tokenizer-based parsing.
            foundPort = scheme.defaultPort().value_or(0); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // everything before the last '@' goes to login; the rest stays in foundHost
            t = strrchr(foundHost, '@');
            if (t != nullptr) {
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                l = strlen(foundHost);
                i = 1;
                // shift the bracketed text one character to the left, dropping '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one colon: treat the whole thing as an unbracketed
                // IPv6 address without a port
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = nullptr;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // t, if set, points at the ':' that starts the explicit port
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                // NOTE(review): atoi() stops at the first non-digit and reports
                // no errors, so text like "80x" yields 80. Out-of-range and zero
                // results are rejected by the foundPort range check below.
                foundPort = atoi(t);
            }
        }

        // the host is stored in lower case
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // with uri_whitespace set to strip, drop whitespace from the host in place
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional check_hostnames validation against the allowed character list
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

        // whitespace inside the path is handled according to uri_whitespace
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // cut the path at its first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // remove all whitespace characters from the path in place
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // all checks passed: commit the parsed components to this object
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        // any parsing exception means a rejected URI
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
 554
 555 /**
 556  * Governed by RFC 8141 section 2:
 557  *
 558  *  assigned-name = "urn" ":" NID ":" NSS
 559  *  NID           = (alphanum) 0*30(ldh) (alphanum)
 560  *  ldh           = alphanum / "-"
 561  *  NSS           = pchar *(pchar / "/")
 562  *
 563  * RFC 3986 Appendix D.2 defines (as deprecated):
 564  *
 565  *   alphanum     = ALPHA / DIGIT
 566  *
 567  * Notice that NID is exactly 2-32 characters in length.
 568  */
 569 void
 570 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
 571 {
 572     static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 573     static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
 574     SBuf nid;
 575     if (!tok.prefix(nid, nidChars, 32))
 576         throw TextException("NID not found", Here());
 577
 578     if (!tok.skip(':'))
 579         throw TextException("NID too long or missing ':' delimiter", Here());
 580
 581     if (nid.length() < 2)
 582         throw TextException("NID too short", Here());
 583
 584     if (!alphanum[*nid.begin()])
 585         throw TextException("NID prefix is not alphanumeric", Here());
 586
 587     if (!alphanum[*nid.rbegin()])
 588         throw TextException("NID suffix is not alphanumeric", Here());
 589
 590     setScheme(AnyP::PROTO_URN, nullptr);
 591     host(nid.c_str());
 592     // TODO validate path characters
 593     path(tok.remaining());
 594     debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
 595 }
 596
 597 /// Extracts and returns a (suspected but only partially validated) uri-host
 598 /// IPv6address, IPv4address, or reg-name component. This function uses (and
 599 /// quotes) RFC 3986, Section 3.2.2 syntax rules.
 600 SBuf
 601 AnyP::Uri::parseHost(Parser::Tokenizer &tok) const
 602 {
 603     // host = IP-literal / IPv4address / reg-name
 604
 605     // XXX: CharacterSets below reject uri-host values containing whitespace
 606     // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
 607     // can be interpreted as if it applies to uri-host and this code. TODO: Fix
 608     // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
 609     // port, etc.) from that directive scope.
 610
 611     // IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
 612     if (tok.skip('[')) {
 613         // Add "." because IPv6address in RFC 3986 includes ls32, which includes
 614         // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
 615         // This set rejects IPvFuture that needs a "v" character.
 616         static const CharacterSet IPv6chars = (
 617                 CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
 618         SBuf ipv6ish;
 619         if (!tok.prefix(ipv6ish, IPv6chars))
 620             throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
 621
 622         if (!tok.skip(']'))
 623             throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
 624
 625         // This rejects bracketed IPv4address and domain names because they lack ":".
 626         if (ipv6ish.find(':') == SBuf::npos)
 627             throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
 628
 629         // This rejects bracketed non-IP addresses that our caller would have
 630         // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
 631         Ip::Address ipv6check;
 632         if (!ipv6check.fromHost(ipv6ish.c_str()))
 633             throw TextException("malformed bracketed IPv6 address in uri-host", Here());
 634
 635         return ipv6ish;
 636     }
 637
 638     // no brackets implies we are looking at IPv4address or reg-name
 639
 640     // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&"
 641     // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the
 642     // non-CONNECT uri-host parsing code to use us.
 643
 644     SBuf otherHost; // IPv4address-ish or reg-name-ish;
 645     // ":" is not in TCHAR so we will stop before any port specification
 646     if (tok.prefix(otherHost, CharacterSet::TCHAR))
 647         return otherHost;
 648
 649     throw TextException("malformed IPv4 address or host name in uri-host", Here());
 650 }
 651
 652 /// Extracts and returns an RFC 3986 URI authority port value (with additional
 653 /// restrictions). The RFC defines port as a possibly empty sequence of decimal
 654 /// digits. We reject certain ports (that are syntactically valid from the RFC
 655 /// point of view) because we are worried that Squid and other traffic handlers
 656 /// may dangerously mishandle unusual (and virtually always bogus) port numbers.
 657 /// Rejected ports cannot be successfully used by Squid itself.
 658 int
 659 AnyP::Uri::parsePort(Parser::Tokenizer &tok) const
 660 {
 661     if (tok.skip('0'))
 662         throw TextException("zero or zero-prefixed port", Here());
 663
 664     int64_t rawPort = 0;
 665     if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
 666         throw TextException("malformed or missing port", Here());
 667
 668     Assure(rawPort > 0);
 669     constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
 670     constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
 671     static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
 672     if (Less(portMax, rawPort))
 673         throw TextException("huge port", Here());
 674
 675     // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
 676     // code to use us (so that foundPort "int" disappears or starts using Port).
 677     return NaturalCast<int>(rawPort);
 678 }
 679
 680 void
 681 AnyP::Uri::touch()
 682 {
 683     absolute_.clear();
 684     authorityHttp_.clear();
 685     authorityWithPort_.clear();
 686 }
 687
 688 SBuf &
 689 AnyP::Uri::authority(bool requirePort) const
 690 {
 691     if (authorityHttp_.isEmpty()) {
 692
 693         // both formats contain Host/IP
 694         authorityWithPort_.append(host());
 695         authorityHttp_ = authorityWithPort_;
 696
 697         if (port().has_value()) {
 698             authorityWithPort_.appendf(":%hu", *port());
 699             // authorityHttp_ only has :port for known non-default ports
 700             if (port() != getScheme().defaultPort())
 701                 authorityHttp_ = authorityWithPort_;
 702         }
 703         // else XXX: We made authorityWithPort_ that does not have a port.
 704         // TODO: Audit callers and refuse to give out broken authorityWithPort_.
 705     }
 706
 707     return requirePort ? authorityWithPort_ : authorityHttp_;
 708 }
 709
 710 SBuf &
 711 AnyP::Uri::absolute() const
 712 {
 713     if (absolute_.isEmpty()) {
 714         // TODO: most URL will be much shorter, avoid allocating this much
 715         absolute_.reserveCapacity(MAX_URL);
 716
 717         absolute_.append(getScheme().image());
 718         absolute_.append(":",1);
 719         if (getScheme() != AnyP::PROTO_URN) {
 720             absolute_.append("//", 2);
 721             const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
 722                                        getScheme() == AnyP::PROTO_UNKNOWN;
 723
 724             if (allowUserInfo && !userInfo().isEmpty()) {
 725                 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
 726                                                     .remove('%')
 727                                                     .rename("userinfo-reserved");
 728                 absolute_.append(Encode(userInfo(), uiChars));
 729                 absolute_.append("@", 1);
 730             }
 731             absolute_.append(authority());
 732         } else {
 733             absolute_.append(host());
 734             absolute_.append(":", 1);
 735         }
 736         absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
 737     }
 738
 739     return absolute_;
 740 }
 741
 742 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 743  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 744  *        and never copy the query-string part in the first place
 745  */
 746 char *
 747 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
 748 {
 749     LOCAL_ARRAY(char, buf, MAX_URL);
 750
 751     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
 752     buf[sizeof(buf)-1] = '\0';
 753
 754     // URN, CONNECT method, and non-stripped URIs can go straight out
 755     if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
 756         // strip anything AFTER a question-mark
 757         // leaving the '?' in place
 758         if (auto t = strchr(buf, '?')) {
 759             *(++t) = '\0';
 760         }
 761     }
 762
 763     if (stringHasCntl(buf))
 764         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 765
 766     return buf;
 767 }
 768
 769 /**
 770  * Yet another alternative to urlCanonical.
 771  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 772  * for use in error page outputs.
 773  * Luckily we can leverage the others instead of duplicating.
 774  */
 775 const char *
 776 urlCanonicalFakeHttps(const HttpRequest * request)
 777 {
 778     LOCAL_ARRAY(char, buf, MAX_URL);
 779
 780     // method CONNECT and port HTTPS
 781     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 782         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 783         return buf;
 784     }
 785
 786     // else do the normal complete canonical thing.
 787     return request->canonicalCleanUrl();
 788 }
 789
 790 /**
 791  * Test if a URL is a relative reference.
 792  *
 793  * Governed by RFC 3986 section 4.2
 794  *
 795  *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 796  *
 797  *  relative-part = "//" authority path-abempty
 798  *                / path-absolute
 799  *                / path-noscheme
 800  *                / path-empty
 801  */
 802 bool
 803 urlIsRelative(const char *url)
 804 {
 805     if (!url)
 806         return false; // no URL
 807
 808     /*
 809      * RFC 3986 section 5.2.3
 810      *
 811      * path          = path-abempty    ; begins with "/" or is empty
 812      *               / path-absolute   ; begins with "/" but not "//"
 813      *               / path-noscheme   ; begins with a non-colon segment
 814      *               / path-rootless   ; begins with a segment
 815      *               / path-empty      ; zero characters
 816      */
 817
 818     if (*url == '\0')
 819         return true; // path-empty
 820
 821     if (*url == '/') {
 822         // network-path reference (a.k.a. 'scheme-relative URI') or
 823         // path-absolute (a.k.a. 'absolute-path reference')
 824         return true;
 825     }
 826
 827     for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
 828         if (*p == ':')
 829             return false; // colon is forbidden in first segment
 830     }
 831
 832     return true; // path-noscheme, path-abempty, path-rootless
 833 }
 834
 835 void
 836 AnyP::Uri::addRelativePath(const char *relUrl)
 837 {
 838     // URN cannot be merged
 839     if (getScheme() == AnyP::PROTO_URN)
 840         return;
 841
 842     // TODO: Handle . and .. segment normalization
 843
 844     const auto lastSlashPos = path_.rfind('/');
 845     // TODO: To optimize and simplify, add and use SBuf::replace().
 846     const auto relUrlLength = strlen(relUrl);
 847     if (lastSlashPos == SBuf::npos) {
 848         // start replacing the whole path
 849         path_.reserveCapacity(1 + relUrlLength);
 850         path_.assign("/", 1);
 851     } else {
 852         // start replacing just the last segment
 853         path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
 854         path_.chop(0, lastSlashPos+1);
 855     }
 856     path_.append(relUrl, relUrlLength);
 857 }
 858
 859 int
 860 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
 861 {
 862     int dl;
 863     int hl;
 864
 865     const bool hostIncludesSubdomains = (*h == '.');
 866     while ('.' == *h)
 867         ++h;
 868
 869     hl = strlen(h);
 870
 871     if (hl == 0)
 872         return -1;
 873
 874     dl = strlen(d);
 875     if (dl == 0)
 876         return 1;
 877
 878     /*
 879      * Start at the ends of the two strings and work towards the
 880      * beginning.
 881      */
 882     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 883         if (hl == 0 && dl == 0) {
 884             /*
 885              * We made it all the way to the beginning of both
 886              * strings without finding any difference.
 887              */
 888             return 0;
 889         }
 890
 891         if (0 == hl) {
 892             /*
 893              * The host string is shorter than the domain string.
 894              * There is only one case when this can be a match.
 895              * If the domain is just one character longer, and if
 896              * that character is a leading '.' then we call it a
 897              * match.
 898              */
 899
 900             if (1 == dl && '.' == d[0])
 901                 return 0;
 902             else
 903                 return -1;
 904         }
 905
 906         if (0 == dl) {
 907             /*
 908              * The domain string is shorter than the host string.
 909              * This is a match only if the first domain character
 910              * is a leading '.'.
 911              */
 912
 913             if ('.' == d[0]) {
 914                 if (flags & mdnRejectSubsubDomains) {
 915                     // Check for sub-sub domain and reject
 916                     while(--hl >= 0 && h[hl] != '.');
 917                     if (hl < 0) {
 918                         // No sub-sub domain found, but reject if there is a
 919                         // leading dot in given host string (which is removed
 920                         // before the check is started).
 921                         return hostIncludesSubdomains ? 1 : 0;
 922                     } else
 923                         return 1; // sub-sub domain, reject
 924                 } else
 925                     return 0;
 926             } else
 927                 return 1;
 928         }
 929     }
 930
 931     /*
 932      * We found different characters in the same position (from the end).
 933      */
 934
 935     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 936     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 937     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 938     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 939         return 0;
 940
 941     /*
 942      * If one of those character is '.' then its special.  In order
 943      * for splay tree sorting to work properly, "x-foo.com" must
 944      * be greater than ".foo.com" even though '-' is less than '.'.
 945      */
 946     if ('.' == d[dl])
 947         return 1;
 948
 949     if ('.' == h[hl])
 950         return -1;
 951
 952     return (xtolower(h[hl]) - xtolower(d[dl]));
 953 }
 954
 955 /*
 956  * return true if we can serve requests for this method.
 957  */
 958 bool
 959 urlCheckRequest(const HttpRequest * r)
 960 {
 961     /* protocol "independent" methods
 962      *
 963      * actually these methods are specific to HTTP:
 964      * they are methods we receive on our HTTP port,
 965      * and if we had a FTP listener would not be relevant
 966      * there.
 967      *
 968      * So, we should delegate them to HTTP. The problem is that we
 969      * do not have a default protocol from the client side of HTTP.
 970      */
 971
 972     if (r->method == Http::METHOD_CONNECT)
 973         return true;
 974
 975     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 976     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 977     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 978         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
 979
 980     if (r->method == Http::METHOD_PURGE)
 981         return true;
 982
 983     /* does method match the protocol? */
 984     switch (r->url.getScheme()) {
 985
 986     case AnyP::PROTO_URN:
 987     case AnyP::PROTO_HTTP:
 988         return true;
 989
 990     case AnyP::PROTO_FTP:
 991         if (r->method == Http::METHOD_PUT ||
 992                 r->method == Http::METHOD_GET ||
 993                 r->method == Http::METHOD_HEAD )
 994             return true;
 995         return false;
 996
 997     case AnyP::PROTO_WAIS:
 998     case AnyP::PROTO_WHOIS:
 999         if (r->method == Http::METHOD_GET ||
1000                 r->method == Http::METHOD_HEAD)
1001             return true;
1002         return false;
1003
1004     case AnyP::PROTO_HTTPS:
1005 #if USE_OPENSSL || HAVE_LIBGNUTLS
1006         return true;
1007 #else
1008         /*
1009          * Squid can't originate an SSL connection, so it should
1010          * never receive an "https:" URL.  It should always be
1011          * CONNECT instead.
1012          */
1013         return false;
1014 #endif
1015
1016     default:
1017         return false;
1018     }
1019
1020     /* notreached */
1021     return false;
1022 }
1023
1024 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
1025     scheme_(aScheme),
1026     hostIsNumeric_(false)
1027 {
1028     *host_=0;
1029 }
1030
1031 // TODO: fix code duplication with AnyP::Uri::parse()
1032 char *
1033 AnyP::Uri::cleanup(const char *uri)
1034 {
1035     char *cleanedUri = nullptr;
1036     switch (Config.uri_whitespace) {
1037     case URI_WHITESPACE_ALLOW: {
1038         const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
1039         cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1040         break;
1041     }
1042
1043     case URI_WHITESPACE_ENCODE:
1044         cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL);
1045         break;
1046
1047     case URI_WHITESPACE_CHOP: {
1048         const auto pos = strcspn(uri, w_space);
1049         char *choppedUri = nullptr;
1050         if (pos < strlen(uri))
1051             choppedUri = xstrndup(uri, pos + 1);
1052         cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1053                                                 RFC1738_ESCAPE_UNESCAPED), MAX_URL);
1054         cleanedUri[pos] = '\0';
1055         xfree(choppedUri);
1056         break;
1057     }
1058
1059     case URI_WHITESPACE_DENY:
1060     case URI_WHITESPACE_STRIP:
1061     default: {
1062         // TODO: avoid duplication with urlParse()
1063         const char *t;
1064         char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1065         char *q = tmp_uri;
1066         t = uri;
1067         while (*t) {
1068             if (!xisspace(*t)) {
1069                 *q = *t;
1070                 ++q;
1071             }
1072             ++t;
1073         }
1074         *q = '\0';
1075         cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1076         xfree(tmp_uri);
1077         break;
1078     }
1079     }
1080
1081     assert(cleanedUri);
1082     return cleanedUri;
1083 }
1084