src/anyp/Uri.cc

   1 /*
   2  * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "anyp/Uri.h"
  13 #include "globals.h"
  14 #include "HttpRequest.h"
  15 #include "parser/Tokenizer.h"
  16 #include "rfc1738.h"
  17 #include "SquidConfig.h"
  18 #include "SquidString.h"
  19
/// Characters accepted in a hostname when Config.onoff.allow_underscore is set:
/// ASCII letters, digits, '-', '.', '_' and the "[:]" characters
/// (NOTE(review): "[:]" is presumably kept for bracketed IP literals - confirm).
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
/// Characters accepted in a hostname when Config.onoff.allow_underscore is not
/// set: the same list as valid_hostname_chars_u but without the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
  32
  33 /// Characters which are valid within a URI userinfo section
  34 static const CharacterSet &
  35 UserInfoChars()
  36 {
  37     /*
  38      * RFC 3986 section 3.2.1
  39      *
  40      *  userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
  41      *  unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
  42      *  pct-encoded   = "%" HEXDIG HEXDIG
  43      *  sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
  44      */
  45     static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
  46                                       CharacterSet::ALPHA +
  47                                       CharacterSet::DIGIT;
  48     return userInfoValid;
  49 }
  50
  51 /**
  52  * Governed by RFC 3986 section 2.1
  53  */
  54 SBuf
  55 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
  56 {
  57     if (buf.isEmpty())
  58         return buf;
  59
  60     Parser::Tokenizer tk(buf);
  61     SBuf goodSection;
  62     // optimization for the arguably common "no encoding necessary" case
  63     if (tk.prefix(goodSection, ignore) && tk.atEnd())
  64         return buf;
  65
  66     SBuf output;
  67     output.reserveSpace(buf.length() * 3); // worst case: encode all chars
  68     output.append(goodSection); // may be empty
  69
  70     while (!tk.atEnd()) {
  71         // TODO: Add Tokenizer::parseOne(void).
  72         const auto ch = tk.remaining()[0];
  73         output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
  74         (void)tk.skip(ch);
  75
  76         if (tk.prefix(goodSection, ignore))
  77             output.append(goodSection);
  78     }
  79
  80     return output;
  81 }
  82
  83 const SBuf &
  84 AnyP::Uri::Asterisk()
  85 {
  86     static SBuf star("*");
  87     return star;
  88 }
  89
  90 const SBuf &
  91 AnyP::Uri::SlashPath()
  92 {
  93     static SBuf slash("/");
  94     return slash;
  95 }
  96
  97 void
  98 AnyP::Uri::host(const char *src)
  99 {
 100     hostAddr_.setEmpty();
 101     hostAddr_ = src;
 102     if (hostAddr_.isAnyAddr()) {
 103         xstrncpy(host_, src, sizeof(host_));
 104         hostIsNumeric_ = false;
 105     } else {
 106         hostAddr_.toHostStr(host_, sizeof(host_));
 107         debugs(23, 3, "given IP: " << hostAddr_);
 108         hostIsNumeric_ = 1;
 109     }
 110     touch();
 111 }
 112
 113 SBuf
 114 AnyP::Uri::hostOrIp() const
 115 {
 116     static char ip[MAX_IPSTRLEN];
 117     if (hostIsNumeric())
 118         return SBuf(hostIP().toStr(ip, sizeof(ip)));
 119     else
 120         return SBuf(host());
 121 }
 122
 123 const SBuf &
 124 AnyP::Uri::path() const
 125 {
 126     // RFC 3986 section 3.3 says path can be empty (path-abempty).
 127     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
 128     // at least when sending and using. We must still accept path-abempty as input.
 129     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
 130         return SlashPath();
 131
 132     return path_;
 133 }
 134
/// Runs start-up self-checks: asserts that the protocol name table ends with
/// "MAX" and that matchDomainName() returns the expected results for a fixed
/// list of host/domain pairs. Any failed check trips an assert().
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    // exact names and leading-dot forms that must match (result 0)
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
    // pairs that must not match (non-zero result)
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    // the sign of a mismatch, case-insensitivity, and the "x-foo.com" special case
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));

    // with mdnRejectSubsubDomains: one subdomain level matches, deeper ones do not
    assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));

    // with mdnHonorWildcards: a leading "*." in the first argument acts as a wildcard
    assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
    assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));

    /* more cases? */
}
 178
 179 /**
 180  * Extract the URI scheme and ':' delimiter from the given input buffer.
 181  *
 182  * Schemes up to 16 characters are accepted.
 183  *
 184  * Governed by RFC 3986 section 3.1
 185  */
 186 static AnyP::UriScheme
 187 uriParseScheme(Parser::Tokenizer &tok)
 188 {
 189     /*
 190      * RFC 3986 section 3.1 paragraph 2:
 191      *
 192      * Scheme names consist of a sequence of characters beginning with a
 193      * letter and followed by any combination of letters, digits, plus
 194      * ("+"), period ("."), or hyphen ("-").
 195      *
 196      * The underscore ("_") required to match "cache_object://" squid
 197      * special URI scheme.
 198      */
 199     static const auto schemeChars =
 200 #if USE_HTTP_VIOLATIONS
 201         CharacterSet("special", "_") +
 202 #endif
 203         CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 204
 205     SBuf str;
 206     if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
 207         const auto protocol = AnyP::UriScheme::FindProtocolType(str);
 208         if (protocol == AnyP::PROTO_UNKNOWN)
 209             return AnyP::UriScheme(protocol, str.c_str());
 210         return AnyP::UriScheme(protocol, nullptr);
 211     }
 212
 213     throw TextException("invalid URI scheme", Here());
 214 }
 215
 216 /**
 217  * Appends configured append_domain to hostname, assuming
 218  * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
 219  * and that the host FQDN is not a 'dotless' TLD.
 220  *
 221  * \returns false if and only if there is not enough space to append
 222  */
 223 bool
 224 urlAppendDomain(char *host)
 225 {
 226     /* For IPv4 addresses check for a dot */
 227     /* For IPv6 addresses also check for a colon */
 228     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
 229         const uint64_t dlen = strlen(host);
 230         const uint64_t want = dlen + Config.appendDomainLen;
 231         if (want > SQUIDHOSTNAMELEN - 1) {
 232             debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
 233             return false;
 234         }
 235         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
 236     }
 237     return true;
 238 }
 239
 240 /*
 241  * Parse a URI/URL.
 242  *
 243  * It is assumed that the URL is complete -
 244  * ie, the end of the string is the end of the URL. Don't pass a partial
 245  * URL here as this routine doesn't have any way of knowing whether
 246  * it is partial or not (ie, it handles the case of no trailing slash as
 247  * being "end of host with implied path of /".
 248  *
 249  * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 250  * then rather than a URL a hostname:port is looked for.
 251  */
 252 bool
 253 AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
 254 {
 255     try {
 256
 257         LOCAL_ARRAY(char, login, MAX_URL);
 258         LOCAL_ARRAY(char, foundHost, MAX_URL);
 259         LOCAL_ARRAY(char, urlpath, MAX_URL);
 260         char *t = NULL;
 261         char *q = NULL;
 262         int foundPort;
 263         int l;
 264         int i;
 265         const char *src;
 266         char *dst;
 267         foundHost[0] = urlpath[0] = login[0] = '\0';
 268
 269         if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
 270             debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
 271             return false;
 272         }
 273
 274         if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
 275                 Asterisk().cmp(rawUrl) == 0) {
 276             // XXX: these methods might also occur in HTTPS traffic. Handle this better.
 277             setScheme(AnyP::PROTO_HTTP, nullptr);
 278             port(getScheme().defaultPort());
 279             path(Asterisk());
 280             return true;
 281         }
 282
 283         Parser::Tokenizer tok(rawUrl);
 284         AnyP::UriScheme scheme;
 285
 286         if (method == Http::METHOD_CONNECT) {
 287             /*
 288              * RFC 7230 section 5.3.3:  authority-form = authority
 289              *  "excluding any userinfo and its "@" delimiter"
 290              *
 291              * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
 292              *
 293              * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
 294              */
 295             foundPort = 443;
 296
 297             // XXX: use tokenizer
 298             auto B = tok.buf();
 299             const char *url = B.c_str();
 300
 301             if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
 302                 if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
 303                     return false;
 304
 305         } else {
 306
 307             scheme = uriParseScheme(tok);
 308
 309             if (scheme == AnyP::PROTO_NONE)
 310                 return false; // invalid scheme
 311
 312             if (scheme == AnyP::PROTO_URN) {
 313                 parseUrn(tok); // throws on any error
 314                 return true;
 315             }
 316
 317             // URLs then have "//"
 318             static const SBuf doubleSlash("//");
 319             if (!tok.skip(doubleSlash))
 320                 return false;
 321
 322             auto B = tok.remaining();
 323             const char *url = B.c_str();
 324
 325             /* Parse the URL: */
 326             src = url;
 327             i = 0;
 328
 329             /* Then everything until first /; that's host (and port; which we'll look for here later) */
 330             // bug 1881: If we don't get a "/" then we imply it was there
 331             // bug 3074: We could just be given a "?" or "#". These also imply "/"
 332             // bug 3233: whitespace is also a hostname delimiter.
 333             for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
 334                 *dst = *src;
 335             }
 336
 337             /*
 338              * We can't check for "i >= l" here because we could be at the end of the line
 339              * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
 340              * been -given- a valid URL and the path is just '/'.
 341              */
 342             if (i > l)
 343                 return false;
 344             *dst = '\0';
 345
 346             // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
 347             if (*src == '?' || *src == '#' || *src == '\0') {
 348                 urlpath[0] = '/';
 349                 dst = &urlpath[1];
 350             } else {
 351                 dst = urlpath;
 352             }
 353             /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
 354             for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
 355                 *dst = *src;
 356             }
 357
 358             /* We -could- be at the end of the buffer here */
 359             if (i > l)
 360                 return false;
 361             /* If the URL path is empty we set it to be "/" */
 362             if (dst == urlpath) {
 363                 *dst = '/';
 364                 ++dst;
 365             }
 366             *dst = '\0';
 367
 368             foundPort = scheme.defaultPort(); // may be reset later
 369
 370             /* Is there any login information? (we should eventually parse it above) */
 371             t = strrchr(foundHost, '@');
 372             if (t != NULL) {
 373                 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
 374                 login[sizeof(login)-1] = '\0';
 375                 t = strrchr(login, '@');
 376                 *t = 0;
 377                 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
 378                 foundHost[sizeof(foundHost)-1] = '\0';
 379                 // Bug 4498: URL-unescape the login info after extraction
 380                 rfc1738_unescape(login);
 381             }
 382
 383             /* Is there any host information? (we should eventually parse it above) */
 384             if (*foundHost == '[') {
 385                 /* strip any IPA brackets. valid under IPv6. */
 386                 dst = foundHost;
 387                 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
 388                 src = foundHost;
 389                 ++src;
 390                 l = strlen(foundHost);
 391                 i = 1;
 392                 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
 393                     *dst = *src;
 394                 }
 395
 396                 /* we moved in-place, so truncate the actual hostname found */
 397                 *dst = '\0';
 398                 ++dst;
 399
 400                 /* skip ahead to either start of port, or original EOS */
 401                 while (*dst != '\0' && *dst != ':')
 402                     ++dst;
 403                 t = dst;
 404             } else {
 405                 t = strrchr(foundHost, ':');
 406
 407                 if (t != strchr(foundHost,':') ) {
 408                     /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
 409                     /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
 410                     /* therefore we MUST accept the case where they are not bracketed at all. */
 411                     t = NULL;
 412                 }
 413             }
 414
 415             // Bug 3183 sanity check: If scheme is present, host must be too.
 416             if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
 417                 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
 418                 return false;
 419             }
 420
 421             if (t && *t == ':') {
 422                 *t = '\0';
 423                 ++t;
 424                 foundPort = atoi(t);
 425             }
 426         }
 427
 428         for (t = foundHost; *t; ++t)
 429             *t = xtolower(*t);
 430
 431         if (stringHasWhitespace(foundHost)) {
 432             if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 433                 t = q = foundHost;
 434                 while (*t) {
 435                     if (!xisspace(*t)) {
 436                         *q = *t;
 437                         ++q;
 438                     }
 439                     ++t;
 440                 }
 441                 *q = '\0';
 442             }
 443         }
 444
 445         debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
 446
 447         if (Config.onoff.check_hostnames &&
 448                 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
 449             debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
 450             return false;
 451         }
 452
 453         if (!urlAppendDomain(foundHost))
 454             return false;
 455
 456         /* remove trailing dots from hostnames */
 457         while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
 458             foundHost[l] = '\0';
 459
 460         /* reject duplicate or leading dots */
 461         if (strstr(foundHost, "..") || *foundHost == '.') {
 462             debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
 463             return false;
 464         }
 465
 466         if (foundPort < 1 || foundPort > 65535) {
 467             debugs(23, 3, "Invalid port '" << foundPort << "'");
 468             return false;
 469         }
 470
 471 #if HARDCODE_DENY_PORTS
 472         /* These ports are filtered in the default squid.conf, but
 473          * maybe someone wants them hardcoded... */
 474         if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
 475             debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
 476             return false;
 477         }
 478 #endif
 479
 480         if (stringHasWhitespace(urlpath)) {
 481             debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
 482
 483             switch (Config.uri_whitespace) {
 484
 485             case URI_WHITESPACE_DENY:
 486                 return false;
 487
 488             case URI_WHITESPACE_ALLOW:
 489                 break;
 490
 491             case URI_WHITESPACE_ENCODE:
 492                 t = rfc1738_escape_unescaped(urlpath);
 493                 xstrncpy(urlpath, t, MAX_URL);
 494                 break;
 495
 496             case URI_WHITESPACE_CHOP:
 497                 *(urlpath + strcspn(urlpath, w_space)) = '\0';
 498                 break;
 499
 500             case URI_WHITESPACE_STRIP:
 501             default:
 502                 t = q = urlpath;
 503                 while (*t) {
 504                     if (!xisspace(*t)) {
 505                         *q = *t;
 506                         ++q;
 507                     }
 508                     ++t;
 509                 }
 510                 *q = '\0';
 511             }
 512         }
 513
 514         setScheme(scheme);
 515         path(urlpath);
 516         host(foundHost);
 517         userInfo(SBuf(login));
 518         port(foundPort);
 519         return true;
 520
 521     } catch (...) {
 522         debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
 523         return false;
 524     }
 525 }
 526
 527 /**
 528  * Governed by RFC 8141 section 2:
 529  *
 530  *  assigned-name = "urn" ":" NID ":" NSS
 531  *  NID           = (alphanum) 0*30(ldh) (alphanum)
 532  *  ldh           = alphanum / "-"
 533  *  NSS           = pchar *(pchar / "/")
 534  *
 535  * RFC 3986 Appendix D.2 defines (as deprecated):
 536  *
 537  *   alphanum     = ALPHA / DIGIT
 538  *
 539  * Notice that NID is exactly 2-32 characters in length.
 540  */
 541 void
 542 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
 543 {
 544     static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 545     static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
 546     SBuf nid;
 547     if (!tok.prefix(nid, nidChars, 32))
 548         throw TextException("NID not found", Here());
 549
 550     if (!tok.skip(':'))
 551         throw TextException("NID too long or missing ':' delimiter", Here());
 552
 553     if (nid.length() < 2)
 554         throw TextException("NID too short", Here());
 555
 556     if (!alphanum[*nid.begin()])
 557         throw TextException("NID prefix is not alphanumeric", Here());
 558
 559     if (!alphanum[*nid.rbegin()])
 560         throw TextException("NID suffix is not alphanumeric", Here());
 561
 562     setScheme(AnyP::PROTO_URN, nullptr);
 563     host(nid.c_str());
 564     // TODO validate path characters
 565     path(tok.remaining());
 566     debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
 567 }
 568
 569 void
 570 AnyP::Uri::touch()
 571 {
 572     absolute_.clear();
 573     authorityHttp_.clear();
 574     authorityWithPort_.clear();
 575 }
 576
 577 SBuf &
 578 AnyP::Uri::authority(bool requirePort) const
 579 {
 580     if (authorityHttp_.isEmpty()) {
 581
 582         // both formats contain Host/IP
 583         authorityWithPort_.append(host());
 584         authorityHttp_ = authorityWithPort_;
 585
 586         // authorityForm_ only has :port if it is non-default
 587         authorityWithPort_.appendf(":%u",port());
 588         if (port() != getScheme().defaultPort())
 589             authorityHttp_ = authorityWithPort_;
 590     }
 591
 592     return requirePort ? authorityWithPort_ : authorityHttp_;
 593 }
 594
 595 SBuf &
 596 AnyP::Uri::absolute() const
 597 {
 598     if (absolute_.isEmpty()) {
 599         // TODO: most URL will be much shorter, avoid allocating this much
 600         absolute_.reserveCapacity(MAX_URL);
 601
 602         absolute_.append(getScheme().image());
 603         absolute_.append(":",1);
 604         if (getScheme() != AnyP::PROTO_URN) {
 605             absolute_.append("//", 2);
 606             const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
 607                                        getScheme() == AnyP::PROTO_UNKNOWN;
 608
 609             if (allowUserInfo && !userInfo().isEmpty()) {
 610                 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
 611                                                     .remove('%')
 612                                                     .rename("userinfo-reserved");
 613                 absolute_.append(Encode(userInfo(), uiChars));
 614                 absolute_.append("@", 1);
 615             }
 616             absolute_.append(authority());
 617         } else {
 618             absolute_.append(host());
 619             absolute_.append(":", 1);
 620         }
 621         absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
 622     }
 623
 624     return absolute_;
 625 }
 626
 627 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 628  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 629  *        and never copy the query-string part in the first place
 630  */
 631 char *
 632 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
 633 {
 634     LOCAL_ARRAY(char, buf, MAX_URL);
 635
 636     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
 637     buf[sizeof(buf)-1] = '\0';
 638
 639     // URN, CONNECT method, and non-stripped URIs can go straight out
 640     if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
 641         // strip anything AFTER a question-mark
 642         // leaving the '?' in place
 643         if (auto t = strchr(buf, '?')) {
 644             *(++t) = '\0';
 645         }
 646     }
 647
 648     if (stringHasCntl(buf))
 649         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 650
 651     return buf;
 652 }
 653
 654 /**
 655  * Yet another alternative to urlCanonical.
 656  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 657  * for use in error page outputs.
 658  * Luckily we can leverage the others instead of duplicating.
 659  */
 660 const char *
 661 urlCanonicalFakeHttps(const HttpRequest * request)
 662 {
 663     LOCAL_ARRAY(char, buf, MAX_URL);
 664
 665     // method CONNECT and port HTTPS
 666     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 667         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 668         return buf;
 669     }
 670
 671     // else do the normal complete canonical thing.
 672     return request->canonicalCleanUrl();
 673 }
 674
 675 /**
 676  * Test if a URL is a relative reference.
 677  *
 678  * Governed by RFC 3986 section 4.2
 679  *
 680  *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 681  *
 682  *  relative-part = "//" authority path-abempty
 683  *                / path-absolute
 684  *                / path-noscheme
 685  *                / path-empty
 686  */
 687 bool
 688 urlIsRelative(const char *url)
 689 {
 690     if (!url)
 691         return false; // no URL
 692
 693     /*
 694      * RFC 3986 section 5.2.3
 695      *
 696      * path          = path-abempty    ; begins with "/" or is empty
 697      *               / path-absolute   ; begins with "/" but not "//"
 698      *               / path-noscheme   ; begins with a non-colon segment
 699      *               / path-rootless   ; begins with a segment
 700      *               / path-empty      ; zero characters
 701      */
 702
 703     if (*url == '\0')
 704         return true; // path-empty
 705
 706     if (*url == '/') {
 707         // RFC 3986 section 5.2.3
 708         // path-absolute   ; begins with "/" but not "//"
 709         if (url[1] == '/')
 710             return true; // network-path reference, aka. 'scheme-relative URI'
 711         else
 712             return true; // path-absolute, aka 'absolute-path reference'
 713     }
 714
 715     for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
 716         if (*p == ':')
 717             return false; // colon is forbidden in first segment
 718     }
 719
 720     return true; // path-noscheme, path-abempty, path-rootless
 721 }
 722
 723 void
 724 AnyP::Uri::addRelativePath(const char *relUrl)
 725 {
 726     // URN cannot be merged
 727     if (getScheme() == AnyP::PROTO_URN)
 728         return;
 729
 730     // TODO: Handle . and .. segment normalization
 731
 732     const auto lastSlashPos = path_.rfind('/');
 733     // TODO: To optimize and simplify, add and use SBuf::replace().
 734     const auto relUrlLength = strlen(relUrl);
 735     if (lastSlashPos == SBuf::npos) {
 736         // start replacing the whole path
 737         path_.reserveCapacity(1 + relUrlLength);
 738         path_.assign("/", 1);
 739     } else {
 740         // start replacing just the last segment
 741         path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
 742         path_.chop(0, lastSlashPos+1);
 743     }
 744     path_.append(relUrl, relUrlLength);
 745 }
 746
 747 int
 748 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
 749 {
 750     int dl;
 751     int hl;
 752
 753     const bool hostIncludesSubdomains = (*h == '.');
 754     while ('.' == *h)
 755         ++h;
 756
 757     hl = strlen(h);
 758
 759     if (hl == 0)
 760         return -1;
 761
 762     dl = strlen(d);
 763
 764     /*
 765      * Start at the ends of the two strings and work towards the
 766      * beginning.
 767      */
 768     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 769         if (hl == 0 && dl == 0) {
 770             /*
 771              * We made it all the way to the beginning of both
 772              * strings without finding any difference.
 773              */
 774             return 0;
 775         }
 776
 777         if (0 == hl) {
 778             /*
 779              * The host string is shorter than the domain string.
 780              * There is only one case when this can be a match.
 781              * If the domain is just one character longer, and if
 782              * that character is a leading '.' then we call it a
 783              * match.
 784              */
 785
 786             if (1 == dl && '.' == d[0])
 787                 return 0;
 788             else
 789                 return -1;
 790         }
 791
 792         if (0 == dl) {
 793             /*
 794              * The domain string is shorter than the host string.
 795              * This is a match only if the first domain character
 796              * is a leading '.'.
 797              */
 798
 799             if ('.' == d[0]) {
 800                 if (flags & mdnRejectSubsubDomains) {
 801                     // Check for sub-sub domain and reject
 802                     while(--hl >= 0 && h[hl] != '.');
 803                     if (hl < 0) {
 804                         // No sub-sub domain found, but reject if there is a
 805                         // leading dot in given host string (which is removed
 806                         // before the check is started).
 807                         return hostIncludesSubdomains ? 1 : 0;
 808                     } else
 809                         return 1; // sub-sub domain, reject
 810                 } else
 811                     return 0;
 812             } else
 813                 return 1;
 814         }
 815     }
 816
 817     /*
 818      * We found different characters in the same position (from the end).
 819      */
 820
 821     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 822     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 823     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 824     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 825         return 0;
 826
 827     /*
 828      * If one of those character is '.' then its special.  In order
 829      * for splay tree sorting to work properly, "x-foo.com" must
 830      * be greater than ".foo.com" even though '-' is less than '.'.
 831      */
 832     if ('.' == d[dl])
 833         return 1;
 834
 835     if ('.' == h[hl])
 836         return -1;
 837
 838     return (xtolower(h[hl]) - xtolower(d[dl]));
 839 }
 840
 841 /*
 842  * return true if we can serve requests for this method.
 843  */
 844 int
 845 urlCheckRequest(const HttpRequest * r)
 846 {
 847     int rc = 0;
 848     /* protocol "independent" methods
 849      *
 850      * actually these methods are specific to HTTP:
 851      * they are methods we receive on our HTTP port,
 852      * and if we had a FTP listener would not be relevant
 853      * there.
 854      *
 855      * So, we should delegate them to HTTP. The problem is that we
 856      * do not have a default protocol from the client side of HTTP.
 857      */
 858
 859     if (r->method == Http::METHOD_CONNECT)
 860         return 1;
 861
 862     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 863     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 864     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 865         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
 866
 867     if (r->method == Http::METHOD_PURGE)
 868         return 1;
 869
 870     /* does method match the protocol? */
 871     switch (r->url.getScheme()) {
 872
 873     case AnyP::PROTO_URN:
 874
 875     case AnyP::PROTO_HTTP:
 876
 877     case AnyP::PROTO_CACHE_OBJECT:
 878         rc = 1;
 879         break;
 880
 881     case AnyP::PROTO_FTP:
 882
 883         if (r->method == Http::METHOD_PUT)
 884             rc = 1;
 885
 886     case AnyP::PROTO_GOPHER:
 887
 888     case AnyP::PROTO_WAIS:
 889
 890     case AnyP::PROTO_WHOIS:
 891         if (r->method == Http::METHOD_GET)
 892             rc = 1;
 893         else if (r->method == Http::METHOD_HEAD)
 894             rc = 1;
 895
 896         break;
 897
 898     case AnyP::PROTO_HTTPS:
 899 #if USE_OPENSSL
 900         rc = 1;
 901 #elif USE_GNUTLS
 902         rc = 1;
 903 #else
 904         /*
 905         * Squid can't originate an SSL connection, so it should
 906         * never receive an "https:" URL.  It should always be
 907         * CONNECT instead.
 908         */
 909         rc = 0;
 910 #endif
 911         break;
 912
 913     default:
 914         break;
 915     }
 916
 917     return rc;
 918 }
 919
 920 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
 921     scheme_(aScheme),
 922     hostIsNumeric_(false),
 923     port_(0)
 924 {
 925     *host_=0;
 926 }
 927
 928 // TODO: fix code duplication with AnyP::Uri::parse()
 929 char *
 930 AnyP::Uri::cleanup(const char *uri)
 931 {
 932     int flags = 0;
 933     char *cleanedUri = nullptr;
 934     switch (Config.uri_whitespace) {
 935     case URI_WHITESPACE_ALLOW:
 936         flags |= RFC1738_ESCAPE_NOSPACE;
 937     // fall through to next case
 938     case URI_WHITESPACE_ENCODE:
 939         flags |= RFC1738_ESCAPE_UNESCAPED;
 940         cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
 941         break;
 942
 943     case URI_WHITESPACE_CHOP: {
 944         flags |= RFC1738_ESCAPE_UNESCAPED;
 945         const auto pos = strcspn(uri, w_space);
 946         char *choppedUri = nullptr;
 947         if (pos < strlen(uri))
 948             choppedUri = xstrndup(uri, pos + 1);
 949         cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
 950         cleanedUri[pos] = '\0';
 951         xfree(choppedUri);
 952     }
 953     break;
 954
 955     case URI_WHITESPACE_DENY:
 956     case URI_WHITESPACE_STRIP:
 957     default: {
 958         // TODO: avoid duplication with urlParse()
 959         const char *t;
 960         char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
 961         char *q = tmp_uri;
 962         t = uri;
 963         while (*t) {
 964             if (!xisspace(*t)) {
 965                 *q = *t;
 966                 ++q;
 967             }
 968             ++t;
 969         }
 970         *q = '\0';
 971         cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
 972         xfree(tmp_uri);
 973     }
 974     break;
 975     }
 976
 977     assert(cleanedUri);
 978     return cleanedUri;
 979 }
 980