src/anyp/Uri.cc

   1 /*
   2  * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "anyp/Uri.h"
  13 #include "base/Raw.h"
  14 #include "globals.h"
  15 #include "HttpRequest.h"
  16 #include "parser/Tokenizer.h"
  17 #include "rfc1738.h"
  18 #include "SquidConfig.h"
  19 #include "SquidString.h"
  20
/// Characters accepted in a hostname when underscores are permitted.
/// AnyP::Uri::parse() selects this table when Config.onoff.allow_underscore
/// is set. The "[:]" entries presumably admit IP address literals.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
/// Same table as valid_hostname_chars_u, but without the underscore.
/// AnyP::Uri::parse() selects it when Config.onoff.allow_underscore is off.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
  33
  34 /// Characters which are valid within a URI userinfo section
  35 static const CharacterSet &
  36 UserInfoChars()
  37 {
  38     /*
  39      * RFC 3986 section 3.2.1
  40      *
  41      *  userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
  42      *  unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
  43      *  pct-encoded   = "%" HEXDIG HEXDIG
  44      *  sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
  45      */
  46     static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
  47                                       CharacterSet::ALPHA +
  48                                       CharacterSet::DIGIT;
  49     return userInfoValid;
  50 }
  51
  52 /**
  53  * Governed by RFC 3986 section 2.1
  54  */
  55 SBuf
  56 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
  57 {
  58     if (buf.isEmpty())
  59         return buf;
  60
  61     Parser::Tokenizer tk(buf);
  62     SBuf goodSection;
  63     // optimization for the arguably common "no encoding necessary" case
  64     if (tk.prefix(goodSection, ignore) && tk.atEnd())
  65         return buf;
  66
  67     SBuf output;
  68     output.reserveSpace(buf.length() * 3); // worst case: encode all chars
  69     output.append(goodSection); // may be empty
  70
  71     while (!tk.atEnd()) {
  72         // TODO: Add Tokenizer::parseOne(void).
  73         const auto ch = tk.remaining()[0];
  74         output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
  75         (void)tk.skip(ch);
  76
  77         if (tk.prefix(goodSection, ignore))
  78             output.append(goodSection);
  79     }
  80
  81     return output;
  82 }
  83
  84 const SBuf &
  85 AnyP::Uri::Asterisk()
  86 {
  87     static SBuf star("*");
  88     return star;
  89 }
  90
  91 const SBuf &
  92 AnyP::Uri::SlashPath()
  93 {
  94     static SBuf slash("/");
  95     return slash;
  96 }
  97
  98 void
  99 AnyP::Uri::host(const char *src)
 100 {
 101     hostAddr_.fromHost(src);
 102     if (hostAddr_.isAnyAddr()) {
 103         xstrncpy(host_, src, sizeof(host_));
 104         hostIsNumeric_ = false;
 105     } else {
 106         hostAddr_.toHostStr(host_, sizeof(host_));
 107         debugs(23, 3, "given IP: " << hostAddr_);
 108         hostIsNumeric_ = 1;
 109     }
 110     touch();
 111 }
 112
 113 SBuf
 114 AnyP::Uri::hostOrIp() const
 115 {
 116     if (hostIsNumeric()) {
 117         static char ip[MAX_IPSTRLEN];
 118         const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
 119         return SBuf(ip, hostStrLen);
 120     } else
 121         return SBuf(host());
 122 }
 123
 124 const SBuf &
 125 AnyP::Uri::path() const
 126 {
 127     // RFC 3986 section 3.3 says path can be empty (path-abempty).
 128     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
 129     // at least when sending and using. We must still accept path-abempty as input.
 130     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
 131         return SlashPath();
 132
 133     return path_;
 134 }
 135
/// Start-up self-check: asserts that the protocol name table matches the
/// protocol enum and that matchDomainName() produces the expected results
/// for a fixed list of host/domain pairs. Changes no state besides logging.
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    // default flags: zero means "match"; the sign of a non-zero result orders the names
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));

    // mdnRejectSubsubDomains: only one extra label is allowed below the domain
    assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));

    // mdnHonorWildcards: a leading "*." in the first argument acts as a wildcard
    assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
    assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));

    /* more cases? */
}
 179
 180 /**
 181  * Extract the URI scheme and ':' delimiter from the given input buffer.
 182  *
 183  * Schemes up to 16 characters are accepted.
 184  *
 185  * Governed by RFC 3986 section 3.1
 186  */
 187 static AnyP::UriScheme
 188 uriParseScheme(Parser::Tokenizer &tok)
 189 {
 190     /*
 191      * RFC 3986 section 3.1 paragraph 2:
 192      *
 193      * Scheme names consist of a sequence of characters beginning with a
 194      * letter and followed by any combination of letters, digits, plus
 195      * ("+"), period ("."), or hyphen ("-").
 196      *
 197      * The underscore ("_") required to match "cache_object://" squid
 198      * special URI scheme.
 199      */
 200     static const auto schemeChars =
 201 #if USE_HTTP_VIOLATIONS
 202         CharacterSet("special", "_") +
 203 #endif
 204         CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 205
 206     SBuf str;
 207     if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
 208         const auto protocol = AnyP::UriScheme::FindProtocolType(str);
 209         if (protocol == AnyP::PROTO_UNKNOWN)
 210             return AnyP::UriScheme(protocol, str.c_str());
 211         return AnyP::UriScheme(protocol, nullptr);
 212     }
 213
 214     throw TextException("invalid URI scheme", Here());
 215 }
 216
 217 /**
 218  * Appends configured append_domain to hostname, assuming
 219  * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
 220  * and that the host FQDN is not a 'dotless' TLD.
 221  *
 222  * \returns false if and only if there is not enough space to append
 223  */
 224 bool
 225 urlAppendDomain(char *host)
 226 {
 227     /* For IPv4 addresses check for a dot */
 228     /* For IPv6 addresses also check for a colon */
 229     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
 230         const uint64_t dlen = strlen(host);
 231         const uint64_t want = dlen + Config.appendDomainLen;
 232         if (want > SQUIDHOSTNAMELEN - 1) {
 233             debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
 234             return false;
 235         }
 236         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
 237     }
 238     return true;
 239 }
 240
/*
 * Parse a URI/URL and, on success, store its components in this object.
 *
 * It is assumed that the URL is complete -
 * ie, the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (ie, it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * Returns true after the scheme, path, host, userinfo, and port have been
 * set from rawUrl. Returns false when rawUrl is rejected; any exception
 * thrown while parsing is caught here and also reported as false.
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        // scratch buffers for the URI components extracted below
        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = nullptr;
        char *q = nullptr;
        int foundPort;
        int l;
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // leave room for the append_domain suffix that may be added to the host
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // asterisk-form request target: "OPTIONS *" and "TRACE *"
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            /*
             * RFC 7230 section 5.3.3:  authority-form = authority
             *  "excluding any userinfo and its "@" delimiter"
             *
             * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
             *
             * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
             */
            foundPort = 443;

            // XXX: use tokenizer
            auto B = tok.buf();
            const char *url = B.c_str();

            // try the bracketed "[host]:port" form first, then plain "host:port"
            if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
                if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
                    return false;

        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above only increments i while i < l, so
            // this condition does not look reachable -- confirm before relying on it.
            if (i > l)
                return false;
            *dst = '\0';

            // We are looking at path-abempty.
            if (*src != '/') {
                // path-empty, including the end of the `src` c-string cases
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            if (i > l)
                return false;
            *dst = '\0';

            foundPort = scheme.defaultPort(); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // everything before the last '@' is userinfo; the rest stays in foundHost
            t = strrchr(foundHost, '@');
            if (t != nullptr) {
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                l = strlen(foundHost);
                i = 1;
                // shift the bracketed text one position left, over the '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one colon: treat the whole text as a host without a port
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = nullptr;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // cut the ":port" suffix off the host and convert it
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                foundPort = atoi(t);
            }
        }

        // host names are stored in lower case
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // whitespace in the host is removed only under the "strip" policy
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional check that the host consists of permitted characters only
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        // only ports 1 through 65535 are accepted
        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

        // apply the configured uri_whitespace policy to the path
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // keep only the text before the first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // copy the path over itself, skipping whitespace characters
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // all checks passed; commit the parsed components
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
 514
 515 /**
 516  * Governed by RFC 8141 section 2:
 517  *
 518  *  assigned-name = "urn" ":" NID ":" NSS
 519  *  NID           = (alphanum) 0*30(ldh) (alphanum)
 520  *  ldh           = alphanum / "-"
 521  *  NSS           = pchar *(pchar / "/")
 522  *
 523  * RFC 3986 Appendix D.2 defines (as deprecated):
 524  *
 525  *   alphanum     = ALPHA / DIGIT
 526  *
 527  * Notice that NID is exactly 2-32 characters in length.
 528  */
 529 void
 530 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
 531 {
 532     static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 533     static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
 534     SBuf nid;
 535     if (!tok.prefix(nid, nidChars, 32))
 536         throw TextException("NID not found", Here());
 537
 538     if (!tok.skip(':'))
 539         throw TextException("NID too long or missing ':' delimiter", Here());
 540
 541     if (nid.length() < 2)
 542         throw TextException("NID too short", Here());
 543
 544     if (!alphanum[*nid.begin()])
 545         throw TextException("NID prefix is not alphanumeric", Here());
 546
 547     if (!alphanum[*nid.rbegin()])
 548         throw TextException("NID suffix is not alphanumeric", Here());
 549
 550     setScheme(AnyP::PROTO_URN, nullptr);
 551     host(nid.c_str());
 552     // TODO validate path characters
 553     path(tok.remaining());
 554     debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
 555 }
 556
 557 void
 558 AnyP::Uri::touch()
 559 {
 560     absolute_.clear();
 561     authorityHttp_.clear();
 562     authorityWithPort_.clear();
 563 }
 564
 565 SBuf &
 566 AnyP::Uri::authority(bool requirePort) const
 567 {
 568     if (authorityHttp_.isEmpty()) {
 569
 570         // both formats contain Host/IP
 571         authorityWithPort_.append(host());
 572         authorityHttp_ = authorityWithPort_;
 573
 574         // authorityForm_ only has :port if it is non-default
 575         authorityWithPort_.appendf(":%u",port());
 576         if (port() != getScheme().defaultPort())
 577             authorityHttp_ = authorityWithPort_;
 578     }
 579
 580     return requirePort ? authorityWithPort_ : authorityHttp_;
 581 }
 582
 583 SBuf &
 584 AnyP::Uri::absolute() const
 585 {
 586     if (absolute_.isEmpty()) {
 587         // TODO: most URL will be much shorter, avoid allocating this much
 588         absolute_.reserveCapacity(MAX_URL);
 589
 590         absolute_.append(getScheme().image());
 591         absolute_.append(":",1);
 592         if (getScheme() != AnyP::PROTO_URN) {
 593             absolute_.append("//", 2);
 594             const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
 595                                        getScheme() == AnyP::PROTO_UNKNOWN;
 596
 597             if (allowUserInfo && !userInfo().isEmpty()) {
 598                 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
 599                                                     .remove('%')
 600                                                     .rename("userinfo-reserved");
 601                 absolute_.append(Encode(userInfo(), uiChars));
 602                 absolute_.append("@", 1);
 603             }
 604             absolute_.append(authority());
 605         } else {
 606             absolute_.append(host());
 607             absolute_.append(":", 1);
 608         }
 609         absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
 610     }
 611
 612     return absolute_;
 613 }
 614
 615 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 616  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 617  *        and never copy the query-string part in the first place
 618  */
 619 char *
 620 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
 621 {
 622     LOCAL_ARRAY(char, buf, MAX_URL);
 623
 624     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
 625     buf[sizeof(buf)-1] = '\0';
 626
 627     // URN, CONNECT method, and non-stripped URIs can go straight out
 628     if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
 629         // strip anything AFTER a question-mark
 630         // leaving the '?' in place
 631         if (auto t = strchr(buf, '?')) {
 632             *(++t) = '\0';
 633         }
 634     }
 635
 636     if (stringHasCntl(buf))
 637         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 638
 639     return buf;
 640 }
 641
 642 /**
 643  * Yet another alternative to urlCanonical.
 644  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 645  * for use in error page outputs.
 646  * Luckily we can leverage the others instead of duplicating.
 647  */
 648 const char *
 649 urlCanonicalFakeHttps(const HttpRequest * request)
 650 {
 651     LOCAL_ARRAY(char, buf, MAX_URL);
 652
 653     // method CONNECT and port HTTPS
 654     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 655         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 656         return buf;
 657     }
 658
 659     // else do the normal complete canonical thing.
 660     return request->canonicalCleanUrl();
 661 }
 662
 663 /**
 664  * Test if a URL is a relative reference.
 665  *
 666  * Governed by RFC 3986 section 4.2
 667  *
 668  *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 669  *
 670  *  relative-part = "//" authority path-abempty
 671  *                / path-absolute
 672  *                / path-noscheme
 673  *                / path-empty
 674  */
 675 bool
 676 urlIsRelative(const char *url)
 677 {
 678     if (!url)
 679         return false; // no URL
 680
 681     /*
 682      * RFC 3986 section 5.2.3
 683      *
 684      * path          = path-abempty    ; begins with "/" or is empty
 685      *               / path-absolute   ; begins with "/" but not "//"
 686      *               / path-noscheme   ; begins with a non-colon segment
 687      *               / path-rootless   ; begins with a segment
 688      *               / path-empty      ; zero characters
 689      */
 690
 691     if (*url == '\0')
 692         return true; // path-empty
 693
 694     if (*url == '/') {
 695         // RFC 3986 section 5.2.3
 696         // path-absolute   ; begins with "/" but not "//"
 697         if (url[1] == '/')
 698             return true; // network-path reference, aka. 'scheme-relative URI'
 699         else
 700             return true; // path-absolute, aka 'absolute-path reference'
 701     }
 702
 703     for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
 704         if (*p == ':')
 705             return false; // colon is forbidden in first segment
 706     }
 707
 708     return true; // path-noscheme, path-abempty, path-rootless
 709 }
 710
 711 void
 712 AnyP::Uri::addRelativePath(const char *relUrl)
 713 {
 714     // URN cannot be merged
 715     if (getScheme() == AnyP::PROTO_URN)
 716         return;
 717
 718     // TODO: Handle . and .. segment normalization
 719
 720     const auto lastSlashPos = path_.rfind('/');
 721     // TODO: To optimize and simplify, add and use SBuf::replace().
 722     const auto relUrlLength = strlen(relUrl);
 723     if (lastSlashPos == SBuf::npos) {
 724         // start replacing the whole path
 725         path_.reserveCapacity(1 + relUrlLength);
 726         path_.assign("/", 1);
 727     } else {
 728         // start replacing just the last segment
 729         path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
 730         path_.chop(0, lastSlashPos+1);
 731     }
 732     path_.append(relUrl, relUrlLength);
 733 }
 734
 735 int
 736 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
 737 {
 738     int dl;
 739     int hl;
 740
 741     const bool hostIncludesSubdomains = (*h == '.');
 742     while ('.' == *h)
 743         ++h;
 744
 745     hl = strlen(h);
 746
 747     if (hl == 0)
 748         return -1;
 749
 750     dl = strlen(d);
 751
 752     /*
 753      * Start at the ends of the two strings and work towards the
 754      * beginning.
 755      */
 756     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 757         if (hl == 0 && dl == 0) {
 758             /*
 759              * We made it all the way to the beginning of both
 760              * strings without finding any difference.
 761              */
 762             return 0;
 763         }
 764
 765         if (0 == hl) {
 766             /*
 767              * The host string is shorter than the domain string.
 768              * There is only one case when this can be a match.
 769              * If the domain is just one character longer, and if
 770              * that character is a leading '.' then we call it a
 771              * match.
 772              */
 773
 774             if (1 == dl && '.' == d[0])
 775                 return 0;
 776             else
 777                 return -1;
 778         }
 779
 780         if (0 == dl) {
 781             /*
 782              * The domain string is shorter than the host string.
 783              * This is a match only if the first domain character
 784              * is a leading '.'.
 785              */
 786
 787             if ('.' == d[0]) {
 788                 if (flags & mdnRejectSubsubDomains) {
 789                     // Check for sub-sub domain and reject
 790                     while(--hl >= 0 && h[hl] != '.');
 791                     if (hl < 0) {
 792                         // No sub-sub domain found, but reject if there is a
 793                         // leading dot in given host string (which is removed
 794                         // before the check is started).
 795                         return hostIncludesSubdomains ? 1 : 0;
 796                     } else
 797                         return 1; // sub-sub domain, reject
 798                 } else
 799                     return 0;
 800             } else
 801                 return 1;
 802         }
 803     }
 804
 805     /*
 806      * We found different characters in the same position (from the end).
 807      */
 808
 809     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 810     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 811     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 812     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 813         return 0;
 814
 815     /*
 816      * If one of those character is '.' then its special.  In order
 817      * for splay tree sorting to work properly, "x-foo.com" must
 818      * be greater than ".foo.com" even though '-' is less than '.'.
 819      */
 820     if ('.' == d[dl])
 821         return 1;
 822
 823     if ('.' == h[hl])
 824         return -1;
 825
 826     return (xtolower(h[hl]) - xtolower(d[dl]));
 827 }
 828
 829 /*
 830  * return true if we can serve requests for this method.
 831  */
 832 bool
 833 urlCheckRequest(const HttpRequest * r)
 834 {
 835     /* protocol "independent" methods
 836      *
 837      * actually these methods are specific to HTTP:
 838      * they are methods we receive on our HTTP port,
 839      * and if we had a FTP listener would not be relevant
 840      * there.
 841      *
 842      * So, we should delegate them to HTTP. The problem is that we
 843      * do not have a default protocol from the client side of HTTP.
 844      */
 845
 846     if (r->method == Http::METHOD_CONNECT)
 847         return true;
 848
 849     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 850     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 851     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 852         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
 853
 854     if (r->method == Http::METHOD_PURGE)
 855         return true;
 856
 857     /* does method match the protocol? */
 858     switch (r->url.getScheme()) {
 859
 860     case AnyP::PROTO_URN:
 861     case AnyP::PROTO_HTTP:
 862     case AnyP::PROTO_CACHE_OBJECT:
 863         return true;
 864
 865     case AnyP::PROTO_FTP:
 866         if (r->method == Http::METHOD_PUT ||
 867                 r->method == Http::METHOD_GET ||
 868                 r->method == Http::METHOD_HEAD )
 869             return true;
 870         return false;
 871
 872     case AnyP::PROTO_WAIS:
 873     case AnyP::PROTO_WHOIS:
 874         if (r->method == Http::METHOD_GET ||
 875                 r->method == Http::METHOD_HEAD)
 876             return true;
 877         return false;
 878
 879     case AnyP::PROTO_HTTPS:
 880 #if USE_OPENSSL || USE_GNUTLS
 881         return true;
 882 #else
 883         /*
 884          * Squid can't originate an SSL connection, so it should
 885          * never receive an "https:" URL.  It should always be
 886          * CONNECT instead.
 887          */
 888         return false;
 889 #endif
 890
 891     default:
 892         return false;
 893     }
 894
 895     /* notreached */
 896     return false;
 897 }
 898
 899 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
 900     scheme_(aScheme),
 901     hostIsNumeric_(false),
 902     port_(0)
 903 {
 904     *host_=0;
 905 }
 906
 907 // TODO: fix code duplication with AnyP::Uri::parse()
 908 char *
 909 AnyP::Uri::cleanup(const char *uri)
 910 {
 911     char *cleanedUri = nullptr;
 912     switch (Config.uri_whitespace) {
 913     case URI_WHITESPACE_ALLOW: {
 914         const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
 915         cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
 916         break;
 917     }
 918
 919     case URI_WHITESPACE_ENCODE:
 920         cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL);
 921         break;
 922
 923     case URI_WHITESPACE_CHOP: {
 924         const auto pos = strcspn(uri, w_space);
 925         char *choppedUri = nullptr;
 926         if (pos < strlen(uri))
 927             choppedUri = xstrndup(uri, pos + 1);
 928         cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
 929                                                 RFC1738_ESCAPE_UNESCAPED), MAX_URL);
 930         cleanedUri[pos] = '\0';
 931         xfree(choppedUri);
 932         break;
 933     }
 934
 935     case URI_WHITESPACE_DENY:
 936     case URI_WHITESPACE_STRIP:
 937     default: {
 938         // TODO: avoid duplication with urlParse()
 939         const char *t;
 940         char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
 941         char *q = tmp_uri;
 942         t = uri;
 943         while (*t) {
 944             if (!xisspace(*t)) {
 945                 *q = *t;
 946                 ++q;
 947             }
 948             ++t;
 949         }
 950         *q = '\0';
 951         cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
 952         xfree(tmp_uri);
 953         break;
 954     }
 955     }
 956
 957     assert(cleanedUri);
 958     return cleanedUri;
 959 }
 960