src/anyp/Uri.cc

   1 /*
   2  * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "anyp/Uri.h"
  13 #include "globals.h"
  14 #include "HttpRequest.h"
  15 #include "parser/Tokenizer.h"
  16 #include "rfc1738.h"
  17 #include "SquidConfig.h"
  18 #include "SquidString.h"
  19
  20 static const char valid_hostname_chars_u[] =
  21     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  22     "abcdefghijklmnopqrstuvwxyz"
  23     "0123456789-._"
  24     "[:]"
  25     ;
  26 static const char valid_hostname_chars[] =
  27     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  28     "abcdefghijklmnopqrstuvwxyz"
  29     "0123456789-."
  30     "[:]"
  31     ;
  32
  33 const SBuf &
  34 AnyP::Uri::Asterisk()
  35 {
  36     static SBuf star("*");
  37     return star;
  38 }
  39
  40 const SBuf &
  41 AnyP::Uri::SlashPath()
  42 {
  43     static SBuf slash("/");
  44     return slash;
  45 }
  46
  47 void
  48 AnyP::Uri::host(const char *src)
  49 {
  50     hostAddr_.setEmpty();
  51     hostAddr_ = src;
  52     if (hostAddr_.isAnyAddr()) {
  53         xstrncpy(host_, src, sizeof(host_));
  54         hostIsNumeric_ = false;
  55     } else {
  56         hostAddr_.toHostStr(host_, sizeof(host_));
  57         debugs(23, 3, "given IP: " << hostAddr_);
  58         hostIsNumeric_ = 1;
  59     }
  60     touch();
  61 }
  62
  63 SBuf
  64 AnyP::Uri::hostOrIp() const
  65 {
  66     static char ip[MAX_IPSTRLEN];
  67     if (hostIsNumeric())
  68         return SBuf(hostIP().toStr(ip, sizeof(ip)));
  69     else
  70         return SBuf(host());
  71 }
  72
  73 const SBuf &
  74 AnyP::Uri::path() const
  75 {
  76     // RFC 3986 section 3.3 says path can be empty (path-abempty).
  77     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
  78     // at least when sending and using. We must still accept path-abempty as input.
  79     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
  80         return SlashPath();
  81
  82     return path_;
  83 }
  84
/**
 * Start-up self-test for the URL code.
 *
 * Performs no initialization of state; it only logs a debug line and runs
 * assertions that check the protocol string table and the behaviour of
 * matchDomainName() for plain, sub-domain-rejecting and wildcard matching.
 */
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    // exact matches and leading-dot (sub-domain) matches
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
    // mismatches
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    // ordering (sign of the result) and case-insensitivity
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));

    // mdnRejectSubsubDomains: only one extra label may precede the domain
    assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));

    // mdnHonorWildcards: a leading "*." label in the first argument matches any label
    assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
    assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));

    /* more cases? */
}
 128
 129 /**
 130  * Extract the URI scheme and ':' delimiter from the given input buffer.
 131  *
 132  * Schemes up to 16 characters are accepted.
 133  *
 134  * Governed by RFC 3986 section 3.1
 135  */
 136 static AnyP::UriScheme
 137 uriParseScheme(Parser::Tokenizer &tok)
 138 {
 139     /*
 140      * RFC 3986 section 3.1 paragraph 2:
 141      *
 142      * Scheme names consist of a sequence of characters beginning with a
 143      * letter and followed by any combination of letters, digits, plus
 144      * ("+"), period ("."), or hyphen ("-").
 145      *
 146      * The underscore ("_") required to match "cache_object://" squid
 147      * special URI scheme.
 148      */
 149     static const auto schemeChars =
 150 #if USE_HTTP_VIOLATIONS
 151         CharacterSet("special", "_") +
 152 #endif
 153         CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 154
 155     SBuf str;
 156     if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
 157         const auto protocol = AnyP::UriScheme::FindProtocolType(str);
 158         if (protocol == AnyP::PROTO_UNKNOWN)
 159             return AnyP::UriScheme(protocol, str.c_str());
 160         return AnyP::UriScheme(protocol, nullptr);
 161     }
 162
 163     throw TextException("invalid URI scheme", Here());
 164 }
 165
 166 /**
 167  * Appends configured append_domain to hostname, assuming
 168  * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
 169  * and that the host FQDN is not a 'dotless' TLD.
 170  *
 171  * \returns false if and only if there is not enough space to append
 172  */
 173 bool
 174 urlAppendDomain(char *host)
 175 {
 176     /* For IPv4 addresses check for a dot */
 177     /* For IPv6 addresses also check for a colon */
 178     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
 179         const uint64_t dlen = strlen(host);
 180         const uint64_t want = dlen + Config.appendDomainLen;
 181         if (want > SQUIDHOSTNAMELEN - 1) {
 182             debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
 183             return false;
 184         }
 185         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
 186     }
 187     return true;
 188 }
 189
/*
 * Parse a URI/URL into scheme, user-info, host, port and path, and store
 * the result in this object.
 *
 * It is assumed that the URL is complete -
 * i.e., the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (i.e., it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * Returns true when the URL was accepted and stored; false when it was
 * rejected. Exceptions thrown while parsing (e.g. by uriParseScheme() or
 * parseUrn()) are caught here, logged, and reported as false.
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        // scratch buffers, each MAX_URL bytes; the length check below keeps the input within that size
        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = NULL;
        char *q = NULL;
        int foundPort;
        int l;
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // reserve room for a possible append_domain suffix as well
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // asterisk-form request target ("OPTIONS *" / "TRACE *")
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            /*
             * RFC 7230 section 5.3.3:  authority-form = authority
             *  "excluding any userinfo and its "@" delimiter"
             *
             * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
             *
             * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
             */
            foundPort = 443;

            // XXX: use tokenizer
            auto B = tok.buf();
            const char *url = B.c_str();

            // try the bracketed "[ipv6]:port" form first, then plain "host:port";
            // the port is optional in both (sscanf() may return 1)
            if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
                if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
                    return false;

        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            // everything after "scheme://"
            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            // NOTE: l is the length of the whole rawUrl, while src starts after the
            // scheme prefix; the '\0' test is what stops the scan at the end of url.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above stops once i reaches l, so this test looks unreachable.
            if (i > l)
                return false;
            *dst = '\0';

            // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
            if (*src == '?' || *src == '#' || *src == '\0') {
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            // NOTE(review): as above, i cannot exceed l after the loop.
            if (i > l)
                return false;
            /* If the URL path is empty we set it to be "/" */
            if (dst == urlpath) {
                *dst = '/';
                ++dst;
            }
            *dst = '\0';

            foundPort = scheme.defaultPort(); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // split "login@host" at the LAST '@': the left part is the login, the right part the host
            t = strrchr(foundHost, '@');
            if (t != NULL) {
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                l = strlen(foundHost);
                i = 1;
                // shift the bracketed text one byte left, overwriting the '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one ':' means the last one is not treated as a port delimiter
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = NULL;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // cut the port off the host text; the range check on foundPort happens further below
            // NOTE(review): atoi() stops at the first non-digit, so ":80abc" is read as port 80.
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                foundPort = atoi(t);
            }
        }

        // hostnames are stored in lower case
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // with uri_whitespace set to strip, squeeze whitespace out of the host in place;
        // other settings leave the host untouched here
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional character whitelist check on the hostname
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

#if HARDCODE_DENY_PORTS
        /* These ports are filtered in the default squid.conf, but
         * maybe someone wants them hardcoded... */
        if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
            debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
            return false;
        }
#endif

        // apply the configured uri_whitespace policy to the path
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // truncate at the first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // remove every whitespace character in place
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // everything checked out: commit the parsed components
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
 476
 477 /**
 478  * Governed by RFC 8141 section 2:
 479  *
 480  *  assigned-name = "urn" ":" NID ":" NSS
 481  *  NID           = (alphanum) 0*30(ldh) (alphanum)
 482  *  ldh           = alphanum / "-"
 483  *  NSS           = pchar *(pchar / "/")
 484  *
 485  * RFC 3986 Appendix D.2 defines (as deprecated):
 486  *
 487  *   alphanum     = ALPHA / DIGIT
 488  *
 489  * Notice that NID is exactly 2-32 characters in length.
 490  */
 491 void
 492 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
 493 {
 494     static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
 495     static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
 496     SBuf nid;
 497     if (!tok.prefix(nid, nidChars, 32))
 498         throw TextException("NID not found", Here());
 499
 500     if (!tok.skip(':'))
 501         throw TextException("NID too long or missing ':' delimiter", Here());
 502
 503     if (nid.length() < 2)
 504         throw TextException("NID too short", Here());
 505
 506     if (!alphanum[*nid.begin()])
 507         throw TextException("NID prefix is not alphanumeric", Here());
 508
 509     if (!alphanum[*nid.rbegin()])
 510         throw TextException("NID suffix is not alphanumeric", Here());
 511
 512     setScheme(AnyP::PROTO_URN, nullptr);
 513     host(nid.c_str());
 514     // TODO validate path characters
 515     path(tok.remaining());
 516     debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
 517 }
 518
 519 void
 520 AnyP::Uri::touch()
 521 {
 522     absolute_.clear();
 523     authorityHttp_.clear();
 524     authorityWithPort_.clear();
 525 }
 526
 527 SBuf &
 528 AnyP::Uri::authority(bool requirePort) const
 529 {
 530     if (authorityHttp_.isEmpty()) {
 531
 532         // both formats contain Host/IP
 533         authorityWithPort_.append(host());
 534         authorityHttp_ = authorityWithPort_;
 535
 536         // authorityForm_ only has :port if it is non-default
 537         authorityWithPort_.appendf(":%u",port());
 538         if (port() != getScheme().defaultPort())
 539             authorityHttp_ = authorityWithPort_;
 540     }
 541
 542     return requirePort ? authorityWithPort_ : authorityHttp_;
 543 }
 544
 545 SBuf &
 546 AnyP::Uri::absolute() const
 547 {
 548     if (absolute_.isEmpty()) {
 549         // TODO: most URL will be much shorter, avoid allocating this much
 550         absolute_.reserveCapacity(MAX_URL);
 551
 552         absolute_.append(getScheme().image());
 553         absolute_.append(":",1);
 554         if (getScheme() != AnyP::PROTO_URN) {
 555             absolute_.append("//", 2);
 556             const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
 557                                        getScheme() == AnyP::PROTO_UNKNOWN;
 558
 559             if (allowUserInfo && !userInfo().isEmpty()) {
 560                 absolute_.append(userInfo());
 561                 absolute_.append("@", 1);
 562             }
 563             absolute_.append(authority());
 564         } else {
 565             absolute_.append(host());
 566             absolute_.append(":", 1);
 567         }
 568         absolute_.append(path());
 569     }
 570
 571     return absolute_;
 572 }
 573
 574 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 575  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 576  *        and never copy the query-string part in the first place
 577  */
 578 char *
 579 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
 580 {
 581     LOCAL_ARRAY(char, buf, MAX_URL);
 582
 583     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
 584     buf[sizeof(buf)-1] = '\0';
 585
 586     // URN, CONNECT method, and non-stripped URIs can go straight out
 587     if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
 588         // strip anything AFTER a question-mark
 589         // leaving the '?' in place
 590         if (auto t = strchr(buf, '?')) {
 591             *(++t) = '\0';
 592         }
 593     }
 594
 595     if (stringHasCntl(buf))
 596         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 597
 598     return buf;
 599 }
 600
 601 /**
 602  * Yet another alternative to urlCanonical.
 603  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 604  * for use in error page outputs.
 605  * Luckily we can leverage the others instead of duplicating.
 606  */
 607 const char *
 608 urlCanonicalFakeHttps(const HttpRequest * request)
 609 {
 610     LOCAL_ARRAY(char, buf, MAX_URL);
 611
 612     // method CONNECT and port HTTPS
 613     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 614         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 615         return buf;
 616     }
 617
 618     // else do the normal complete canonical thing.
 619     return request->canonicalCleanUrl();
 620 }
 621
 622 /*
 623  * Test if a URL is relative.
 624  *
 625  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 626  * appear before a ':'.
 627  */
 628 bool
 629 urlIsRelative(const char *url)
 630 {
 631     const char *p;
 632
 633     if (url == NULL) {
 634         return (false);
 635     }
 636     if (*url == '\0') {
 637         return (false);
 638     }
 639
 640     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 641
 642     if (*p == ':') {
 643         return (false);
 644     }
 645     return (true);
 646 }
 647
 648 /*
 649  * Convert a relative URL to an absolute URL using the context of a given
 650  * request.
 651  *
 652  * It is assumed that you have already ensured that the URL is relative.
 653  *
 654  * If NULL is returned it is an indication that the method in use in the
 655  * request does not distinguish between relative and absolute and you should
 656  * use the url unchanged.
 657  *
 658  * If non-NULL is returned, it is up to the caller to free the resulting
 659  * memory using safe_free().
 660  */
 661 char *
 662 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 663 {
 664
 665     if (req->method.id() == Http::METHOD_CONNECT) {
 666         return (NULL);
 667     }
 668
 669     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 670
 671     if (req->url.getScheme() == AnyP::PROTO_URN) {
 672         // XXX: this is what the original code did, but it seems to break the
 673         // intended behaviour of this function. It returns the stored URN path,
 674         // not converting the given one into a URN...
 675         snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
 676         return (urlbuf);
 677     }
 678
 679     SBuf authorityForm = req->url.authority(); // host[:port]
 680     const SBuf &scheme = req->url.getScheme().image();
 681     size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
 682                              SQUIDSBUFPRINT(scheme),
 683                              SQUIDSBUFPRINT(req->url.userInfo()),
 684                              !req->url.userInfo().isEmpty() ? "@" : "",
 685                              SQUIDSBUFPRINT(authorityForm));
 686
 687     // if the first char is '/' assume its a relative path
 688     // XXX: this breaks on scheme-relative URLs,
 689     // but we should not see those outside ESI, and rarely there.
 690     // XXX: also breaks on any URL containing a '/' in the query-string portion
 691     if (relUrl[0] == '/') {
 692         xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 693     } else {
 694         SBuf path = req->url.path();
 695         SBuf::size_type lastSlashPos = path.rfind('/');
 696
 697         if (lastSlashPos == SBuf::npos) {
 698             // replace the whole path with the given bit(s)
 699             urlbuf[urllen] = '/';
 700             ++urllen;
 701             xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 702         } else {
 703             // replace only the last (file?) segment with the given bit(s)
 704             ++lastSlashPos;
 705             if (lastSlashPos > MAX_URL - urllen - 1) {
 706                 // XXX: crops bits in the middle of the combined URL.
 707                 lastSlashPos = MAX_URL - urllen - 1;
 708             }
 709             SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
 710             urllen += lastSlashPos;
 711             if (urllen + 1 < MAX_URL) {
 712                 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 713             }
 714         }
 715     }
 716
 717     return (urlbuf);
 718 }
 719
 720 int
 721 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
 722 {
 723     int dl;
 724     int hl;
 725
 726     const bool hostIncludesSubdomains = (*h == '.');
 727     while ('.' == *h)
 728         ++h;
 729
 730     hl = strlen(h);
 731
 732     if (hl == 0)
 733         return -1;
 734
 735     dl = strlen(d);
 736
 737     /*
 738      * Start at the ends of the two strings and work towards the
 739      * beginning.
 740      */
 741     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 742         if (hl == 0 && dl == 0) {
 743             /*
 744              * We made it all the way to the beginning of both
 745              * strings without finding any difference.
 746              */
 747             return 0;
 748         }
 749
 750         if (0 == hl) {
 751             /*
 752              * The host string is shorter than the domain string.
 753              * There is only one case when this can be a match.
 754              * If the domain is just one character longer, and if
 755              * that character is a leading '.' then we call it a
 756              * match.
 757              */
 758
 759             if (1 == dl && '.' == d[0])
 760                 return 0;
 761             else
 762                 return -1;
 763         }
 764
 765         if (0 == dl) {
 766             /*
 767              * The domain string is shorter than the host string.
 768              * This is a match only if the first domain character
 769              * is a leading '.'.
 770              */
 771
 772             if ('.' == d[0]) {
 773                 if (flags & mdnRejectSubsubDomains) {
 774                     // Check for sub-sub domain and reject
 775                     while(--hl >= 0 && h[hl] != '.');
 776                     if (hl < 0) {
 777                         // No sub-sub domain found, but reject if there is a
 778                         // leading dot in given host string (which is removed
 779                         // before the check is started).
 780                         return hostIncludesSubdomains ? 1 : 0;
 781                     } else
 782                         return 1; // sub-sub domain, reject
 783                 } else
 784                     return 0;
 785             } else
 786                 return 1;
 787         }
 788     }
 789
 790     /*
 791      * We found different characters in the same position (from the end).
 792      */
 793
 794     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 795     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 796     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 797     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 798         return 0;
 799
 800     /*
 801      * If one of those character is '.' then its special.  In order
 802      * for splay tree sorting to work properly, "x-foo.com" must
 803      * be greater than ".foo.com" even though '-' is less than '.'.
 804      */
 805     if ('.' == d[dl])
 806         return 1;
 807
 808     if ('.' == h[hl])
 809         return -1;
 810
 811     return (xtolower(h[hl]) - xtolower(d[dl]));
 812 }
 813
 814 /*
 815  * return true if we can serve requests for this method.
 816  */
 817 int
 818 urlCheckRequest(const HttpRequest * r)
 819 {
 820     int rc = 0;
 821     /* protocol "independent" methods
 822      *
 823      * actually these methods are specific to HTTP:
 824      * they are methods we recieve on our HTTP port,
 825      * and if we had a FTP listener would not be relevant
 826      * there.
 827      *
 828      * So, we should delegate them to HTTP. The problem is that we
 829      * do not have a default protocol from the client side of HTTP.
 830      */
 831
 832     if (r->method == Http::METHOD_CONNECT)
 833         return 1;
 834
 835     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 836     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 837     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 838         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
 839
 840     if (r->method == Http::METHOD_PURGE)
 841         return 1;
 842
 843     /* does method match the protocol? */
 844     switch (r->url.getScheme()) {
 845
 846     case AnyP::PROTO_URN:
 847
 848     case AnyP::PROTO_HTTP:
 849
 850     case AnyP::PROTO_CACHE_OBJECT:
 851         rc = 1;
 852         break;
 853
 854     case AnyP::PROTO_FTP:
 855
 856         if (r->method == Http::METHOD_PUT)
 857             rc = 1;
 858
 859     case AnyP::PROTO_GOPHER:
 860
 861     case AnyP::PROTO_WAIS:
 862
 863     case AnyP::PROTO_WHOIS:
 864         if (r->method == Http::METHOD_GET)
 865             rc = 1;
 866         else if (r->method == Http::METHOD_HEAD)
 867             rc = 1;
 868
 869         break;
 870
 871     case AnyP::PROTO_HTTPS:
 872 #if USE_OPENSSL
 873         rc = 1;
 874 #elif USE_GNUTLS
 875         rc = 1;
 876 #else
 877         /*
 878         * Squid can't originate an SSL connection, so it should
 879         * never receive an "https:" URL.  It should always be
 880         * CONNECT instead.
 881         */
 882         rc = 0;
 883 #endif
 884         break;
 885
 886     default:
 887         break;
 888     }
 889
 890     return rc;
 891 }
 892
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy up to SQUIDHOSTNAMELEN bytes into Host[]
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@'
 */
 901
 902 class URLHostName
 903 {
 904
 905 public:
 906     char * extract(char const *url);
 907
 908 private:
 909     static char Host [SQUIDHOSTNAMELEN];
 910     void init(char const *);
 911     void findHostStart();
 912     void trimTrailingChars();
 913     void trimAuth();
 914     char const *hostStart;
 915     char const *url;
 916 };
 917
 918 char *
 919 urlHostname(const char *url)
 920 {
 921     return URLHostName().extract(url);
 922 }
 923
 924 char URLHostName::Host[SQUIDHOSTNAMELEN];
 925
 926 void
 927 URLHostName::init(char const *aUrl)
 928 {
 929     Host[0] = '\0';
 930     url = aUrl;
 931 }
 932
 933 void
 934 URLHostName::findHostStart()
 935 {
 936     if (NULL == (hostStart = strchr(url, ':')))
 937         return;
 938
 939     ++hostStart;
 940
 941     while (*hostStart != '\0' && *hostStart == '/')
 942         ++hostStart;
 943
 944     if (*hostStart == ']')
 945         ++hostStart;
 946 }
 947
 948 void
 949 URLHostName::trimTrailingChars()
 950 {
 951     char *t;
 952
 953     if ((t = strchr(Host, '/')))
 954         *t = '\0';
 955
 956     if ((t = strrchr(Host, ':')))
 957         *t = '\0';
 958
 959     if ((t = strchr(Host, ']')))
 960         *t = '\0';
 961 }
 962
 963 void
 964 URLHostName::trimAuth()
 965 {
 966     char *t;
 967
 968     if ((t = strrchr(Host, '@'))) {
 969         ++t;
 970         memmove(Host, t, strlen(t) + 1);
 971     }
 972 }
 973
 974 char *
 975 URLHostName::extract(char const *aUrl)
 976 {
 977     init(aUrl);
 978     findHostStart();
 979
 980     if (hostStart == NULL)
 981         return NULL;
 982
 983     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 984
 985     trimTrailingChars();
 986
 987     trimAuth();
 988
 989     return Host;
 990 }
 991
 992 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
 993     scheme_(aScheme),
 994     hostIsNumeric_(false),
 995     port_(0)
 996 {
 997     *host_=0;
 998 }
 999
1000 // TODO: fix code duplication with AnyP::Uri::parse()
1001 char *
1002 AnyP::Uri::cleanup(const char *uri)
1003 {
1004     int flags = 0;
1005     char *cleanedUri = nullptr;
1006     switch (Config.uri_whitespace) {
1007     case URI_WHITESPACE_ALLOW:
1008         flags |= RFC1738_ESCAPE_NOSPACE;
1009     // fall through to next case
1010     case URI_WHITESPACE_ENCODE:
1011         flags |= RFC1738_ESCAPE_UNESCAPED;
1012         cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1013         break;
1014
1015     case URI_WHITESPACE_CHOP: {
1016         flags |= RFC1738_ESCAPE_UNESCAPED;
1017         const auto pos = strcspn(uri, w_space);
1018         char *choppedUri = nullptr;
1019         if (pos < strlen(uri))
1020             choppedUri = xstrndup(uri, pos + 1);
1021         cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
1022         cleanedUri[pos] = '\0';
1023         xfree(choppedUri);
1024     }
1025     break;
1026
1027     case URI_WHITESPACE_DENY:
1028     case URI_WHITESPACE_STRIP:
1029     default: {
1030         // TODO: avoid duplication with urlParse()
1031         const char *t;
1032         char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1033         char *q = tmp_uri;
1034         t = uri;
1035         while (*t) {
1036             if (!xisspace(*t)) {
1037                 *q = *t;
1038                 ++q;
1039             }
1040             ++t;
1041         }
1042         *q = '\0';
1043         cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1044         xfree(tmp_uri);
1045     }
1046     break;
1047     }
1048
1049     assert(cleanedUri);
1050     return cleanedUri;
1051 }
1052