src/anyp/Uri.cc

   1 /*
   2  * Copyright (C) 1996-2018 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "anyp/Uri.h"
  13 #include "globals.h"
  14 #include "HttpRequest.h"
  15 #include "rfc1738.h"
  16 #include "SquidConfig.h"
  17 #include "SquidString.h"
  18
  19 static const char valid_hostname_chars_u[] =
  20     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  21     "abcdefghijklmnopqrstuvwxyz"
  22     "0123456789-._"
  23     "[:]"
  24     ;
  25 static const char valid_hostname_chars[] =
  26     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  27     "abcdefghijklmnopqrstuvwxyz"
  28     "0123456789-."
  29     "[:]"
  30     ;
  31
  32 const SBuf &
  33 AnyP::Uri::Asterisk()
  34 {
  35     static SBuf star("*");
  36     return star;
  37 }
  38
  39 const SBuf &
  40 AnyP::Uri::SlashPath()
  41 {
  42     static SBuf slash("/");
  43     return slash;
  44 }
  45
  46 void
  47 AnyP::Uri::host(const char *src)
  48 {
  49     hostAddr_.setEmpty();
  50     hostAddr_ = src;
  51     if (hostAddr_.isAnyAddr()) {
  52         xstrncpy(host_, src, sizeof(host_));
  53         hostIsNumeric_ = false;
  54     } else {
  55         hostAddr_.toHostStr(host_, sizeof(host_));
  56         debugs(23, 3, "given IP: " << hostAddr_);
  57         hostIsNumeric_ = 1;
  58     }
  59     touch();
  60 }
  61
  62 const SBuf &
  63 AnyP::Uri::path() const
  64 {
  65     // RFC 3986 section 3.3 says path can be empty (path-abempty).
  66     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
  67     // at least when sending and using. We must still accept path-abempty as input.
  68     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
  69         return SlashPath();
  70
  71     return path_;
  72 }
  73
  74 void
  75 urlInitialize(void)
  76 {
  77     debugs(23, 5, "urlInitialize: Initializing...");
  78     /* this ensures that the number of protocol strings is the same as
  79      * the enum slots allocated because the last enum is always 'MAX'.
  80      */
  81     assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
  82     /*
  83      * These test that our matchDomainName() function works the
  84      * way we expect it to.
  85      */
  86     assert(0 == matchDomainName("foo.com", "foo.com"));
  87     assert(0 == matchDomainName(".foo.com", "foo.com"));
  88     assert(0 == matchDomainName("foo.com", ".foo.com"));
  89     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  90     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
  91     assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
  92     assert(0 != matchDomainName("x.foo.com", "foo.com"));
  93     assert(0 != matchDomainName("foo.com", "x.foo.com"));
  94     assert(0 != matchDomainName("bar.com", "foo.com"));
  95     assert(0 != matchDomainName(".bar.com", "foo.com"));
  96     assert(0 != matchDomainName(".bar.com", ".foo.com"));
  97     assert(0 != matchDomainName("bar.com", ".foo.com"));
  98     assert(0 < matchDomainName("zzz.com", "foo.com"));
  99     assert(0 > matchDomainName("aaa.com", "foo.com"));
 100     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 101     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 102     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 103     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 104
 105     assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
 106     assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 107     assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 108     assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 109
 110     assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
 111     assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
 112     assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
 113     assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
 114
 115     /* more cases? */
 116 }
 117
 118 /**
 119  * Parse the scheme name from string b, into protocol type.
 120  * The string must be 0-terminated.
 121  */
 122 AnyP::ProtocolType
 123 urlParseProtocol(const char *b)
 124 {
 125     // make e point to the ':' character
 126     const char *e = b + strcspn(b, ":");
 127     int len = e - b;
 128
 129     /* test common stuff first */
 130
 131     if (strncasecmp(b, "http", len) == 0)
 132         return AnyP::PROTO_HTTP;
 133
 134     if (strncasecmp(b, "ftp", len) == 0)
 135         return AnyP::PROTO_FTP;
 136
 137     if (strncasecmp(b, "https", len) == 0)
 138         return AnyP::PROTO_HTTPS;
 139
 140     if (strncasecmp(b, "file", len) == 0)
 141         return AnyP::PROTO_FTP;
 142
 143     if (strncasecmp(b, "coap", len) == 0)
 144         return AnyP::PROTO_COAP;
 145
 146     if (strncasecmp(b, "coaps", len) == 0)
 147         return AnyP::PROTO_COAPS;
 148
 149     if (strncasecmp(b, "gopher", len) == 0)
 150         return AnyP::PROTO_GOPHER;
 151
 152     if (strncasecmp(b, "wais", len) == 0)
 153         return AnyP::PROTO_WAIS;
 154
 155     if (strncasecmp(b, "cache_object", len) == 0)
 156         return AnyP::PROTO_CACHE_OBJECT;
 157
 158     if (strncasecmp(b, "urn", len) == 0)
 159         return AnyP::PROTO_URN;
 160
 161     if (strncasecmp(b, "whois", len) == 0)
 162         return AnyP::PROTO_WHOIS;
 163
 164     if (len > 0)
 165         return AnyP::PROTO_UNKNOWN;
 166
 167     return AnyP::PROTO_NONE;
 168 }
 169
 170 /*
 171  * Parse a URI/URL.
 172  *
 173  * Stores parsed values in the `request` argument.
 174  *
 175  * This abuses HttpRequest as a way of representing the parsed url
 176  * and its components.
 177  * method is used to switch parsers and to init the HttpRequest.
 178  * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
 179  * looked for.
 180  * The url is non const so that if its too long we can NULL-terminate it in place.
 181  */
 182
 183 /*
 184  * This routine parses a URL. Its assumed that the URL is complete -
 185  * ie, the end of the string is the end of the URL. Don't pass a partial
 186  * URL here as this routine doesn't have any way of knowing whether
 187  * its partial or not (ie, it handles the case of no trailing slash as
 188  * being "end of host with implied path of /".
 189  */
 190 bool
 191 AnyP::Uri::parse(const HttpRequestMethod& method, const char *url)
 192 {
 193     LOCAL_ARRAY(char, proto, MAX_URL);
 194     LOCAL_ARRAY(char, login, MAX_URL);
 195     LOCAL_ARRAY(char, foundHost, MAX_URL);
 196     LOCAL_ARRAY(char, urlpath, MAX_URL);
 197     char *t = NULL;
 198     char *q = NULL;
 199     int foundPort;
 200     AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
 201     int l;
 202     int i;
 203     const char *src;
 204     char *dst;
 205     proto[0] = foundHost[0] = urlpath[0] = login[0] = '\0';
 206
 207     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 208         debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
 209         return false;
 210     }
 211     if (method == Http::METHOD_CONNECT) {
 212         /*
 213          * RFC 7230 section 5.3.3:  authority-form = authority
 214          *  "excluding any userinfo and its "@" delimiter"
 215          *
 216          * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
 217          *
 218          * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
 219          */
 220         foundPort = 443;
 221
 222         if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
 223             if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
 224                 return false;
 225
 226     } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
 227                AnyP::Uri::Asterisk().cmp(url) == 0) {
 228         parseFinish(AnyP::PROTO_HTTP, nullptr, url, foundHost, SBuf(), 80 /* HTTP default port */);
 229         return true;
 230     } else if (strncmp(url, "urn:", 4) == 0) {
 231         debugs(23, 3, "Split URI '" << url << "' into proto='urn', path='" << (url+4) << "'");
 232         debugs(50, 5, "urn=" << (url+4));
 233         setScheme(AnyP::PROTO_URN, nullptr);
 234         path(url + 4);
 235         return true;
 236     } else {
 237         /* Parse the URL: */
 238         src = url;
 239         i = 0;
 240         /* Find first : - everything before is protocol */
 241         for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
 242             *dst = *src;
 243         }
 244         if (i >= l)
 245             return false;
 246         *dst = '\0';
 247
 248         /* Then its :// */
 249         if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
 250             return false;
 251         i += 3;
 252         src += 3;
 253
 254         /* Then everything until first /; thats host (and port; which we'll look for here later) */
 255         // bug 1881: If we don't get a "/" then we imply it was there
 256         // bug 3074: We could just be given a "?" or "#". These also imply "/"
 257         // bug 3233: whitespace is also a hostname delimiter.
 258         for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
 259             *dst = *src;
 260         }
 261
 262         /*
 263          * We can't check for "i >= l" here because we could be at the end of the line
 264          * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
 265          * been -given- a valid URL and the path is just '/'.
 266          */
 267         if (i > l)
 268             return false;
 269         *dst = '\0';
 270
 271         // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
 272         if (*src == '?' || *src == '#' || *src == '\0') {
 273             urlpath[0] = '/';
 274             dst = &urlpath[1];
 275         } else {
 276             dst = urlpath;
 277         }
 278         /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
 279         for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
 280             *dst = *src;
 281         }
 282
 283         /* We -could- be at the end of the buffer here */
 284         if (i > l)
 285             return false;
 286         /* If the URL path is empty we set it to be "/" */
 287         if (dst == urlpath) {
 288             *dst = '/';
 289             ++dst;
 290         }
 291         *dst = '\0';
 292
 293         protocol = urlParseProtocol(proto);
 294         foundPort = AnyP::UriScheme(protocol).defaultPort();
 295
 296         /* Is there any login information? (we should eventually parse it above) */
 297         t = strrchr(foundHost, '@');
 298         if (t != NULL) {
 299             strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
 300             login[sizeof(login)-1] = '\0';
 301             t = strrchr(login, '@');
 302             *t = 0;
 303             strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
 304             foundHost[sizeof(foundHost)-1] = '\0';
 305             // Bug 4498: URL-unescape the login info after extraction
 306             rfc1738_unescape(login);
 307         }
 308
 309         /* Is there any host information? (we should eventually parse it above) */
 310         if (*foundHost == '[') {
 311             /* strip any IPA brackets. valid under IPv6. */
 312             dst = foundHost;
 313             /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
 314             src = foundHost;
 315             ++src;
 316             l = strlen(foundHost);
 317             i = 1;
 318             for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
 319                 *dst = *src;
 320             }
 321
 322             /* we moved in-place, so truncate the actual hostname found */
 323             *dst = '\0';
 324             ++dst;
 325
 326             /* skip ahead to either start of port, or original EOS */
 327             while (*dst != '\0' && *dst != ':')
 328                 ++dst;
 329             t = dst;
 330         } else {
 331             t = strrchr(foundHost, ':');
 332
 333             if (t != strchr(foundHost,':') ) {
 334                 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
 335                 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
 336                 /* therefore we MUST accept the case where they are not bracketed at all. */
 337                 t = NULL;
 338             }
 339         }
 340
 341         // Bug 3183 sanity check: If scheme is present, host must be too.
 342         if (protocol != AnyP::PROTO_NONE && foundHost[0] == '\0') {
 343             debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
 344             return false;
 345         }
 346
 347         if (t && *t == ':') {
 348             *t = '\0';
 349             ++t;
 350             foundPort = atoi(t);
 351         }
 352     }
 353
 354     for (t = foundHost; *t; ++t)
 355         *t = xtolower(*t);
 356
 357     if (stringHasWhitespace(foundHost)) {
 358         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 359             t = q = foundHost;
 360             while (*t) {
 361                 if (!xisspace(*t)) {
 362                     *q = *t;
 363                     ++q;
 364                 }
 365                 ++t;
 366             }
 367             *q = '\0';
 368         }
 369     }
 370
 371     debugs(23, 3, "Split URL '" << url << "' into proto='" << proto << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
 372
 373     if (Config.onoff.check_hostnames &&
 374             strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
 375         debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
 376         return false;
 377     }
 378
 379     /* For IPV6 addresses also check for a colon */
 380     if (Config.appendDomain && !strchr(foundHost, '.') && !strchr(foundHost, ':'))
 381         strncat(foundHost, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(foundHost) - 1);
 382
 383     /* remove trailing dots from hostnames */
 384     while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
 385         foundHost[l] = '\0';
 386
 387     /* reject duplicate or leading dots */
 388     if (strstr(foundHost, "..") || *foundHost == '.') {
 389         debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
 390         return false;
 391     }
 392
 393     if (foundPort < 1 || foundPort > 65535) {
 394         debugs(23, 3, "Invalid port '" << foundPort << "'");
 395         return false;
 396     }
 397
 398 #if HARDCODE_DENY_PORTS
 399     /* These ports are filtered in the default squid.conf, but
 400      * maybe someone wants them hardcoded... */
 401     if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
 402         debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
 403         return false;
 404     }
 405 #endif
 406
 407     if (stringHasWhitespace(urlpath)) {
 408         debugs(23, 2, "URI has whitespace: {" << url << "}");
 409
 410         switch (Config.uri_whitespace) {
 411
 412         case URI_WHITESPACE_DENY:
 413             return false;
 414
 415         case URI_WHITESPACE_ALLOW:
 416             break;
 417
 418         case URI_WHITESPACE_ENCODE:
 419             t = rfc1738_escape_unescaped(urlpath);
 420             xstrncpy(urlpath, t, MAX_URL);
 421             break;
 422
 423         case URI_WHITESPACE_CHOP:
 424             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 425             break;
 426
 427         case URI_WHITESPACE_STRIP:
 428         default:
 429             t = q = urlpath;
 430             while (*t) {
 431                 if (!xisspace(*t)) {
 432                     *q = *t;
 433                     ++q;
 434                 }
 435                 ++t;
 436             }
 437             *q = '\0';
 438         }
 439     }
 440
 441     parseFinish(protocol, proto, urlpath, foundHost, SBuf(login), foundPort);
 442     return true;
 443 }
 444
 445 /// Update the URL object with parsed URI data.
 446 void
 447 AnyP::Uri::parseFinish(const AnyP::ProtocolType protocol,
 448                        const char *const protoStr, // for unknown protocols
 449                        const char *const aUrlPath,
 450                        const char *const aHost,
 451                        const SBuf &aLogin,
 452                        const int aPort)
 453 {
 454     setScheme(protocol, protoStr);
 455     path(aUrlPath);
 456     host(aHost);
 457     userInfo(aLogin);
 458     port(aPort);
 459 }
 460
 461 void
 462 AnyP::Uri::touch()
 463 {
 464     absolute_.clear();
 465     authorityHttp_.clear();
 466     authorityWithPort_.clear();
 467 }
 468
 469 SBuf &
 470 AnyP::Uri::authority(bool requirePort) const
 471 {
 472     if (authorityHttp_.isEmpty()) {
 473
 474         // both formats contain Host/IP
 475         authorityWithPort_.append(host());
 476         authorityHttp_ = authorityWithPort_;
 477
 478         // authorityForm_ only has :port if it is non-default
 479         authorityWithPort_.appendf(":%u",port());
 480         if (port() != getScheme().defaultPort())
 481             authorityHttp_ = authorityWithPort_;
 482     }
 483
 484     return requirePort ? authorityWithPort_ : authorityHttp_;
 485 }
 486
 487 SBuf &
 488 AnyP::Uri::absolute() const
 489 {
 490     if (absolute_.isEmpty()) {
 491         // TODO: most URL will be much shorter, avoid allocating this much
 492         absolute_.reserveCapacity(MAX_URL);
 493
 494         absolute_.append(getScheme().image());
 495         absolute_.append(":",1);
 496         if (getScheme() != AnyP::PROTO_URN) {
 497             absolute_.append("//", 2);
 498             const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
 499                                       getScheme() != AnyP::PROTO_HTTPS ||
 500                                       userInfo().isEmpty();
 501             if (!omitUserInfo) {
 502                 absolute_.append(userInfo());
 503                 absolute_.append("@", 1);
 504             }
 505             absolute_.append(authority());
 506         }
 507         absolute_.append(path());
 508     }
 509
 510     return absolute_;
 511 }
 512
 513 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 514  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 515  *        and never copy the query-string part in the first place
 516  */
 517 char *
 518 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
 519 {
 520     LOCAL_ARRAY(char, buf, MAX_URL);
 521
 522     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
 523     buf[sizeof(buf)-1] = '\0';
 524
 525     // URN, CONNECT method, and non-stripped URIs can go straight out
 526     if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
 527         // strip anything AFTER a question-mark
 528         // leaving the '?' in place
 529         if (auto t = strchr(buf, '?')) {
 530             *(++t) = '\0';
 531         }
 532     }
 533
 534     if (stringHasCntl(buf))
 535         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 536
 537     return buf;
 538 }
 539
 540 /**
 541  * Yet another alternative to urlCanonical.
 542  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 543  * for use in error page outputs.
 544  * Luckily we can leverage the others instead of duplicating.
 545  */
 546 const char *
 547 urlCanonicalFakeHttps(const HttpRequest * request)
 548 {
 549     LOCAL_ARRAY(char, buf, MAX_URL);
 550
 551     // method CONNECT and port HTTPS
 552     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 553         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 554         return buf;
 555     }
 556
 557     // else do the normal complete canonical thing.
 558     return request->canonicalCleanUrl();
 559 }
 560
 561 /*
 562  * Test if a URL is relative.
 563  *
 564  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 565  * appear before a ':'.
 566  */
 567 bool
 568 urlIsRelative(const char *url)
 569 {
 570     const char *p;
 571
 572     if (url == NULL) {
 573         return (false);
 574     }
 575     if (*url == '\0') {
 576         return (false);
 577     }
 578
 579     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 580
 581     if (*p == ':') {
 582         return (false);
 583     }
 584     return (true);
 585 }
 586
 587 /*
 588  * Convert a relative URL to an absolute URL using the context of a given
 589  * request.
 590  *
 591  * It is assumed that you have already ensured that the URL is relative.
 592  *
 593  * If NULL is returned it is an indication that the method in use in the
 594  * request does not distinguish between relative and absolute and you should
 595  * use the url unchanged.
 596  *
 597  * If non-NULL is returned, it is up to the caller to free the resulting
 598  * memory using safe_free().
 599  */
 600 char *
 601 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 602 {
 603
 604     if (req->method.id() == Http::METHOD_CONNECT) {
 605         return (NULL);
 606     }
 607
 608     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 609
 610     if (req->url.getScheme() == AnyP::PROTO_URN) {
 611         // XXX: this is what the original code did, but it seems to break the
 612         // intended behaviour of this function. It returns the stored URN path,
 613         // not converting the given one into a URN...
 614         snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
 615         return (urlbuf);
 616     }
 617
 618     SBuf authorityForm = req->url.authority(); // host[:port]
 619     const SBuf &scheme = req->url.getScheme().image();
 620     size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
 621                              SQUIDSBUFPRINT(scheme),
 622                              SQUIDSBUFPRINT(req->url.userInfo()),
 623                              !req->url.userInfo().isEmpty() ? "@" : "",
 624                              SQUIDSBUFPRINT(authorityForm));
 625
 626     // if the first char is '/' assume its a relative path
 627     // XXX: this breaks on scheme-relative URLs,
 628     // but we should not see those outside ESI, and rarely there.
 629     // XXX: also breaks on any URL containing a '/' in the query-string portion
 630     if (relUrl[0] == '/') {
 631         xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 632     } else {
 633         SBuf path = req->url.path();
 634         SBuf::size_type lastSlashPos = path.rfind('/');
 635
 636         if (lastSlashPos == SBuf::npos) {
 637             // replace the whole path with the given bit(s)
 638             urlbuf[urllen] = '/';
 639             ++urllen;
 640             xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 641         } else {
 642             // replace only the last (file?) segment with the given bit(s)
 643             ++lastSlashPos;
 644             if (lastSlashPos > MAX_URL - urllen - 1) {
 645                 // XXX: crops bits in the middle of the combined URL.
 646                 lastSlashPos = MAX_URL - urllen - 1;
 647             }
 648             SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
 649             urllen += lastSlashPos;
 650             if (urllen + 1 < MAX_URL) {
 651                 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 652             }
 653         }
 654     }
 655
 656     return (urlbuf);
 657 }
 658
 659 int
 660 matchDomainName(const char *h, const char *d, uint flags)
 661 {
 662     int dl;
 663     int hl;
 664
 665     const bool hostIncludesSubdomains = (*h == '.');
 666     while ('.' == *h)
 667         ++h;
 668
 669     hl = strlen(h);
 670
 671     if (hl == 0)
 672         return -1;
 673
 674     dl = strlen(d);
 675
 676     /*
 677      * Start at the ends of the two strings and work towards the
 678      * beginning.
 679      */
 680     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 681         if (hl == 0 && dl == 0) {
 682             /*
 683              * We made it all the way to the beginning of both
 684              * strings without finding any difference.
 685              */
 686             return 0;
 687         }
 688
 689         if (0 == hl) {
 690             /*
 691              * The host string is shorter than the domain string.
 692              * There is only one case when this can be a match.
 693              * If the domain is just one character longer, and if
 694              * that character is a leading '.' then we call it a
 695              * match.
 696              */
 697
 698             if (1 == dl && '.' == d[0])
 699                 return 0;
 700             else
 701                 return -1;
 702         }
 703
 704         if (0 == dl) {
 705             /*
 706              * The domain string is shorter than the host string.
 707              * This is a match only if the first domain character
 708              * is a leading '.'.
 709              */
 710
 711             if ('.' == d[0]) {
 712                 if (flags & mdnRejectSubsubDomains) {
 713                     // Check for sub-sub domain and reject
 714                     while(--hl >= 0 && h[hl] != '.');
 715                     if (hl < 0) {
 716                         // No sub-sub domain found, but reject if there is a
 717                         // leading dot in given host string (which is removed
 718                         // before the check is started).
 719                         return hostIncludesSubdomains ? 1 : 0;
 720                     } else
 721                         return 1; // sub-sub domain, reject
 722                 } else
 723                     return 0;
 724             } else
 725                 return 1;
 726         }
 727     }
 728
 729     /*
 730      * We found different characters in the same position (from the end).
 731      */
 732
 733     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 734     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 735     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 736     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 737         return 0;
 738
 739     /*
 740      * If one of those character is '.' then its special.  In order
 741      * for splay tree sorting to work properly, "x-foo.com" must
 742      * be greater than ".foo.com" even though '-' is less than '.'.
 743      */
 744     if ('.' == d[dl])
 745         return 1;
 746
 747     if ('.' == h[hl])
 748         return -1;
 749
 750     return (xtolower(h[hl]) - xtolower(d[dl]));
 751 }
 752
 753 /*
 754  * return true if we can serve requests for this method.
 755  */
 756 int
 757 urlCheckRequest(const HttpRequest * r)
 758 {
 759     int rc = 0;
 760     /* protocol "independent" methods
 761      *
 762      * actually these methods are specific to HTTP:
 763      * they are methods we recieve on our HTTP port,
 764      * and if we had a FTP listener would not be relevant
 765      * there.
 766      *
 767      * So, we should delegate them to HTTP. The problem is that we
 768      * do not have a default protocol from the client side of HTTP.
 769      */
 770
 771     if (r->method == Http::METHOD_CONNECT)
 772         return 1;
 773
 774     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 775     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 776     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 777         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
 778
 779     if (r->method == Http::METHOD_PURGE)
 780         return 1;
 781
 782     /* does method match the protocol? */
 783     switch (r->url.getScheme()) {
 784
 785     case AnyP::PROTO_URN:
 786
 787     case AnyP::PROTO_HTTP:
 788
 789     case AnyP::PROTO_CACHE_OBJECT:
 790         rc = 1;
 791         break;
 792
 793     case AnyP::PROTO_FTP:
 794
 795         if (r->method == Http::METHOD_PUT)
 796             rc = 1;
 797
 798     case AnyP::PROTO_GOPHER:
 799
 800     case AnyP::PROTO_WAIS:
 801
 802     case AnyP::PROTO_WHOIS:
 803         if (r->method == Http::METHOD_GET)
 804             rc = 1;
 805         else if (r->method == Http::METHOD_HEAD)
 806             rc = 1;
 807
 808         break;
 809
 810     case AnyP::PROTO_HTTPS:
 811 #if USE_OPENSSL
 812         rc = 1;
 813 #elif USE_GNUTLS
 814         rc = 1;
 815 #else
 816         /*
 817         * Squid can't originate an SSL connection, so it should
 818         * never receive an "https:" URL.  It should always be
 819         * CONNECT instead.
 820         */
 821         rc = 0;
 822 #endif
 823         break;
 824
 825     default:
 826         break;
 827     }
 828
 829     return rc;
 830 }
 831
 832 /*
 833  * Quick-n-dirty host extraction from a URL.  Steps:
 834  *      Look for a colon
 835  *      Skip any '/' after the colon
 836  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 837  *      Look for an ending '/' or ':' and terminate
 838  *      Look for login info preceeded by '@'
 839  */
 840
 841 class URLHostName
 842 {
 843
 844 public:
 845     char * extract(char const *url);
 846
 847 private:
 848     static char Host [SQUIDHOSTNAMELEN];
 849     void init(char const *);
 850     void findHostStart();
 851     void trimTrailingChars();
 852     void trimAuth();
 853     char const *hostStart;
 854     char const *url;
 855 };
 856
 857 char *
 858 urlHostname(const char *url)
 859 {
 860     return URLHostName().extract(url);
 861 }
 862
 863 char URLHostName::Host[SQUIDHOSTNAMELEN];
 864
 865 void
 866 URLHostName::init(char const *aUrl)
 867 {
 868     Host[0] = '\0';
 869     url = aUrl;
 870 }
 871
 872 void
 873 URLHostName::findHostStart()
 874 {
 875     if (NULL == (hostStart = strchr(url, ':')))
 876         return;
 877
 878     ++hostStart;
 879
 880     while (*hostStart != '\0' && *hostStart == '/')
 881         ++hostStart;
 882
 883     if (*hostStart == ']')
 884         ++hostStart;
 885 }
 886
 887 void
 888 URLHostName::trimTrailingChars()
 889 {
 890     char *t;
 891
 892     if ((t = strchr(Host, '/')))
 893         *t = '\0';
 894
 895     if ((t = strrchr(Host, ':')))
 896         *t = '\0';
 897
 898     if ((t = strchr(Host, ']')))
 899         *t = '\0';
 900 }
 901
 902 void
 903 URLHostName::trimAuth()
 904 {
 905     char *t;
 906
 907     if ((t = strrchr(Host, '@'))) {
 908         ++t;
 909         memmove(Host, t, strlen(t) + 1);
 910     }
 911 }
 912
 913 char *
 914 URLHostName::extract(char const *aUrl)
 915 {
 916     init(aUrl);
 917     findHostStart();
 918
 919     if (hostStart == NULL)
 920         return NULL;
 921
 922     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 923
 924     trimTrailingChars();
 925
 926     trimAuth();
 927
 928     return Host;
 929 }
 930
 931 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
 932     scheme_(aScheme),
 933     hostIsNumeric_(false),
 934     port_(0)
 935 {
 936     *host_=0;
 937 }
 938
 939 // TODO: fix code duplication with AnyP::Uri::parse()
 940 char *
 941 AnyP::Uri::cleanup(const char *uri)
 942 {
 943     int flags = 0;
 944     char *cleanedUri = nullptr;
 945     switch (Config.uri_whitespace) {
 946     case URI_WHITESPACE_ALLOW:
 947         flags |= RFC1738_ESCAPE_NOSPACE;
 948     // fall through to next case
 949     case URI_WHITESPACE_ENCODE:
 950         flags |= RFC1738_ESCAPE_UNESCAPED;
 951         cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
 952         break;
 953
 954     case URI_WHITESPACE_CHOP: {
 955         flags |= RFC1738_ESCAPE_UNESCAPED;
 956         const auto pos = strcspn(uri, w_space);
 957         char *choppedUri = nullptr;
 958         if (pos < strlen(uri))
 959             choppedUri = xstrndup(uri, pos + 1);
 960         cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
 961         cleanedUri[pos] = '\0';
 962         xfree(choppedUri);
 963     }
 964     break;
 965
 966     case URI_WHITESPACE_DENY:
 967     case URI_WHITESPACE_STRIP:
 968     default: {
 969         // TODO: avoid duplication with urlParse()
 970         const char *t;
 971         char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
 972         char *q = tmp_uri;
 973         t = uri;
 974         while (*t) {
 975             if (!xisspace(*t)) {
 976                 *q = *t;
 977                 ++q;
 978             }
 979             ++t;
 980         }
 981         *q = '\0';
 982         cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
 983         xfree(tmp_uri);
 984     }
 985     break;
 986     }
 987
 988     assert(cleanedUri);
 989     return cleanedUri;
 990 }
 991