src/url.cc

   1 /*
   2  * Copyright (C) 1996-2016 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "globals.h"
  13 #include "HttpRequest.h"
  14 #include "rfc1738.h"
  15 #include "SquidConfig.h"
  16 #include "SquidString.h"
  17 #include "URL.h"
  18
/// Shared final step of urlParse(): stores the already split URL components
/// in the given (or a newly allocated) HttpRequest.
static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
                                   const AnyP::ProtocolType protocol,
                                   const char *const urlpath,
                                   const char *const host,
                                   const SBuf &login,
                                   const int port,
                                   HttpRequest *request);
/// Handles "urn:" identifiers on behalf of urlParse().
static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
/// Characters urlParse() accepts in a host name when
/// Config.onoff.allow_underscore is set: letters, digits, '-', '.', '_',
/// plus '[', ':' and ']'.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
/// Characters urlParse() accepts in a host name when underscores are not
/// allowed: the same set as valid_hostname_chars_u without '_'.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
  39
  40 const SBuf &
  41 URL::Asterisk()
  42 {
  43     static SBuf star("*");
  44     return star;
  45 }
  46
  47 const SBuf &
  48 URL::SlashPath()
  49 {
  50     static SBuf slash("/");
  51     return slash;
  52 }
  53
  54 void
  55 URL::host(const char *src)
  56 {
  57     hostAddr_.setEmpty();
  58     hostAddr_ = src;
  59     if (hostAddr_.isAnyAddr()) {
  60         xstrncpy(host_, src, sizeof(host_));
  61         hostIsNumeric_ = false;
  62     } else {
  63         hostAddr_.toHostStr(host_, sizeof(host_));
  64         debugs(23, 3, "given IP: " << hostAddr_);
  65         hostIsNumeric_ = 1;
  66     }
  67     touch();
  68 }
  69
  70 const SBuf &
  71 URL::path() const
  72 {
  73     // RFC 3986 section 3.3 says path can be empty (path-abempty).
  74     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
  75     // at least when sending and using. We must still accept path-abempty as input.
  76     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
  77         return SlashPath();
  78
  79     return path_;
  80 }
  81
/// Start-up self checks for the URL code: verifies the protocol name table
/// and the comparison rules of matchDomainName() with assertions.
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    // equal names match, with or without a leading dot on either side
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    // a subdomain matches only a domain written with a leading dot
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    // unrelated names never match
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    // the sign of a mismatch gives a usable ordering
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    // comparison ignores letter case
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    // "x-foo.com" must sort after ".foo.com" although '-' < '.'
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));
    /* more cases? */
}
 113
 114 /**
 115  * Parse the scheme name from string b, into protocol type.
 116  * The string must be 0-terminated.
 117  */
 118 AnyP::ProtocolType
 119 urlParseProtocol(const char *b)
 120 {
 121     // make e point to the ':' character
 122     const char *e = b + strcspn(b, ":");
 123     int len = e - b;
 124
 125     /* test common stuff first */
 126
 127     if (strncasecmp(b, "http", len) == 0)
 128         return AnyP::PROTO_HTTP;
 129
 130     if (strncasecmp(b, "ftp", len) == 0)
 131         return AnyP::PROTO_FTP;
 132
 133     if (strncasecmp(b, "https", len) == 0)
 134         return AnyP::PROTO_HTTPS;
 135
 136     if (strncasecmp(b, "file", len) == 0)
 137         return AnyP::PROTO_FTP;
 138
 139     if (strncasecmp(b, "coap", len) == 0)
 140         return AnyP::PROTO_COAP;
 141
 142     if (strncasecmp(b, "coaps", len) == 0)
 143         return AnyP::PROTO_COAPS;
 144
 145     if (strncasecmp(b, "gopher", len) == 0)
 146         return AnyP::PROTO_GOPHER;
 147
 148     if (strncasecmp(b, "wais", len) == 0)
 149         return AnyP::PROTO_WAIS;
 150
 151     if (strncasecmp(b, "cache_object", len) == 0)
 152         return AnyP::PROTO_CACHE_OBJECT;
 153
 154     if (strncasecmp(b, "urn", len) == 0)
 155         return AnyP::PROTO_URN;
 156
 157     if (strncasecmp(b, "whois", len) == 0)
 158         return AnyP::PROTO_WHOIS;
 159
 160     return AnyP::PROTO_NONE;
 161 }
 162
 163 /*
 164  * Parse a URI/URL.
 165  *
 166  * If the 'request' arg is non-NULL, put parsed values there instead
 167  * of allocating a new HttpRequest.
 168  *
 169  * This abuses HttpRequest as a way of representing the parsed url
 170  * and its components.
 171  * method is used to switch parsers and to init the HttpRequest.
 172  * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
 173  * looked for.
 * The url is non-const so that, if it is too long, we can NUL-terminate it in place.
 175  */
 176
 177 /*
 * This routine parses a URL. It is assumed that the URL is complete -
 * i.e., the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (i.e., it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 183  */
/**
 * \param method  selects the parser: CONNECT expects "host[:port]" (with
 *                optional brackets around the host); OPTIONS and TRACE also
 *                accept the "*" form; "urn:" input goes to urnParse().
 * \param url     0-terminated input; it is truncated in place when too long.
 * \param request if non-NULL, receives the result instead of a new HttpRequest.
 * \returns the filled HttpRequest, or NULL when the input is rejected.
 */
HttpRequest *
urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
{
    LOCAL_ARRAY(char, proto, MAX_URL);
    LOCAL_ARRAY(char, login, MAX_URL);
    LOCAL_ARRAY(char, host, MAX_URL);
    LOCAL_ARRAY(char, urlpath, MAX_URL);
    char *t = NULL;
    char *q = NULL;
    int port;
    AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
    int l;
    int i;
    const char *src;
    char *dst;
    proto[0] = host[0] = urlpath[0] = login[0] = '\0';

    // the URL plus a possibly appended default domain must fit in MAX_URL
    if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
        /* terminate so it doesn't overflow other buffers */
        *(url + (MAX_URL >> 1)) = '\0';
        debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
        return NULL;
    }
    if (method == Http::METHOD_CONNECT) {
        port = CONNECT_PORT;

        // try "[host]:port" first, then "host:port"; the port part is
        // optional because a single converted item (the host) is enough
        if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
            if (sscanf(url, "%[^:]:%d", host, &port) < 1)
                return NULL;

    } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
               URL::Asterisk().cmp(url) == 0) {
        // "*" request-target: the URL text itself becomes the path and the
        // host stays empty
        protocol = AnyP::PROTO_HTTP;
        port = AnyP::UriScheme(protocol).defaultPort();
        return urlParseFinish(method, protocol, url, host, SBuf(), port, request);
    } else if (!strncmp(url, "urn:", 4)) {
        return urnParse(method, url, request);
    } else {
        /* Parse the URL: */
        src = url;
        i = 0;
        /* Find first : - everything before is protocol */
        for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
            *dst = *src;
        }
        if (i >= l)
            return NULL;
        *dst = '\0';

        /* Then its :// */
        if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
            return NULL;
        i += 3;
        src += 3;

        /* Then everything until first /; thats host (and port; which we'll look for here later) */
        // bug 1881: If we don't get a "/" then we imply it was there
        // bug 3074: We could just be given a "?" or "#". These also imply "/"
        // bug 3233: whitespace is also a hostname delimiter.
        for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
            *dst = *src;
        }

        /*
         * We can't check for "i >= l" here because we could be at the end of the line
         * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
         * been -given- a valid URL and the path is just '/'.
         */
        if (i > l)
            return NULL;
        *dst = '\0';

        // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
        if (*src == '?' || *src == '#' || *src == '\0') {
            urlpath[0] = '/';
            dst = &urlpath[1];
        } else {
            dst = urlpath;
        }
        /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
        for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
            *dst = *src;
        }

        /* We -could- be at the end of the buffer here */
        if (i > l)
            return NULL;
        /* If the URL path is empty we set it to be "/" */
        if (dst == urlpath) {
            *dst = '/';
            ++dst;
        }
        *dst = '\0';

        protocol = urlParseProtocol(proto);
        port = AnyP::UriScheme(protocol).defaultPort();

        /* Is there any login information? (we should eventually parse it above) */
        // everything before the last '@' of the authority is the login
        t = strrchr(host, '@');
        if (t != NULL) {
            strncpy((char *) login, (char *) host, sizeof(login)-1);
            login[sizeof(login)-1] = '\0';
            t = strrchr(login, '@');
            *t = 0;
            strncpy((char *) host, t + 1, sizeof(host)-1);
            host[sizeof(host)-1] = '\0';
            // Bug 4498: URL-unescape the login info after extraction
            rfc1738_unescape(login);
        }

        /* Is there any host information? (we should eventually parse it above) */
        if (*host == '[') {
            /* strip any IPA brackets. valid under IPv6. */
            dst = host;
            /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
            src = host;
            ++src;
            l = strlen(host);
            i = 1;
            // shift the bracketed text one position to the left, in place
            for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* we moved in-place, so truncate the actual hostname found */
            *dst = '\0';
            ++dst;

            /* skip ahead to either start of port, or original EOS */
            while (*dst != '\0' && *dst != ':')
                ++dst;
            t = dst;
        } else {
            t = strrchr(host, ':');

            // more than one ':' without brackets: no port is looked for
            if (t != strchr(host,':') ) {
                /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                /* therefore we MUST accept the case where they are not bracketed at all. */
                t = NULL;
            }
        }

        // Bug 3183 sanity check: If scheme is present, host must be too.
        if (protocol != AnyP::PROTO_NONE && host[0] == '\0') {
            debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
            return NULL;
        }

        // t, when set, points at the ':' that introduces the port
        if (t && *t == ':') {
            *t = '\0';
            ++t;
            port = atoi(t);
        }
    }

    // host names are stored in lower case
    for (t = host; *t; ++t)
        *t = xtolower(*t);

    // whitespace inside the host is removed only in "strip" mode
    if (stringHasWhitespace(host)) {
        if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
            t = q = host;
            while (*t) {
                if (!xisspace(*t)) {
                    *q = *t;
                    ++q;
                }
                ++t;
            }
            *q = '\0';
        }
    }

    debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");

    if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
        debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
        return NULL;
    }

    /* For IPV6 addresses also check for a colon */
    if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);

    /* remove trailing dots from hostnames */
    while ((l = strlen(host)) > 0 && host[--l] == '.')
        host[l] = '\0';

    /* reject duplicate or leading dots */
    if (strstr(host, "..") || *host == '.') {
        debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
        return NULL;
    }

    if (port < 1 || port > 65535) {
        debugs(23, 3, "urlParse: Invalid port '" << port << "'");
        return NULL;
    }

#if HARDCODE_DENY_PORTS
    /* These ports are filtered in the default squid.conf, but
     * maybe someone wants them hardcoded... */
    if (port == 7 || port == 9 || port == 19) {
        debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
        return NULL;
    }
#endif

    // whitespace inside the path is handled as configured by uri_whitespace
    if (stringHasWhitespace(urlpath)) {
        debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");

        switch (Config.uri_whitespace) {

        case URI_WHITESPACE_DENY:
            return NULL;

        case URI_WHITESPACE_ALLOW:
            break;

        case URI_WHITESPACE_ENCODE:
            t = rfc1738_escape_unescaped(urlpath);
            xstrncpy(urlpath, t, MAX_URL);
            break;

        case URI_WHITESPACE_CHOP:
            // cut the path at its first whitespace character
            *(urlpath + strcspn(urlpath, w_space)) = '\0';
            break;

        case URI_WHITESPACE_STRIP:
        default:
            // remove every whitespace character, compacting in place
            t = q = urlpath;
            while (*t) {
                if (!xisspace(*t)) {
                    *q = *t;
                    ++q;
                }
                ++t;
            }
            *q = '\0';
        }
    }

    return urlParseFinish(method, protocol, urlpath, host, SBuf(login), port, request);
}
 427
 428 /**
 429  * Update request with parsed URI data.  If the request arg is
 430  * non-NULL, put parsed values there instead of allocating a new
 431  * HttpRequest.
 432  */
 433 static HttpRequest *
 434 urlParseFinish(const HttpRequestMethod& method,
 435                const AnyP::ProtocolType protocol,
 436                const char *const urlpath,
 437                const char *const host,
 438                const SBuf &login,
 439                const int port,
 440                HttpRequest *request)
 441 {
 442     if (NULL == request)
 443         request = new HttpRequest(method, protocol, urlpath);
 444     else {
 445         request->initHTTP(method, protocol, urlpath);
 446     }
 447
 448     request->url.host(host);
 449     request->url.userInfo(login);
 450     request->url.port(port);
 451     return request;
 452 }
 453
 454 static HttpRequest *
 455 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
 456 {
 457     debugs(50, 5, "urnParse: " << urn);
 458     if (request) {
 459         request->initHTTP(method, AnyP::PROTO_URN, urn + 4);
 460         return request;
 461     }
 462
 463     return new HttpRequest(method, AnyP::PROTO_URN, urn + 4);
 464 }
 465
 466 void
 467 URL::touch()
 468 {
 469     absolute_.clear();
 470     authorityHttp_.clear();
 471     authorityWithPort_.clear();
 472 }
 473
 474 SBuf &
 475 URL::authority(bool requirePort) const
 476 {
 477     if (authorityHttp_.isEmpty()) {
 478
 479         // both formats contain Host/IP
 480         authorityWithPort_.append(host());
 481         authorityHttp_ = authorityWithPort_;
 482
 483         // authorityForm_ only has :port if it is non-default
 484         authorityWithPort_.appendf(":%u",port());
 485         if (port() != getScheme().defaultPort())
 486             authorityHttp_ = authorityWithPort_;
 487     }
 488
 489     return requirePort ? authorityWithPort_ : authorityHttp_;
 490 }
 491
 492 SBuf &
 493 URL::absolute() const
 494 {
 495     if (absolute_.isEmpty()) {
 496         // TODO: most URL will be much shorter, avoid allocating this much
 497         absolute_.reserveCapacity(MAX_URL);
 498
 499         absolute_.appendf("%s:", getScheme().c_str());
 500         if (getScheme() != AnyP::PROTO_URN) {
 501             absolute_.append("//", 2);
 502             const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
 503                                       getScheme() != AnyP::PROTO_HTTPS ||
 504                                       userInfo().isEmpty();
 505             if (!omitUserInfo) {
 506                 absolute_.append(userInfo());
 507                 absolute_.append("@", 1);
 508             }
 509             absolute_.append(authority());
 510         }
 511         absolute_.append(path());
 512     }
 513
 514     return absolute_;
 515 }
 516
 517 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 518  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 519  *        and never copy the query-string part in the first place
 520  */
 521 char *
 522 urlCanonicalClean(const HttpRequest * request)
 523 {
 524     LOCAL_ARRAY(char, buf, MAX_URL);
 525
 526     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri()));
 527     buf[sizeof(buf)-1] = '\0';
 528
 529     // URN, CONNECT method, and non-stripped URIs can go straight out
 530     if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) {
 531         // strip anything AFTER a question-mark
 532         // leaving the '?' in place
 533         if (auto t = strchr(buf, '?')) {
 534             *(++t) = '\0';
 535         }
 536     }
 537
 538     if (stringHasCntl(buf))
 539         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 540
 541     return buf;
 542 }
 543
 544 /**
 545  * Yet another alternative to urlCanonical.
 546  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 547  * for use in error page outputs.
 548  * Luckily we can leverage the others instead of duplicating.
 549  */
 550 const char *
 551 urlCanonicalFakeHttps(const HttpRequest * request)
 552 {
 553     LOCAL_ARRAY(char, buf, MAX_URL);
 554
 555     // method CONNECT and port HTTPS
 556     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 557         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 558         return buf;
 559     }
 560
 561     // else do the normal complete canonical thing.
 562     return urlCanonicalClean(request);
 563 }
 564
 565 /*
 566  * Test if a URL is relative.
 567  *
 568  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 569  * appear before a ':'.
 570  */
 571 bool
 572 urlIsRelative(const char *url)
 573 {
 574     const char *p;
 575
 576     if (url == NULL) {
 577         return (false);
 578     }
 579     if (*url == '\0') {
 580         return (false);
 581     }
 582
 583     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 584
 585     if (*p == ':') {
 586         return (false);
 587     }
 588     return (true);
 589 }
 590
 591 /*
 592  * Convert a relative URL to an absolute URL using the context of a given
 593  * request.
 594  *
 595  * It is assumed that you have already ensured that the URL is relative.
 596  *
 597  * If NULL is returned it is an indication that the method in use in the
 598  * request does not distinguish between relative and absolute and you should
 599  * use the url unchanged.
 600  *
 601  * If non-NULL is returned, it is up to the caller to free the resulting
 602  * memory using safe_free().
 603  */
 604 char *
 605 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 606 {
 607
 608     if (req->method.id() == Http::METHOD_CONNECT) {
 609         return (NULL);
 610     }
 611
 612     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 613
 614     if (req->url.getScheme() == AnyP::PROTO_URN) {
 615         // XXX: this is what the original code did, but it seems to break the
 616         // intended behaviour of this function. It returns the stored URN path,
 617         // not converting the given one into a URN...
 618         snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
 619         return (urlbuf);
 620     }
 621
 622     SBuf authorityForm = req->url.authority(); // host[:port]
 623     size_t urllen = snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
 624                              req->url.getScheme().c_str(),
 625                              SQUIDSBUFPRINT(req->url.userInfo()),
 626                              !req->url.userInfo().isEmpty() ? "@" : "",
 627                              SQUIDSBUFPRINT(authorityForm));
 628
 629     // if the first char is '/' assume its a relative path
 630     // XXX: this breaks on scheme-relative URLs,
 631     // but we should not see those outside ESI, and rarely there.
 632     // XXX: also breaks on any URL containing a '/' in the query-string portion
 633     if (relUrl[0] == '/') {
 634         xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 635     } else {
 636         SBuf path = req->url.path();
 637         SBuf::size_type lastSlashPos = path.rfind('/');
 638
 639         if (lastSlashPos == SBuf::npos) {
 640             // replace the whole path with the given bit(s)
 641             urlbuf[urllen] = '/';
 642             ++urllen;
 643             xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 644         } else {
 645             // replace only the last (file?) segment with the given bit(s)
 646             ++lastSlashPos;
 647             if (lastSlashPos > MAX_URL - urllen - 1) {
 648                 // XXX: crops bits in the middle of the combined URL.
 649                 lastSlashPos = MAX_URL - urllen - 1;
 650             }
 651             SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
 652             urllen += lastSlashPos;
 653             if (urllen + 1 < MAX_URL) {
 654                 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 655             }
 656         }
 657     }
 658
 659     return (urlbuf);
 660 }
 661
 662 int
 663 matchDomainName(const char *h, const char *d, bool honorWildcards)
 664 {
 665     int dl;
 666     int hl;
 667
 668     while ('.' == *h)
 669         ++h;
 670
 671     hl = strlen(h);
 672
 673     dl = strlen(d);
 674
 675     /*
 676      * Start at the ends of the two strings and work towards the
 677      * beginning.
 678      */
 679     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 680         if (hl == 0 && dl == 0) {
 681             /*
 682              * We made it all the way to the beginning of both
 683              * strings without finding any difference.
 684              */
 685             return 0;
 686         }
 687
 688         if (0 == hl) {
 689             /*
 690              * The host string is shorter than the domain string.
 691              * There is only one case when this can be a match.
 692              * If the domain is just one character longer, and if
 693              * that character is a leading '.' then we call it a
 694              * match.
 695              */
 696
 697             if (1 == dl && '.' == d[0])
 698                 return 0;
 699             else
 700                 return -1;
 701         }
 702
 703         if (0 == dl) {
 704             /*
 705              * The domain string is shorter than the host string.
 706              * This is a match only if the first domain character
 707              * is a leading '.'.
 708              */
 709
 710             if ('.' == d[0])
 711                 return 0;
 712             else
 713                 return 1;
 714         }
 715     }
 716
 717     /*
 718      * We found different characters in the same position (from the end).
 719      */
 720
 721     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 722     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 723     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 724     if (honorWildcards && h[hl] == '*' && h[hl + 1] == '.')
 725         return 0;
 726
 727     /*
 728      * If one of those character is '.' then its special.  In order
 729      * for splay tree sorting to work properly, "x-foo.com" must
 730      * be greater than ".foo.com" even though '-' is less than '.'.
 731      */
 732     if ('.' == d[dl])
 733         return 1;
 734
 735     if ('.' == h[hl])
 736         return -1;
 737
 738     return (xtolower(h[hl]) - xtolower(d[dl]));
 739 }
 740
 741 /*
 742  * return true if we can serve requests for this method.
 743  */
 744 int
 745 urlCheckRequest(const HttpRequest * r)
 746 {
 747     int rc = 0;
 748     /* protocol "independent" methods
 749      *
 750      * actually these methods are specific to HTTP:
 751      * they are methods we recieve on our HTTP port,
 752      * and if we had a FTP listener would not be relevant
 753      * there.
 754      *
 755      * So, we should delegate them to HTTP. The problem is that we
 756      * do not have a default protocol from the client side of HTTP.
 757      */
 758
 759     if (r->method == Http::METHOD_CONNECT)
 760         return 1;
 761
 762     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 763     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 764     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 765         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != URL::Asterisk());
 766
 767     if (r->method == Http::METHOD_PURGE)
 768         return 1;
 769
 770     /* does method match the protocol? */
 771     switch (r->url.getScheme()) {
 772
 773     case AnyP::PROTO_URN:
 774
 775     case AnyP::PROTO_HTTP:
 776
 777     case AnyP::PROTO_CACHE_OBJECT:
 778         rc = 1;
 779         break;
 780
 781     case AnyP::PROTO_FTP:
 782
 783         if (r->method == Http::METHOD_PUT)
 784             rc = 1;
 785
 786     case AnyP::PROTO_GOPHER:
 787
 788     case AnyP::PROTO_WAIS:
 789
 790     case AnyP::PROTO_WHOIS:
 791         if (r->method == Http::METHOD_GET)
 792             rc = 1;
 793         else if (r->method == Http::METHOD_HEAD)
 794             rc = 1;
 795
 796         break;
 797
 798     case AnyP::PROTO_HTTPS:
 799 #if USE_OPENSSL
 800
 801         rc = 1;
 802
 803         break;
 804
 805 #else
 806         /*
 807         * Squid can't originate an SSL connection, so it should
 808         * never receive an "https:" URL.  It should always be
 809         * CONNECT instead.
 810         */
 811         rc = 0;
 812
 813 #endif
 814
 815     default:
 816         break;
 817     }
 818
 819     return rc;
 820 }
 821
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy up to SQUIDHOSTNAMELEN bytes from there to Host[]
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@' and remove it
 */

class URLHostName
{

public:
    /// \returns the host found in url, stored in the shared Host buffer,
    ///          or NULL when url contains no ':'
    char * extract(char const *url);

private:
    /// result storage; static, so it is shared by all objects of this class
    static char Host [SQUIDHOSTNAMELEN];
    /// empties Host and remembers the URL to work on
    void init(char const *);
    /// points hostStart at the host part of url (NULL if url has no ':')
    void findHostStart();
    /// cuts Host at the end of the host name
    void trimTrailingChars();
    /// removes everything up to and including the last '@' from Host
    void trimAuth();
    /// where the host begins inside url; set by findHostStart()
    char const *hostStart;
    /// the URL being examined; set by init()
    char const *url;
};
 846
 847 char *
 848 urlHostname(const char *url)
 849 {
 850     return URLHostName().extract(url);
 851 }
 852
 853 char URLHostName::Host[SQUIDHOSTNAMELEN];
 854
 855 void
 856 URLHostName::init(char const *aUrl)
 857 {
 858     Host[0] = '\0';
 859     url = aUrl;
 860 }
 861
 862 void
 863 URLHostName::findHostStart()
 864 {
 865     if (NULL == (hostStart = strchr(url, ':')))
 866         return;
 867
 868     ++hostStart;
 869
 870     while (*hostStart != '\0' && *hostStart == '/')
 871         ++hostStart;
 872
 873     if (*hostStart == ']')
 874         ++hostStart;
 875 }
 876
 877 void
 878 URLHostName::trimTrailingChars()
 879 {
 880     char *t;
 881
 882     if ((t = strchr(Host, '/')))
 883         *t = '\0';
 884
 885     if ((t = strrchr(Host, ':')))
 886         *t = '\0';
 887
 888     if ((t = strchr(Host, ']')))
 889         *t = '\0';
 890 }
 891
 892 void
 893 URLHostName::trimAuth()
 894 {
 895     char *t;
 896
 897     if ((t = strrchr(Host, '@'))) {
 898         ++t;
 899         memmove(Host, t, strlen(t) + 1);
 900     }
 901 }
 902
 903 char *
 904 URLHostName::extract(char const *aUrl)
 905 {
 906     init(aUrl);
 907     findHostStart();
 908
 909     if (hostStart == NULL)
 910         return NULL;
 911
 912     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 913
 914     trimTrailingChars();
 915
 916     trimAuth();
 917
 918     return Host;
 919 }
 920
 921 URL::URL(AnyP::UriScheme const &aScheme) :
 922     scheme_(aScheme),
 923     hostIsNumeric_(false),
 924     port_(0)
 925 {
 926     *host_=0;
 927 }
 928