src/url.cc

   1 /*
   2  * Copyright (C) 1996-2016 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "globals.h"
  13 #include "HttpRequest.h"
  14 #include "rfc1738.h"
  15 #include "SquidConfig.h"
  16 #include "SquidString.h"
  17 #include "URL.h"
  18
  19 static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
  20                                    const AnyP::ProtocolType protocol,
  21                                    const char *const protoStr,
  22                                    const char *const urlpath,
  23                                    const char *const host,
  24                                    const SBuf &login,
  25                                    const int port,
  26                                    HttpRequest *request);
  27 static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
  28 static const char valid_hostname_chars_u[] =
  29     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  30     "abcdefghijklmnopqrstuvwxyz"
  31     "0123456789-._"
  32     "[:]"
  33     ;
  34 static const char valid_hostname_chars[] =
  35     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  36     "abcdefghijklmnopqrstuvwxyz"
  37     "0123456789-."
  38     "[:]"
  39     ;
  40
  41 const SBuf &
  42 URL::Asterisk()
  43 {
  44     static SBuf star("*");
  45     return star;
  46 }
  47
  48 const SBuf &
  49 URL::SlashPath()
  50 {
  51     static SBuf slash("/");
  52     return slash;
  53 }
  54
  55 void
  56 URL::host(const char *src)
  57 {
  58     hostAddr_.setEmpty();
  59     hostAddr_ = src;
  60     if (hostAddr_.isAnyAddr()) {
  61         xstrncpy(host_, src, sizeof(host_));
  62         hostIsNumeric_ = false;
  63     } else {
  64         hostAddr_.toHostStr(host_, sizeof(host_));
  65         debugs(23, 3, "given IP: " << hostAddr_);
  66         hostIsNumeric_ = 1;
  67     }
  68     touch();
  69 }
  70
  71 const SBuf &
  72 URL::path() const
  73 {
  74     // RFC 3986 section 3.3 says path can be empty (path-abempty).
  75     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
  76     // at least when sending and using. We must still accept path-abempty as input.
  77     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
  78         return SlashPath();
  79
  80     return path_;
  81 }
  82
  83 void
  84 urlInitialize(void)
  85 {
  86     debugs(23, 5, "urlInitialize: Initializing...");
  87     /* this ensures that the number of protocol strings is the same as
  88      * the enum slots allocated because the last enum is always 'MAX'.
  89      */
  90     assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
  91     /*
  92      * These test that our matchDomainName() function works the
  93      * way we expect it to.
  94      */
  95     assert(0 == matchDomainName("foo.com", "foo.com"));
  96     assert(0 == matchDomainName(".foo.com", "foo.com"));
  97     assert(0 == matchDomainName("foo.com", ".foo.com"));
  98     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  99     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
 100     assert(0 != matchDomainName("x.foo.com", "foo.com"));
 101     assert(0 != matchDomainName("foo.com", "x.foo.com"));
 102     assert(0 != matchDomainName("bar.com", "foo.com"));
 103     assert(0 != matchDomainName(".bar.com", "foo.com"));
 104     assert(0 != matchDomainName(".bar.com", ".foo.com"));
 105     assert(0 != matchDomainName("bar.com", ".foo.com"));
 106     assert(0 < matchDomainName("zzz.com", "foo.com"));
 107     assert(0 > matchDomainName("aaa.com", "foo.com"));
 108     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 109     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 110     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 111     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 112     /* more cases? */
 113 }
 114
 115 /**
 116  * Parse the scheme name from string b, into protocol type.
 117  * The string must be 0-terminated.
 118  */
 119 AnyP::ProtocolType
 120 urlParseProtocol(const char *b)
 121 {
 122     // make e point to the ':' character
 123     const char *e = b + strcspn(b, ":");
 124     int len = e - b;
 125
 126     /* test common stuff first */
 127
 128     if (strncasecmp(b, "http", len) == 0)
 129         return AnyP::PROTO_HTTP;
 130
 131     if (strncasecmp(b, "ftp", len) == 0)
 132         return AnyP::PROTO_FTP;
 133
 134     if (strncasecmp(b, "https", len) == 0)
 135         return AnyP::PROTO_HTTPS;
 136
 137     if (strncasecmp(b, "file", len) == 0)
 138         return AnyP::PROTO_FTP;
 139
 140     if (strncasecmp(b, "coap", len) == 0)
 141         return AnyP::PROTO_COAP;
 142
 143     if (strncasecmp(b, "coaps", len) == 0)
 144         return AnyP::PROTO_COAPS;
 145
 146     if (strncasecmp(b, "gopher", len) == 0)
 147         return AnyP::PROTO_GOPHER;
 148
 149     if (strncasecmp(b, "wais", len) == 0)
 150         return AnyP::PROTO_WAIS;
 151
 152     if (strncasecmp(b, "cache_object", len) == 0)
 153         return AnyP::PROTO_CACHE_OBJECT;
 154
 155     if (strncasecmp(b, "urn", len) == 0)
 156         return AnyP::PROTO_URN;
 157
 158     if (strncasecmp(b, "whois", len) == 0)
 159         return AnyP::PROTO_WHOIS;
 160
 161     if (len > 0)
 162         return AnyP::PROTO_UNKNOWN;
 163
 164     return AnyP::PROTO_NONE;
 165 }
 166
 167 /*
 168  * Parse a URI/URL.
 169  *
 170  * If the 'request' arg is non-NULL, put parsed values there instead
 171  * of allocating a new HttpRequest.
 172  *
 173  * This abuses HttpRequest as a way of representing the parsed URL
 174  * and its components.
 175  * The method is used to switch parsers and to initialize the HttpRequest.
 176  * If method is Http::METHOD_CONNECT, then a hostname:port pair is looked
 177  * for rather than a URL.
 178  * The url is not const so that, if it is too long, we can NUL-terminate it in place.
      *
      * Returns the filled-in (or newly allocated) HttpRequest, or NULL when the
      * input is rejected: an over-long URL, an unparsable CONNECT target, a
      * missing ':' or "://", a scheme without a hostname, illegal hostname
      * characters (only when Config.onoff.check_hostnames is set), a leading
      * dot or ".." in the hostname, a port outside 1-65535, or whitespace in
      * the path while Config.uri_whitespace is URI_WHITESPACE_DENY.
 179  */
 180
 181 /*
 182  * This routine parses a URL. It is assumed that the URL is complete -
 183  * i.e., the end of the string is the end of the URL. Don't pass a partial
 184  * URL here as this routine doesn't have any way of knowing whether
 185  * it is partial or not (i.e., it handles the case of no trailing slash as
 186  * being "end of host with implied path of /").
 187  */
 188 HttpRequest *
 189 urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
 190 {
 191     LOCAL_ARRAY(char, proto, MAX_URL); // scheme name, as found before the first ':'
 192     LOCAL_ARRAY(char, login, MAX_URL); // userinfo, i.e. what precedes the last '@' of the authority
 193     LOCAL_ARRAY(char, host, MAX_URL); // authority first, reduced to the bare host further down
 194     LOCAL_ARRAY(char, urlpath, MAX_URL); // everything after the authority
 195     char *t = NULL;
 196     char *q = NULL;
 197     int port;
 198     AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
 199     int l;
 200     int i;
 201     const char *src;
 202     char *dst;
 203     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 204
 205     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 206         /* terminate (at half of MAX_URL) so it doesn't overflow other buffers */
 207         *(url + (MAX_URL >> 1)) = '\0';
 208         debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
 209         return NULL;
 210     }
 211     if (method == Http::METHOD_CONNECT) {
 212         port = CONNECT_PORT; // used when the target carries no explicit port
 213
             // try the bracketed "[host]:port" form first, then plain "host:port"
 214         if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
 215             if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 216                 return NULL;
 217
 218     } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
 219                URL::Asterisk().cmp(url) == 0) {
             // "*" request target: no host was parsed, so host is still empty here
 220         protocol = AnyP::PROTO_HTTP;
 221         port = 80; // or the slow way ...  AnyP::UriScheme(protocol,"http").defaultPort();
 222         return urlParseFinish(method, protocol, "http", url, host, SBuf(), port, request);
 223     } else if (!strncmp(url, "urn:", 4)) {
 224         return urnParse(method, url, request);
 225     } else {
 226         /* Parse the URL: */
 227         src = url;
 228         i = 0;
 229         /* Find first : - everything before is protocol */
 230         for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
 231             *dst = *src;
 232         }
 233         if (i >= l) // ran off the end without seeing a ':'
 234             return NULL;
 235         *dst = '\0';
 236
 237         /* Then it's :// */
 238         if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
 239             return NULL;
 240         i += 3;
 241         src += 3;
 242
 243         /* Then everything until first /; that's host (and port; which we'll look for here later) */
 244         // bug 1881: If we don't get a "/" then we imply it was there
 245         // bug 3074: We could just be given a "?" or "#". These also imply "/"
 246         // bug 3233: whitespace is also a hostname delimiter.
 247         for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
 248             *dst = *src;
 249         }
 250
 251         /*
 252          * We can't check for "i >= l" here because we could be at the end of the line
 253          * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
 254          * been -given- a valid URL and the path is just '/'.
 255          */
 256         if (i > l) // NOTE(review): i never seems to exceed l here, so this looks unreachable
 257             return NULL;
 258         *dst = '\0';
 259
 260         // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
 261         if (*src == '?' || *src == '#' || *src == '\0') {
 262             urlpath[0] = '/';
 263             dst = &urlpath[1];
 264         } else {
 265             dst = urlpath;
 266         }
 267         /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
 268         for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
 269             *dst = *src;
 270         }
 271
 272         /* We -could- be at the end of the buffer here */
 273         if (i > l) // NOTE(review): same as above, looks unreachable
 274             return NULL;
 275         /* If the URL path is empty we set it to be "/" */
 276         if (dst == urlpath) {
 277             *dst = '/';
 278             ++dst;
 279         }
 280         *dst = '\0';
 281
 282         protocol = urlParseProtocol(proto);
 283         port = AnyP::UriScheme(protocol).defaultPort(); // replaced below if the authority names a port
 284
 285         /* Is there any login information? (we should eventually parse it above) */
 286         t = strrchr(host, '@');
 287         if (t != NULL) {
                 // copy the whole authority into login, cut login at its last '@',
                 // then keep only what followed that '@' as the host
 288             strncpy((char *) login, (char *) host, sizeof(login)-1);
 289             login[sizeof(login)-1] = '\0';
 290             t = strrchr(login, '@');
 291             *t = 0;
 292             strncpy((char *) host, t + 1, sizeof(host)-1);
 293             host[sizeof(host)-1] = '\0';
 294             // Bug 4498: URL-unescape the login info after extraction
 295             rfc1738_unescape(login);
 296         }
 297
 298         /* Is there any host information? (we should eventually parse it above) */
 299         if (*host == '[') {
 300             /* strip any IPA brackets. valid under IPv6. */
 301             dst = host;
 302             /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
 303             src = host;
 304             ++src;
 305             l = strlen(host);
 306             i = 1;
                 // shift the text between the brackets one position to the left
 307             for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
 308                 *dst = *src;
 309             }
 310
 311             /* we moved in-place, so truncate the actual hostname found */
 312             *dst = '\0';
 313             ++dst;
 314
 315             /* skip ahead to either start of port, or original EOS */
 316             while (*dst != '\0' && *dst != ':')
 317                 ++dst;
 318             t = dst;
 319         } else {
 320             t = strrchr(host, ':');
 321
 322             if (t != strchr(host,':') ) {
                     // more than one ':' without brackets: treated as a raw IPv6 address with no port
 323                 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when it's not. */
 324                 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
 325                 /* therefore we MUST accept the case where they are not bracketed at all. */
 326                 t = NULL;
 327             }
 328         }
 329
 330         // Bug 3183 sanity check: If scheme is present, host must be too.
 331         if (protocol != AnyP::PROTO_NONE && host[0] == '\0') {
 332             debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
 333             return NULL;
 334         }
 335
 336         if (t && *t == ':') {
                 // cut the host at the ':' and read the port number that follows it
 337             *t = '\0';
 338             ++t;
 339             port = atoi(t);
 340         }
 341     }
 342
         // from here on the CONNECT and the URL cases share the same checks
 343     for (t = host; *t; ++t)
 344         *t = xtolower(*t);
 345
 346     if (stringHasWhitespace(host)) {
 347         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                 // squeeze all whitespace out of the host, in place
 348             t = q = host;
 349             while (*t) {
 350                 if (!xisspace(*t)) {
 351                     *q = *t;
 352                     ++q;
 353                 }
 354                 ++t;
 355             }
 356             *q = '\0';
 357         }
 358     }
 359
 360     debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
 361
 362     if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
 363         debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
 364         return NULL;
 365     }
 366
 367     /* For IPV6 addresses also check for a colon */
 368     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
 369         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 370
 371     /* remove trailing dots from hostnames */
 372     while ((l = strlen(host)) > 0 && host[--l] == '.')
 373         host[l] = '\0';
 374
 375     /* reject duplicate or leading dots */
 376     if (strstr(host, "..") || *host == '.') {
 377         debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
 378         return NULL;
 379     }
 380
 381     if (port < 1 || port > 65535) {
 382         debugs(23, 3, "urlParse: Invalid port '" << port << "'");
 383         return NULL;
 384     }
 385
 386 #if HARDCODE_DENY_PORTS
 387     /* These ports are filtered in the default squid.conf, but
 388      * maybe someone wants them hardcoded... */
 389     if (port == 7 || port == 9 || port == 19) {
 390         debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
 391         return NULL;
 392     }
 393 #endif
 394
 395     if (stringHasWhitespace(urlpath)) {
 396         debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
 397
             // how whitespace in the path is treated is a configuration choice
 398         switch (Config.uri_whitespace) {
 399
 400         case URI_WHITESPACE_DENY:
 401             return NULL;
 402
 403         case URI_WHITESPACE_ALLOW:
 404             break;
 405
 406         case URI_WHITESPACE_ENCODE:
 407             t = rfc1738_escape_unescaped(urlpath);
 408             xstrncpy(urlpath, t, MAX_URL);
 409             break;
 410
 411         case URI_WHITESPACE_CHOP:
                 // cut the path at the first character listed in w_space
 412             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 413             break;
 414
 415         case URI_WHITESPACE_STRIP:
 416         default:
                 // squeeze all whitespace out of the path, in place
 417             t = q = urlpath;
 418             while (*t) {
 419                 if (!xisspace(*t)) {
 420                     *q = *t;
 421                     ++q;
 422                 }
 423                 ++t;
 424             }
 425             *q = '\0';
 426         }
 427     }
 428
 429     return urlParseFinish(method, protocol, proto, urlpath, host, SBuf(login), port, request);
 430 }
 431
 432 /**
 433  * Update request with parsed URI data.  If the request arg is
 434  * non-NULL, put parsed values there instead of allocating a new
 435  * HttpRequest.
 436  */
 437 static HttpRequest *
 438 urlParseFinish(const HttpRequestMethod& method,
 439                const AnyP::ProtocolType protocol,
 440                const char *const protoStr, // for unknown protocols
 441                const char *const urlpath,
 442                const char *const host,
 443                const SBuf &login,
 444                const int port,
 445                HttpRequest *request)
 446 {
 447     if (NULL == request)
 448         request = new HttpRequest(method, protocol, protoStr, urlpath);
 449     else {
 450         request->initHTTP(method, protocol, protoStr, urlpath);
 451     }
 452
 453     request->url.host(host);
 454     request->url.userInfo(login);
 455     request->url.port(port);
 456     return request;
 457 }
 458
 459 static HttpRequest *
 460 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
 461 {
 462     debugs(50, 5, "urnParse: " << urn);
 463     if (request) {
 464         request->initHTTP(method, AnyP::PROTO_URN, "urn", urn + 4);
 465         return request;
 466     }
 467
 468     return new HttpRequest(method, AnyP::PROTO_URN, "urn", urn + 4);
 469 }
 470
 471 void
 472 URL::touch()
 473 {
 474     absolute_.clear();
 475     authorityHttp_.clear();
 476     authorityWithPort_.clear();
 477 }
 478
 479 SBuf &
 480 URL::authority(bool requirePort) const
 481 {
 482     if (authorityHttp_.isEmpty()) {
 483
 484         // both formats contain Host/IP
 485         authorityWithPort_.append(host());
 486         authorityHttp_ = authorityWithPort_;
 487
 488         // authorityForm_ only has :port if it is non-default
 489         authorityWithPort_.appendf(":%u",port());
 490         if (port() != getScheme().defaultPort())
 491             authorityHttp_ = authorityWithPort_;
 492     }
 493
 494     return requirePort ? authorityWithPort_ : authorityHttp_;
 495 }
 496
 497 SBuf &
 498 URL::absolute() const
 499 {
 500     if (absolute_.isEmpty()) {
 501         // TODO: most URL will be much shorter, avoid allocating this much
 502         absolute_.reserveCapacity(MAX_URL);
 503
 504         absolute_.append(getScheme().image());
 505         absolute_.append(":",1);
 506         if (getScheme() != AnyP::PROTO_URN) {
 507             absolute_.append("//", 2);
 508             const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
 509                                       getScheme() != AnyP::PROTO_HTTPS ||
 510                                       userInfo().isEmpty();
 511             if (!omitUserInfo) {
 512                 absolute_.append(userInfo());
 513                 absolute_.append("@", 1);
 514             }
 515             absolute_.append(authority());
 516         }
 517         absolute_.append(path());
 518     }
 519
 520     return absolute_;
 521 }
 522
 523 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 524  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 525  *        and never copy the query-string part in the first place
 526  */
 527 char *
 528 urlCanonicalClean(const HttpRequest * request)
 529 {
 530     LOCAL_ARRAY(char, buf, MAX_URL);
 531
 532     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri()));
 533     buf[sizeof(buf)-1] = '\0';
 534
 535     // URN, CONNECT method, and non-stripped URIs can go straight out
 536     if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) {
 537         // strip anything AFTER a question-mark
 538         // leaving the '?' in place
 539         if (auto t = strchr(buf, '?')) {
 540             *(++t) = '\0';
 541         }
 542     }
 543
 544     if (stringHasCntl(buf))
 545         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 546
 547     return buf;
 548 }
 549
 550 /**
 551  * Yet another alternative to urlCanonical.
 552  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 553  * for use in error page outputs.
 554  * Luckily we can leverage the others instead of duplicating.
 555  */
 556 const char *
 557 urlCanonicalFakeHttps(const HttpRequest * request)
 558 {
 559     LOCAL_ARRAY(char, buf, MAX_URL);
 560
 561     // method CONNECT and port HTTPS
 562     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 563         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 564         return buf;
 565     }
 566
 567     // else do the normal complete canonical thing.
 568     return urlCanonicalClean(request);
 569 }
 570
 571 /*
 572  * Test if a URL is relative.
 573  *
 574  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 575  * appear before a ':'.
 576  */
 577 bool
 578 urlIsRelative(const char *url)
 579 {
 580     const char *p;
 581
 582     if (url == NULL) {
 583         return (false);
 584     }
 585     if (*url == '\0') {
 586         return (false);
 587     }
 588
 589     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 590
 591     if (*p == ':') {
 592         return (false);
 593     }
 594     return (true);
 595 }
 596
 597 /*
 598  * Convert a relative URL to an absolute URL using the context of a given
 599  * request.
 600  *
 601  * It is assumed that you have already ensured that the URL is relative.
 602  *
 603  * If NULL is returned it is an indication that the method in use in the
 604  * request does not distinguish between relative and absolute and you should
 605  * use the url unchanged.
 606  *
 607  * If non-NULL is returned, it is up to the caller to free the resulting
 608  * memory using safe_free().
 609  */
 610 char *
 611 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 612 {
 613
 614     if (req->method.id() == Http::METHOD_CONNECT) {
 615         return (NULL);
 616     }
 617
 618     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 619
 620     if (req->url.getScheme() == AnyP::PROTO_URN) {
 621         // XXX: this is what the original code did, but it seems to break the
 622         // intended behaviour of this function. It returns the stored URN path,
 623         // not converting the given one into a URN...
 624         snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
 625         return (urlbuf);
 626     }
 627
 628     SBuf authorityForm = req->url.authority(); // host[:port]
 629     const SBuf &scheme = req->url.getScheme().image();
 630     size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
 631                              SQUIDSBUFPRINT(scheme),
 632                              SQUIDSBUFPRINT(req->url.userInfo()),
 633                              !req->url.userInfo().isEmpty() ? "@" : "",
 634                              SQUIDSBUFPRINT(authorityForm));
 635
 636     // if the first char is '/' assume its a relative path
 637     // XXX: this breaks on scheme-relative URLs,
 638     // but we should not see those outside ESI, and rarely there.
 639     // XXX: also breaks on any URL containing a '/' in the query-string portion
 640     if (relUrl[0] == '/') {
 641         xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 642     } else {
 643         SBuf path = req->url.path();
 644         SBuf::size_type lastSlashPos = path.rfind('/');
 645
 646         if (lastSlashPos == SBuf::npos) {
 647             // replace the whole path with the given bit(s)
 648             urlbuf[urllen] = '/';
 649             ++urllen;
 650             xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 651         } else {
 652             // replace only the last (file?) segment with the given bit(s)
 653             ++lastSlashPos;
 654             if (lastSlashPos > MAX_URL - urllen - 1) {
 655                 // XXX: crops bits in the middle of the combined URL.
 656                 lastSlashPos = MAX_URL - urllen - 1;
 657             }
 658             SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
 659             urllen += lastSlashPos;
 660             if (urllen + 1 < MAX_URL) {
 661                 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 662             }
 663         }
 664     }
 665
 666     return (urlbuf);
 667 }
 668
 669 int
 670 matchDomainName(const char *h, const char *d, bool honorWildcards)
 671 {
 672     int dl;
 673     int hl;
 674
 675     while ('.' == *h)
 676         ++h;
 677
 678     hl = strlen(h);
 679
 680     dl = strlen(d);
 681
 682     /*
 683      * Start at the ends of the two strings and work towards the
 684      * beginning.
 685      */
 686     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 687         if (hl == 0 && dl == 0) {
 688             /*
 689              * We made it all the way to the beginning of both
 690              * strings without finding any difference.
 691              */
 692             return 0;
 693         }
 694
 695         if (0 == hl) {
 696             /*
 697              * The host string is shorter than the domain string.
 698              * There is only one case when this can be a match.
 699              * If the domain is just one character longer, and if
 700              * that character is a leading '.' then we call it a
 701              * match.
 702              */
 703
 704             if (1 == dl && '.' == d[0])
 705                 return 0;
 706             else
 707                 return -1;
 708         }
 709
 710         if (0 == dl) {
 711             /*
 712              * The domain string is shorter than the host string.
 713              * This is a match only if the first domain character
 714              * is a leading '.'.
 715              */
 716
 717             if ('.' == d[0])
 718                 return 0;
 719             else
 720                 return 1;
 721         }
 722     }
 723
 724     /*
 725      * We found different characters in the same position (from the end).
 726      */
 727
 728     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 729     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 730     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 731     if (honorWildcards && h[hl] == '*' && h[hl + 1] == '.')
 732         return 0;
 733
 734     /*
 735      * If one of those character is '.' then its special.  In order
 736      * for splay tree sorting to work properly, "x-foo.com" must
 737      * be greater than ".foo.com" even though '-' is less than '.'.
 738      */
 739     if ('.' == d[dl])
 740         return 1;
 741
 742     if ('.' == h[hl])
 743         return -1;
 744
 745     return (xtolower(h[hl]) - xtolower(d[dl]));
 746 }
 747
 748 /*
 749  * return true if we can serve requests for this method.
 750  */
 751 int
 752 urlCheckRequest(const HttpRequest * r)
 753 {
 754     int rc = 0;
 755     /* protocol "independent" methods
 756      *
 757      * actually these methods are specific to HTTP:
 758      * they are methods we recieve on our HTTP port,
 759      * and if we had a FTP listener would not be relevant
 760      * there.
 761      *
 762      * So, we should delegate them to HTTP. The problem is that we
 763      * do not have a default protocol from the client side of HTTP.
 764      */
 765
 766     if (r->method == Http::METHOD_CONNECT)
 767         return 1;
 768
 769     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 770     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 771     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 772         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != URL::Asterisk());
 773
 774     if (r->method == Http::METHOD_PURGE)
 775         return 1;
 776
 777     /* does method match the protocol? */
 778     switch (r->url.getScheme()) {
 779
 780     case AnyP::PROTO_URN:
 781
 782     case AnyP::PROTO_HTTP:
 783
 784     case AnyP::PROTO_CACHE_OBJECT:
 785         rc = 1;
 786         break;
 787
 788     case AnyP::PROTO_FTP:
 789
 790         if (r->method == Http::METHOD_PUT)
 791             rc = 1;
 792
 793     case AnyP::PROTO_GOPHER:
 794
 795     case AnyP::PROTO_WAIS:
 796
 797     case AnyP::PROTO_WHOIS:
 798         if (r->method == Http::METHOD_GET)
 799             rc = 1;
 800         else if (r->method == Http::METHOD_HEAD)
 801             rc = 1;
 802
 803         break;
 804
 805     case AnyP::PROTO_HTTPS:
 806 #if USE_OPENSSL
 807         rc = 1;
 808 #else
 809         /*
 810         * Squid can't originate an SSL connection, so it should
 811         * never receive an "https:" URL.  It should always be
 812         * CONNECT instead.
 813         */
 814         rc = 0;
 815 #endif
 816         break;
 817
 818     default:
 819         break;
 820     }
 821
 822     return rc;
 823 }
 824
 825 /*
 826  * Quick-n-dirty host extraction from a URL.  Steps:
 827  *      Look for a colon
 828  *      Skip any '/' after the colon
 829  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 830  *      Look for an ending '/' or ':' and terminate
 831  *      Look for login info preceeded by '@'
 832  */
 833
 834 class URLHostName
 835 {
 836
 837 public:
 838     char * extract(char const *url);
 839
 840 private:
 841     static char Host [SQUIDHOSTNAMELEN];
 842     void init(char const *);
 843     void findHostStart();
 844     void trimTrailingChars();
 845     void trimAuth();
 846     char const *hostStart;
 847     char const *url;
 848 };
 849
 850 char *
 851 urlHostname(const char *url)
 852 {
 853     return URLHostName().extract(url);
 854 }
 855
 856 char URLHostName::Host[SQUIDHOSTNAMELEN];
 857
 858 void
 859 URLHostName::init(char const *aUrl)
 860 {
 861     Host[0] = '\0';
 862     url = aUrl;
 863 }
 864
 865 void
 866 URLHostName::findHostStart()
 867 {
 868     if (NULL == (hostStart = strchr(url, ':')))
 869         return;
 870
 871     ++hostStart;
 872
 873     while (*hostStart != '\0' && *hostStart == '/')
 874         ++hostStart;
 875
 876     if (*hostStart == ']')
 877         ++hostStart;
 878 }
 879
 880 void
 881 URLHostName::trimTrailingChars()
 882 {
 883     char *t;
 884
 885     if ((t = strchr(Host, '/')))
 886         *t = '\0';
 887
 888     if ((t = strrchr(Host, ':')))
 889         *t = '\0';
 890
 891     if ((t = strchr(Host, ']')))
 892         *t = '\0';
 893 }
 894
 895 void
 896 URLHostName::trimAuth()
 897 {
 898     char *t;
 899
 900     if ((t = strrchr(Host, '@'))) {
 901         ++t;
 902         memmove(Host, t, strlen(t) + 1);
 903     }
 904 }
 905
 906 char *
 907 URLHostName::extract(char const *aUrl)
 908 {
 909     init(aUrl);
 910     findHostStart();
 911
 912     if (hostStart == NULL)
 913         return NULL;
 914
 915     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 916
 917     trimTrailingChars();
 918
 919     trimAuth();
 920
 921     return Host;
 922 }
 923
 924 URL::URL(AnyP::UriScheme const &aScheme) :
 925     scheme_(aScheme),
 926     hostIsNumeric_(false),
 927     port_(0)
 928 {
 929     *host_=0;
 930 }
 931