src/url.cc

   1 /*
   2  * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 23    URL Parsing */
  10
  11 #include "squid.h"
  12 #include "globals.h"
  13 #include "HttpRequest.h"
  14 #include "rfc1738.h"
  15 #include "SquidConfig.h"
  16 #include "SquidString.h"
  17 #include "URL.h"
  18
  19 static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
  20                                    const AnyP::ProtocolType protocol,
  21                                    const char *const protoStr,
  22                                    const char *const urlpath,
  23                                    const char *const host,
  24                                    const SBuf &login,
  25                                    const int port,
  26                                    HttpRequest *request);
  27 static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
  28 static const char valid_hostname_chars_u[] =
  29     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  30     "abcdefghijklmnopqrstuvwxyz"
  31     "0123456789-._"
  32     "[:]"
  33     ;
  34 static const char valid_hostname_chars[] =
  35     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  36     "abcdefghijklmnopqrstuvwxyz"
  37     "0123456789-."
  38     "[:]"
  39     ;
  40
  41 const SBuf &
  42 URL::Asterisk()
  43 {
  44     static SBuf star("*");
  45     return star;
  46 }
  47
  48 const SBuf &
  49 URL::SlashPath()
  50 {
  51     static SBuf slash("/");
  52     return slash;
  53 }
  54
  55 void
  56 URL::host(const char *src)
  57 {
  58     hostAddr_.setEmpty();
  59     hostAddr_ = src;
  60     if (hostAddr_.isAnyAddr()) {
  61         xstrncpy(host_, src, sizeof(host_));
  62         hostIsNumeric_ = false;
  63     } else {
  64         hostAddr_.toHostStr(host_, sizeof(host_));
  65         debugs(23, 3, "given IP: " << hostAddr_);
  66         hostIsNumeric_ = 1;
  67     }
  68     touch();
  69 }
  70
  71 const SBuf &
  72 URL::path() const
  73 {
  74     // RFC 3986 section 3.3 says path can be empty (path-abempty).
  75     // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
  76     // at least when sending and using. We must still accept path-abempty as input.
  77     if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
  78         return SlashPath();
  79
  80     return path_;
  81 }
  82
  83 void
  84 urlInitialize(void)
  85 {
  86     debugs(23, 5, "urlInitialize: Initializing...");
  87     /* this ensures that the number of protocol strings is the same as
  88      * the enum slots allocated because the last enum is always 'MAX'.
  89      */
  90     assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
  91     /*
  92      * These test that our matchDomainName() function works the
  93      * way we expect it to.
  94      */
  95     assert(0 == matchDomainName("foo.com", "foo.com"));
  96     assert(0 == matchDomainName(".foo.com", "foo.com"));
  97     assert(0 == matchDomainName("foo.com", ".foo.com"));
  98     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  99     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
 100     assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
 101     assert(0 != matchDomainName("x.foo.com", "foo.com"));
 102     assert(0 != matchDomainName("foo.com", "x.foo.com"));
 103     assert(0 != matchDomainName("bar.com", "foo.com"));
 104     assert(0 != matchDomainName(".bar.com", "foo.com"));
 105     assert(0 != matchDomainName(".bar.com", ".foo.com"));
 106     assert(0 != matchDomainName("bar.com", ".foo.com"));
 107     assert(0 < matchDomainName("zzz.com", "foo.com"));
 108     assert(0 > matchDomainName("aaa.com", "foo.com"));
 109     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 110     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 111     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 112     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 113
 114     assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
 115     assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 116     assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 117     assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
 118
 119     assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
 120     assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
 121     assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
 122     assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
 123
 124     /* more cases? */
 125 }
 126
 127 /**
 128  * Parse the scheme name from string b, into protocol type.
 129  * The string must be 0-terminated.
 130  */
 131 AnyP::ProtocolType
 132 urlParseProtocol(const char *b)
 133 {
 134     // make e point to the ':' character
 135     const char *e = b + strcspn(b, ":");
 136     int len = e - b;
 137
 138     /* test common stuff first */
 139
 140     if (strncasecmp(b, "http", len) == 0)
 141         return AnyP::PROTO_HTTP;
 142
 143     if (strncasecmp(b, "ftp", len) == 0)
 144         return AnyP::PROTO_FTP;
 145
 146     if (strncasecmp(b, "https", len) == 0)
 147         return AnyP::PROTO_HTTPS;
 148
 149     if (strncasecmp(b, "file", len) == 0)
 150         return AnyP::PROTO_FTP;
 151
 152     if (strncasecmp(b, "coap", len) == 0)
 153         return AnyP::PROTO_COAP;
 154
 155     if (strncasecmp(b, "coaps", len) == 0)
 156         return AnyP::PROTO_COAPS;
 157
 158     if (strncasecmp(b, "gopher", len) == 0)
 159         return AnyP::PROTO_GOPHER;
 160
 161     if (strncasecmp(b, "wais", len) == 0)
 162         return AnyP::PROTO_WAIS;
 163
 164     if (strncasecmp(b, "cache_object", len) == 0)
 165         return AnyP::PROTO_CACHE_OBJECT;
 166
 167     if (strncasecmp(b, "urn", len) == 0)
 168         return AnyP::PROTO_URN;
 169
 170     if (strncasecmp(b, "whois", len) == 0)
 171         return AnyP::PROTO_WHOIS;
 172
 173     if (len > 0)
 174         return AnyP::PROTO_UNKNOWN;
 175
 176     return AnyP::PROTO_NONE;
 177 }
 178
/*
 * Parse a URI/URL.
 *
 * If the 'request' arg is non-NULL, put parsed values there instead
 * of allocating a new HttpRequest.
 *
 * This abuses HttpRequest as a way of representing the parsed URL
 * and its components.
 * The method is used to switch parsers and to init the HttpRequest.
 * If the method is Http::METHOD_CONNECT, then a hostname:port is looked
 * for rather than a URL.
 * The url is non-const so that, if it is too long, we can NUL-terminate it in place.
 */
 192
/*
 * This routine parses a URL. It is assumed that the URL is complete -
 * i.e., the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (i.e., it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 */
 200 HttpRequest *
 201 urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
 202 {
 203     LOCAL_ARRAY(char, proto, MAX_URL);
 204     LOCAL_ARRAY(char, login, MAX_URL);
 205     LOCAL_ARRAY(char, host, MAX_URL);
 206     LOCAL_ARRAY(char, urlpath, MAX_URL);
 207     char *t = NULL;
 208     char *q = NULL;
 209     int port;
 210     AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
 211     int l;
 212     int i;
 213     const char *src;
 214     char *dst;
 215     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 216
 217     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 218         /* terminate so it doesn't overflow other buffers */
 219         *(url + (MAX_URL >> 1)) = '\0';
 220         debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
 221         return NULL;
 222     }
 223     if (method == Http::METHOD_CONNECT) {
 224         port = CONNECT_PORT;
 225
 226         if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
 227             if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 228                 return NULL;
 229
 230     } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
 231                URL::Asterisk().cmp(url) == 0) {
 232         protocol = AnyP::PROTO_HTTP;
 233         port = 80; // or the slow way ...  AnyP::UriScheme(protocol,"http").defaultPort();
 234         return urlParseFinish(method, protocol, "http", url, host, SBuf(), port, request);
 235     } else if (!strncmp(url, "urn:", 4)) {
 236         return urnParse(method, url, request);
 237     } else {
 238         /* Parse the URL: */
 239         src = url;
 240         i = 0;
 241         /* Find first : - everything before is protocol */
 242         for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
 243             *dst = *src;
 244         }
 245         if (i >= l)
 246             return NULL;
 247         *dst = '\0';
 248
 249         /* Then its :// */
 250         if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
 251             return NULL;
 252         i += 3;
 253         src += 3;
 254
 255         /* Then everything until first /; thats host (and port; which we'll look for here later) */
 256         // bug 1881: If we don't get a "/" then we imply it was there
 257         // bug 3074: We could just be given a "?" or "#". These also imply "/"
 258         // bug 3233: whitespace is also a hostname delimiter.
 259         for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
 260             *dst = *src;
 261         }
 262
 263         /*
 264          * We can't check for "i >= l" here because we could be at the end of the line
 265          * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
 266          * been -given- a valid URL and the path is just '/'.
 267          */
 268         if (i > l)
 269             return NULL;
 270         *dst = '\0';
 271
 272         // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
 273         if (*src == '?' || *src == '#' || *src == '\0') {
 274             urlpath[0] = '/';
 275             dst = &urlpath[1];
 276         } else {
 277             dst = urlpath;
 278         }
 279         /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
 280         for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
 281             *dst = *src;
 282         }
 283
 284         /* We -could- be at the end of the buffer here */
 285         if (i > l)
 286             return NULL;
 287         /* If the URL path is empty we set it to be "/" */
 288         if (dst == urlpath) {
 289             *dst = '/';
 290             ++dst;
 291         }
 292         *dst = '\0';
 293
 294         protocol = urlParseProtocol(proto);
 295         port = AnyP::UriScheme(protocol).defaultPort();
 296
 297         /* Is there any login information? (we should eventually parse it above) */
 298         t = strrchr(host, '@');
 299         if (t != NULL) {
 300             strncpy((char *) login, (char *) host, sizeof(login)-1);
 301             login[sizeof(login)-1] = '\0';
 302             t = strrchr(login, '@');
 303             *t = 0;
 304             strncpy((char *) host, t + 1, sizeof(host)-1);
 305             host[sizeof(host)-1] = '\0';
 306             // Bug 4498: URL-unescape the login info after extraction
 307             rfc1738_unescape(login);
 308         }
 309
 310         /* Is there any host information? (we should eventually parse it above) */
 311         if (*host == '[') {
 312             /* strip any IPA brackets. valid under IPv6. */
 313             dst = host;
 314             /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
 315             src = host;
 316             ++src;
 317             l = strlen(host);
 318             i = 1;
 319             for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
 320                 *dst = *src;
 321             }
 322
 323             /* we moved in-place, so truncate the actual hostname found */
 324             *dst = '\0';
 325             ++dst;
 326
 327             /* skip ahead to either start of port, or original EOS */
 328             while (*dst != '\0' && *dst != ':')
 329                 ++dst;
 330             t = dst;
 331         } else {
 332             t = strrchr(host, ':');
 333
 334             if (t != strchr(host,':') ) {
 335                 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
 336                 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
 337                 /* therefore we MUST accept the case where they are not bracketed at all. */
 338                 t = NULL;
 339             }
 340         }
 341
 342         // Bug 3183 sanity check: If scheme is present, host must be too.
 343         if (protocol != AnyP::PROTO_NONE && host[0] == '\0') {
 344             debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
 345             return NULL;
 346         }
 347
 348         if (t && *t == ':') {
 349             *t = '\0';
 350             ++t;
 351             port = atoi(t);
 352         }
 353     }
 354
 355     for (t = host; *t; ++t)
 356         *t = xtolower(*t);
 357
 358     if (stringHasWhitespace(host)) {
 359         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 360             t = q = host;
 361             while (*t) {
 362                 if (!xisspace(*t)) {
 363                     *q = *t;
 364                     ++q;
 365                 }
 366                 ++t;
 367             }
 368             *q = '\0';
 369         }
 370     }
 371
 372     debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
 373
 374     if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
 375         debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
 376         return NULL;
 377     }
 378
 379     /* For IPV6 addresses also check for a colon */
 380     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
 381         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 382
 383     /* remove trailing dots from hostnames */
 384     while ((l = strlen(host)) > 0 && host[--l] == '.')
 385         host[l] = '\0';
 386
 387     /* reject duplicate or leading dots */
 388     if (strstr(host, "..") || *host == '.') {
 389         debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
 390         return NULL;
 391     }
 392
 393     if (port < 1 || port > 65535) {
 394         debugs(23, 3, "urlParse: Invalid port '" << port << "'");
 395         return NULL;
 396     }
 397
 398 #if HARDCODE_DENY_PORTS
 399     /* These ports are filtered in the default squid.conf, but
 400      * maybe someone wants them hardcoded... */
 401     if (port == 7 || port == 9 || port == 19) {
 402         debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
 403         return NULL;
 404     }
 405 #endif
 406
 407     if (stringHasWhitespace(urlpath)) {
 408         debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
 409
 410         switch (Config.uri_whitespace) {
 411
 412         case URI_WHITESPACE_DENY:
 413             return NULL;
 414
 415         case URI_WHITESPACE_ALLOW:
 416             break;
 417
 418         case URI_WHITESPACE_ENCODE:
 419             t = rfc1738_escape_unescaped(urlpath);
 420             xstrncpy(urlpath, t, MAX_URL);
 421             break;
 422
 423         case URI_WHITESPACE_CHOP:
 424             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 425             break;
 426
 427         case URI_WHITESPACE_STRIP:
 428         default:
 429             t = q = urlpath;
 430             while (*t) {
 431                 if (!xisspace(*t)) {
 432                     *q = *t;
 433                     ++q;
 434                 }
 435                 ++t;
 436             }
 437             *q = '\0';
 438         }
 439     }
 440
 441     return urlParseFinish(method, protocol, proto, urlpath, host, SBuf(login), port, request);
 442 }
 443
 444 /**
 445  * Update request with parsed URI data.  If the request arg is
 446  * non-NULL, put parsed values there instead of allocating a new
 447  * HttpRequest.
 448  */
 449 static HttpRequest *
 450 urlParseFinish(const HttpRequestMethod& method,
 451                const AnyP::ProtocolType protocol,
 452                const char *const protoStr, // for unknown protocols
 453                const char *const urlpath,
 454                const char *const host,
 455                const SBuf &login,
 456                const int port,
 457                HttpRequest *request)
 458 {
 459     if (NULL == request)
 460         request = new HttpRequest(method, protocol, protoStr, urlpath);
 461     else {
 462         request->initHTTP(method, protocol, protoStr, urlpath);
 463     }
 464
 465     request->url.host(host);
 466     request->url.userInfo(login);
 467     request->url.port(port);
 468     return request;
 469 }
 470
 471 static HttpRequest *
 472 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
 473 {
 474     debugs(50, 5, "urnParse: " << urn);
 475     if (request) {
 476         request->initHTTP(method, AnyP::PROTO_URN, "urn", urn + 4);
 477         return request;
 478     }
 479
 480     return new HttpRequest(method, AnyP::PROTO_URN, "urn", urn + 4);
 481 }
 482
 483 void
 484 URL::touch()
 485 {
 486     absolute_.clear();
 487     authorityHttp_.clear();
 488     authorityWithPort_.clear();
 489 }
 490
 491 SBuf &
 492 URL::authority(bool requirePort) const
 493 {
 494     if (authorityHttp_.isEmpty()) {
 495
 496         // both formats contain Host/IP
 497         authorityWithPort_.append(host());
 498         authorityHttp_ = authorityWithPort_;
 499
 500         // authorityForm_ only has :port if it is non-default
 501         authorityWithPort_.appendf(":%u",port());
 502         if (port() != getScheme().defaultPort())
 503             authorityHttp_ = authorityWithPort_;
 504     }
 505
 506     return requirePort ? authorityWithPort_ : authorityHttp_;
 507 }
 508
 509 SBuf &
 510 URL::absolute() const
 511 {
 512     if (absolute_.isEmpty()) {
 513         // TODO: most URL will be much shorter, avoid allocating this much
 514         absolute_.reserveCapacity(MAX_URL);
 515
 516         absolute_.append(getScheme().image());
 517         absolute_.append(":",1);
 518         if (getScheme() != AnyP::PROTO_URN) {
 519             absolute_.append("//", 2);
 520             const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
 521                                       getScheme() != AnyP::PROTO_HTTPS ||
 522                                       userInfo().isEmpty();
 523             if (!omitUserInfo) {
 524                 absolute_.append(userInfo());
 525                 absolute_.append("@", 1);
 526             }
 527             absolute_.append(authority());
 528         }
 529         absolute_.append(path());
 530     }
 531
 532     return absolute_;
 533 }
 534
 535 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
 536  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 537  *        and never copy the query-string part in the first place
 538  */
 539 char *
 540 urlCanonicalClean(const HttpRequest * request)
 541 {
 542     LOCAL_ARRAY(char, buf, MAX_URL);
 543
 544     snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri()));
 545     buf[sizeof(buf)-1] = '\0';
 546
 547     // URN, CONNECT method, and non-stripped URIs can go straight out
 548     if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) {
 549         // strip anything AFTER a question-mark
 550         // leaving the '?' in place
 551         if (auto t = strchr(buf, '?')) {
 552             *(++t) = '\0';
 553         }
 554     }
 555
 556     if (stringHasCntl(buf))
 557         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 558
 559     return buf;
 560 }
 561
 562 /**
 563  * Yet another alternative to urlCanonical.
 564  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 565  * for use in error page outputs.
 566  * Luckily we can leverage the others instead of duplicating.
 567  */
 568 const char *
 569 urlCanonicalFakeHttps(const HttpRequest * request)
 570 {
 571     LOCAL_ARRAY(char, buf, MAX_URL);
 572
 573     // method CONNECT and port HTTPS
 574     if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
 575         snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
 576         return buf;
 577     }
 578
 579     // else do the normal complete canonical thing.
 580     return urlCanonicalClean(request);
 581 }
 582
 583 /*
 584  * Test if a URL is relative.
 585  *
 586  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 587  * appear before a ':'.
 588  */
 589 bool
 590 urlIsRelative(const char *url)
 591 {
 592     const char *p;
 593
 594     if (url == NULL) {
 595         return (false);
 596     }
 597     if (*url == '\0') {
 598         return (false);
 599     }
 600
 601     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 602
 603     if (*p == ':') {
 604         return (false);
 605     }
 606     return (true);
 607 }
 608
 609 /*
 610  * Convert a relative URL to an absolute URL using the context of a given
 611  * request.
 612  *
 613  * It is assumed that you have already ensured that the URL is relative.
 614  *
 615  * If NULL is returned it is an indication that the method in use in the
 616  * request does not distinguish between relative and absolute and you should
 617  * use the url unchanged.
 618  *
 619  * If non-NULL is returned, it is up to the caller to free the resulting
 620  * memory using safe_free().
 621  */
 622 char *
 623 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 624 {
 625
 626     if (req->method.id() == Http::METHOD_CONNECT) {
 627         return (NULL);
 628     }
 629
 630     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 631
 632     if (req->url.getScheme() == AnyP::PROTO_URN) {
 633         // XXX: this is what the original code did, but it seems to break the
 634         // intended behaviour of this function. It returns the stored URN path,
 635         // not converting the given one into a URN...
 636         snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
 637         return (urlbuf);
 638     }
 639
 640     SBuf authorityForm = req->url.authority(); // host[:port]
 641     const SBuf &scheme = req->url.getScheme().image();
 642     size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
 643                              SQUIDSBUFPRINT(scheme),
 644                              SQUIDSBUFPRINT(req->url.userInfo()),
 645                              !req->url.userInfo().isEmpty() ? "@" : "",
 646                              SQUIDSBUFPRINT(authorityForm));
 647
 648     // if the first char is '/' assume its a relative path
 649     // XXX: this breaks on scheme-relative URLs,
 650     // but we should not see those outside ESI, and rarely there.
 651     // XXX: also breaks on any URL containing a '/' in the query-string portion
 652     if (relUrl[0] == '/') {
 653         xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 654     } else {
 655         SBuf path = req->url.path();
 656         SBuf::size_type lastSlashPos = path.rfind('/');
 657
 658         if (lastSlashPos == SBuf::npos) {
 659             // replace the whole path with the given bit(s)
 660             urlbuf[urllen] = '/';
 661             ++urllen;
 662             xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 663         } else {
 664             // replace only the last (file?) segment with the given bit(s)
 665             ++lastSlashPos;
 666             if (lastSlashPos > MAX_URL - urllen - 1) {
 667                 // XXX: crops bits in the middle of the combined URL.
 668                 lastSlashPos = MAX_URL - urllen - 1;
 669             }
 670             SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
 671             urllen += lastSlashPos;
 672             if (urllen + 1 < MAX_URL) {
 673                 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 674             }
 675         }
 676     }
 677
 678     return (urlbuf);
 679 }
 680
 681 int
 682 matchDomainName(const char *h, const char *d, uint flags)
 683 {
 684     int dl;
 685     int hl;
 686
 687     const bool hostIncludesSubdomains = (*h == '.');
 688     while ('.' == *h)
 689         ++h;
 690
 691     hl = strlen(h);
 692
 693     if (hl == 0)
 694         return -1;
 695
 696     dl = strlen(d);
 697
 698     /*
 699      * Start at the ends of the two strings and work towards the
 700      * beginning.
 701      */
 702     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 703         if (hl == 0 && dl == 0) {
 704             /*
 705              * We made it all the way to the beginning of both
 706              * strings without finding any difference.
 707              */
 708             return 0;
 709         }
 710
 711         if (0 == hl) {
 712             /*
 713              * The host string is shorter than the domain string.
 714              * There is only one case when this can be a match.
 715              * If the domain is just one character longer, and if
 716              * that character is a leading '.' then we call it a
 717              * match.
 718              */
 719
 720             if (1 == dl && '.' == d[0])
 721                 return 0;
 722             else
 723                 return -1;
 724         }
 725
 726         if (0 == dl) {
 727             /*
 728              * The domain string is shorter than the host string.
 729              * This is a match only if the first domain character
 730              * is a leading '.'.
 731              */
 732
 733             if ('.' == d[0]) {
 734                 if (flags & mdnRejectSubsubDomains) {
 735                     // Check for sub-sub domain and reject
 736                     while(--hl >= 0 && h[hl] != '.');
 737                     if (hl < 0) {
 738                         // No sub-sub domain found, but reject if there is a
 739                         // leading dot in given host string (which is removed
 740                         // before the check is started).
 741                         return hostIncludesSubdomains ? 1 : 0;
 742                     } else
 743                         return 1; // sub-sub domain, reject
 744                 } else
 745                     return 0;
 746             } else
 747                 return 1;
 748         }
 749     }
 750
 751     /*
 752      * We found different characters in the same position (from the end).
 753      */
 754
 755     // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
 756     // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
 757     // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
 758     if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
 759         return 0;
 760
 761     /*
 762      * If one of those character is '.' then its special.  In order
 763      * for splay tree sorting to work properly, "x-foo.com" must
 764      * be greater than ".foo.com" even though '-' is less than '.'.
 765      */
 766     if ('.' == d[dl])
 767         return 1;
 768
 769     if ('.' == h[hl])
 770         return -1;
 771
 772     return (xtolower(h[hl]) - xtolower(d[dl]));
 773 }
 774
 775 /*
 776  * return true if we can serve requests for this method.
 777  */
 778 int
 779 urlCheckRequest(const HttpRequest * r)
 780 {
 781     int rc = 0;
 782     /* protocol "independent" methods
 783      *
 784      * actually these methods are specific to HTTP:
 785      * they are methods we recieve on our HTTP port,
 786      * and if we had a FTP listener would not be relevant
 787      * there.
 788      *
 789      * So, we should delegate them to HTTP. The problem is that we
 790      * do not have a default protocol from the client side of HTTP.
 791      */
 792
 793     if (r->method == Http::METHOD_CONNECT)
 794         return 1;
 795
 796     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 797     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 798     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 799         return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != URL::Asterisk());
 800
 801     if (r->method == Http::METHOD_PURGE)
 802         return 1;
 803
 804     /* does method match the protocol? */
 805     switch (r->url.getScheme()) {
 806
 807     case AnyP::PROTO_URN:
 808
 809     case AnyP::PROTO_HTTP:
 810
 811     case AnyP::PROTO_CACHE_OBJECT:
 812         rc = 1;
 813         break;
 814
 815     case AnyP::PROTO_FTP:
 816
 817         if (r->method == Http::METHOD_PUT)
 818             rc = 1;
 819
 820     case AnyP::PROTO_GOPHER:
 821
 822     case AnyP::PROTO_WAIS:
 823
 824     case AnyP::PROTO_WHOIS:
 825         if (r->method == Http::METHOD_GET)
 826             rc = 1;
 827         else if (r->method == Http::METHOD_HEAD)
 828             rc = 1;
 829
 830         break;
 831
 832     case AnyP::PROTO_HTTPS:
 833 #if USE_OPENSSL
 834         rc = 1;
 835 #else
 836         /*
 837         * Squid can't originate an SSL connection, so it should
 838         * never receive an "https:" URL.  It should always be
 839         * CONNECT instead.
 840         */
 841         rc = 0;
 842 #endif
 843         break;
 844
 845     default:
 846         break;
 847     }
 848
 849     return rc;
 850 }
 851
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy up to SQUIDHOSTNAMELEN bytes to Host[]
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@'
 */
 860
 861 class URLHostName
 862 {
 863
 864 public:
 865     char * extract(char const *url);
 866
 867 private:
 868     static char Host [SQUIDHOSTNAMELEN];
 869     void init(char const *);
 870     void findHostStart();
 871     void trimTrailingChars();
 872     void trimAuth();
 873     char const *hostStart;
 874     char const *url;
 875 };
 876
 877 char *
 878 urlHostname(const char *url)
 879 {
 880     return URLHostName().extract(url);
 881 }
 882
 883 char URLHostName::Host[SQUIDHOSTNAMELEN];
 884
 885 void
 886 URLHostName::init(char const *aUrl)
 887 {
 888     Host[0] = '\0';
 889     url = aUrl;
 890 }
 891
 892 void
 893 URLHostName::findHostStart()
 894 {
 895     if (NULL == (hostStart = strchr(url, ':')))
 896         return;
 897
 898     ++hostStart;
 899
 900     while (*hostStart != '\0' && *hostStart == '/')
 901         ++hostStart;
 902
 903     if (*hostStart == ']')
 904         ++hostStart;
 905 }
 906
 907 void
 908 URLHostName::trimTrailingChars()
 909 {
 910     char *t;
 911
 912     if ((t = strchr(Host, '/')))
 913         *t = '\0';
 914
 915     if ((t = strrchr(Host, ':')))
 916         *t = '\0';
 917
 918     if ((t = strchr(Host, ']')))
 919         *t = '\0';
 920 }
 921
 922 void
 923 URLHostName::trimAuth()
 924 {
 925     char *t;
 926
 927     if ((t = strrchr(Host, '@'))) {
 928         ++t;
 929         memmove(Host, t, strlen(t) + 1);
 930     }
 931 }
 932
 933 char *
 934 URLHostName::extract(char const *aUrl)
 935 {
 936     init(aUrl);
 937     findHostStart();
 938
 939     if (hostStart == NULL)
 940         return NULL;
 941
 942     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 943
 944     trimTrailingChars();
 945
 946     trimAuth();
 947
 948     return Host;
 949 }
 950
 951 URL::URL(AnyP::UriScheme const &aScheme) :
 952     scheme_(aScheme),
 953     hostIsNumeric_(false),
 954     port_(0)
 955 {
 956     *host_=0;
 957 }
 958