src/url.cc

   1
   2 /*
   3  * DEBUG: section 23    URL Parsing
   4  * AUTHOR: Duane Wessels
   5  *
   6  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   7  * ----------------------------------------------------------
   8  *
   9  *  Squid is the result of efforts by numerous individuals from
  10  *  the Internet community; see the CONTRIBUTORS file for full
  11  *  details.   Many organizations have provided support for Squid's
  12  *  development; see the SPONSORS file for full details.  Squid is
  13  *  Copyrighted (C) 2001 by the Regents of the University of
  14  *  California; see the COPYRIGHT file for full details.  Squid
  15  *  incorporates software developed and/or copyrighted by other
  16  *  sources; see the CREDITS file for full details.
  17  *
  18  *  This program is free software; you can redistribute it and/or modify
  19  *  it under the terms of the GNU General Public License as published by
  20  *  the Free Software Foundation; either version 2 of the License, or
  21  *  (at your option) any later version.
  22  *
  23  *  This program is distributed in the hope that it will be useful,
  24  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  25  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  26  *  GNU General Public License for more details.
  27  *
  28  *  You should have received a copy of the GNU General Public License
  29  *  along with this program; if not, write to the Free Software
  30  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  31  *
  32  */
  33
  34 #include "squid.h"
  35 #include "globals.h"
  36 #include "HttpRequest.h"
  37 #include "rfc1738.h"
  38 #include "SquidConfig.h"
  39 #include "SquidString.h"
  40 #include "URL.h"
  41
  42 static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
  43                                    const AnyP::ProtocolType protocol,
  44                                    const char *const urlpath,
  45                                    const char *const host,
  46                                    const char *const login,
  47                                    const int port,
  48                                    HttpRequest *request);
  49 static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
  50 static const char valid_hostname_chars_u[] =
  51     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  52     "abcdefghijklmnopqrstuvwxyz"
  53     "0123456789-._"
  54     "[:]"
  55     ;
  56 static const char valid_hostname_chars[] =
  57     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  58     "abcdefghijklmnopqrstuvwxyz"
  59     "0123456789-."
  60     "[:]"
  61     ;
  62
  63 void
  64 urlInitialize(void)
  65 {
  66     debugs(23, 5, "urlInitialize: Initializing...");
  67     /* this ensures that the number of protocol strings is the same as
  68      * the enum slots allocated because the last enum is always 'MAX'.
  69      */
  70     assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
  71     /*
  72      * These test that our matchDomainName() function works the
  73      * way we expect it to.
  74      */
  75     assert(0 == matchDomainName("foo.com", "foo.com"));
  76     assert(0 == matchDomainName(".foo.com", "foo.com"));
  77     assert(0 == matchDomainName("foo.com", ".foo.com"));
  78     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  79     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
  80     assert(0 != matchDomainName("x.foo.com", "foo.com"));
  81     assert(0 != matchDomainName("foo.com", "x.foo.com"));
  82     assert(0 != matchDomainName("bar.com", "foo.com"));
  83     assert(0 != matchDomainName(".bar.com", "foo.com"));
  84     assert(0 != matchDomainName(".bar.com", ".foo.com"));
  85     assert(0 != matchDomainName("bar.com", ".foo.com"));
  86     assert(0 < matchDomainName("zzz.com", "foo.com"));
  87     assert(0 > matchDomainName("aaa.com", "foo.com"));
  88     assert(0 == matchDomainName("FOO.com", "foo.COM"));
  89     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
  90     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
  91     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
  92     /* more cases? */
  93 }
  94
  95 /**
  96  * urlParseProtocol() takes begin (b) and end (e) pointers, but for
  97  * backwards compatibility, e defaults to NULL, in which case we
  98  * assume b is NULL-terminated.
  99  */
 100 AnyP::ProtocolType
 101 urlParseProtocol(const char *b, const char *e)
 102 {
 103     /*
 104      * if e is NULL, b must be NULL terminated and we
 105      * make e point to the first whitespace character
 106      * after b.
 107      */
 108
 109     if (NULL == e)
 110         e = b + strcspn(b, ":");
 111
 112     int len = e - b;
 113
 114     /* test common stuff first */
 115
 116     if (strncasecmp(b, "http", len) == 0)
 117         return AnyP::PROTO_HTTP;
 118
 119     if (strncasecmp(b, "ftp", len) == 0)
 120         return AnyP::PROTO_FTP;
 121
 122     if (strncasecmp(b, "https", len) == 0)
 123         return AnyP::PROTO_HTTPS;
 124
 125     if (strncasecmp(b, "file", len) == 0)
 126         return AnyP::PROTO_FTP;
 127
 128     if (strncasecmp(b, "coap", len) == 0)
 129         return AnyP::PROTO_COAP;
 130
 131     if (strncasecmp(b, "coaps", len) == 0)
 132         return AnyP::PROTO_COAPS;
 133
 134     if (strncasecmp(b, "gopher", len) == 0)
 135         return AnyP::PROTO_GOPHER;
 136
 137     if (strncasecmp(b, "wais", len) == 0)
 138         return AnyP::PROTO_WAIS;
 139
 140     if (strncasecmp(b, "cache_object", len) == 0)
 141         return AnyP::PROTO_CACHE_OBJECT;
 142
 143     if (strncasecmp(b, "urn", len) == 0)
 144         return AnyP::PROTO_URN;
 145
 146     if (strncasecmp(b, "whois", len) == 0)
 147         return AnyP::PROTO_WHOIS;
 148
 149     return AnyP::PROTO_NONE;
 150 }
 151
 152 int
 153 urlDefaultPort(AnyP::ProtocolType p)
 154 {
 155     switch (p) {
 156
 157     case AnyP::PROTO_HTTP:
 158         return 80;
 159
 160     case AnyP::PROTO_HTTPS:
 161         return 443;
 162
 163     case AnyP::PROTO_FTP:
 164         return 21;
 165
 166     case AnyP::PROTO_COAP:
 167     case AnyP::PROTO_COAPS:
 168         // coaps:// default is TBA as of draft-ietf-core-coap-08.
 169         // Assuming IANA policy of allocating same port for base and TLS protocol versions will occur.
 170         return 5683;
 171
 172     case AnyP::PROTO_GOPHER:
 173         return 70;
 174
 175     case AnyP::PROTO_WAIS:
 176         return 210;
 177
 178     case AnyP::PROTO_CACHE_OBJECT:
 179         return CACHE_HTTP_PORT;
 180
 181     case AnyP::PROTO_WHOIS:
 182         return 43;
 183
 184     default:
 185         return 0;
 186     }
 187 }
 188
 189 /*
 190  * Parse a URI/URL.
 191  *
 192  * If the 'request' arg is non-NULL, put parsed values there instead
 193  * of allocating a new HttpRequest.
 194  *
 195  * This abuses HttpRequest as a way of representing the parsed url
 196  * and its components.
 197  * method is used to switch parsers and to init the HttpRequest.
 198  * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
 199  * looked for.
 200  * The url is non const so that if its too long we can NULL-terminate it in place.
 201  */
 202
 203 /*
 204  * This routine parses a URL. Its assumed that the URL is complete -
 205  * ie, the end of the string is the end of the URL. Don't pass a partial
 206  * URL here as this routine doesn't have any way of knowing whether
 207  * its partial or not (ie, it handles the case of no trailing slash as
 208  * being "end of host with implied path of /".
 209  */
 210 HttpRequest *
 211 urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
 212 {
 213     LOCAL_ARRAY(char, proto, MAX_URL);
 214     LOCAL_ARRAY(char, login, MAX_URL);
 215     LOCAL_ARRAY(char, host, MAX_URL);
 216     LOCAL_ARRAY(char, urlpath, MAX_URL);
 217     char *t = NULL;
 218     char *q = NULL;
 219     int port;
 220     AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
 221     int l;
 222     int i;
 223     const char *src;
 224     char *dst;
 225     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 226
 227     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 228         /* terminate so it doesn't overflow other buffers */
 229         *(url + (MAX_URL >> 1)) = '\0';
 230         debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
 231         return NULL;
 232     }
 233     if (method == Http::METHOD_CONNECT) {
 234         port = CONNECT_PORT;
 235
 236         if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
 237             if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 238                 return NULL;
 239
 240     } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
 241                strcmp(url, "*") == 0) {
 242         protocol = AnyP::PROTO_HTTP;
 243         port = urlDefaultPort(protocol);
 244         return urlParseFinish(method, protocol, url, host, login, port, request);
 245     } else if (!strncmp(url, "urn:", 4)) {
 246         return urnParse(method, url, request);
 247     } else {
 248         /* Parse the URL: */
 249         src = url;
 250         i = 0;
 251         /* Find first : - everything before is protocol */
 252         for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
 253             *dst = *src;
 254         }
 255         if (i >= l)
 256             return NULL;
 257         *dst = '\0';
 258
 259         /* Then its :// */
 260         if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
 261             return NULL;
 262         i += 3;
 263         src += 3;
 264
 265         /* Then everything until first /; thats host (and port; which we'll look for here later) */
 266         // bug 1881: If we don't get a "/" then we imply it was there
 267         // bug 3074: We could just be given a "?" or "#". These also imply "/"
 268         // bug 3233: whitespace is also a hostname delimiter.
 269         for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
 270             *dst = *src;
 271         }
 272
 273         /*
 274          * We can't check for "i >= l" here because we could be at the end of the line
 275          * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
 276          * been -given- a valid URL and the path is just '/'.
 277          */
 278         if (i > l)
 279             return NULL;
 280         *dst = '\0';
 281
 282         // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
 283         if (*src == '?' || *src == '#' || *src == '\0') {
 284             urlpath[0] = '/';
 285             dst = &urlpath[1];
 286         } else {
 287             dst = urlpath;
 288         }
 289         /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
 290         for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
 291             *dst = *src;
 292         }
 293
 294         /* We -could- be at the end of the buffer here */
 295         if (i > l)
 296             return NULL;
 297         /* If the URL path is empty we set it to be "/" */
 298         if (dst == urlpath) {
 299             *dst = '/';
 300             ++dst;
 301         }
 302         *dst = '\0';
 303
 304         protocol = urlParseProtocol(proto);
 305         port = urlDefaultPort(protocol);
 306
 307         /* Is there any login information? (we should eventually parse it above) */
 308         t = strrchr(host, '@');
 309         if (t != NULL) {
 310             strncpy((char *) login, (char *) host, sizeof(login)-1);
 311             login[sizeof(login)-1] = '\0';
 312             t = strrchr(login, '@');
 313             *t = 0;
 314             strncpy((char *) host, t + 1, sizeof(host)-1);
 315             host[sizeof(host)-1] = '\0';
 316         }
 317
 318         /* Is there any host information? (we should eventually parse it above) */
 319         if (*host == '[') {
 320             /* strip any IPA brackets. valid under IPv6. */
 321             dst = host;
 322             /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
 323             src = host;
 324             ++src;
 325             l = strlen(host);
 326             i = 1;
 327             for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
 328                 *dst = *src;
 329             }
 330
 331             /* we moved in-place, so truncate the actual hostname found */
 332             *dst = '\0';
 333             ++dst;
 334
 335             /* skip ahead to either start of port, or original EOS */
 336             while (*dst != '\0' && *dst != ':')
 337                 ++dst;
 338             t = dst;
 339         } else {
 340             t = strrchr(host, ':');
 341
 342             if (t != strchr(host,':') ) {
 343                 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
 344                 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
 345                 /* therefore we MUST accept the case where they are not bracketed at all. */
 346                 t = NULL;
 347             }
 348         }
 349
 350         // Bug 3183 sanity check: If scheme is present, host must be too.
 351         if (protocol != AnyP::PROTO_NONE && host[0] == '\0') {
 352             debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
 353             return NULL;
 354         }
 355
 356         if (t && *t == ':') {
 357             *t = '\0';
 358             ++t;
 359             port = atoi(t);
 360         }
 361     }
 362
 363     for (t = host; *t; ++t)
 364         *t = xtolower(*t);
 365
 366     if (stringHasWhitespace(host)) {
 367         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 368             t = q = host;
 369             while (*t) {
 370                 if (!xisspace(*t)) {
 371                     *q = *t;
 372                     ++q;
 373                 }
 374                 ++t;
 375             }
 376             *q = '\0';
 377         }
 378     }
 379
 380     debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
 381
 382     if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
 383         debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
 384         return NULL;
 385     }
 386
 387     /* For IPV6 addresses also check for a colon */
 388     if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
 389         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 390
 391     /* remove trailing dots from hostnames */
 392     while ((l = strlen(host)) > 0 && host[--l] == '.')
 393         host[l] = '\0';
 394
 395     /* reject duplicate or leading dots */
 396     if (strstr(host, "..") || *host == '.') {
 397         debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
 398         return NULL;
 399     }
 400
 401     if (port < 1 || port > 65535) {
 402         debugs(23, 3, "urlParse: Invalid port '" << port << "'");
 403         return NULL;
 404     }
 405
 406 #if HARDCODE_DENY_PORTS
 407     /* These ports are filtered in the default squid.conf, but
 408      * maybe someone wants them hardcoded... */
 409     if (port == 7 || port == 9 || port == 19) {
 410         debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
 411         return NULL;
 412     }
 413 #endif
 414
 415     if (stringHasWhitespace(urlpath)) {
 416         debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
 417
 418         switch (Config.uri_whitespace) {
 419
 420         case URI_WHITESPACE_DENY:
 421             return NULL;
 422
 423         case URI_WHITESPACE_ALLOW:
 424             break;
 425
 426         case URI_WHITESPACE_ENCODE:
 427             t = rfc1738_escape_unescaped(urlpath);
 428             xstrncpy(urlpath, t, MAX_URL);
 429             break;
 430
 431         case URI_WHITESPACE_CHOP:
 432             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 433             break;
 434
 435         case URI_WHITESPACE_STRIP:
 436         default:
 437             t = q = urlpath;
 438             while (*t) {
 439                 if (!xisspace(*t)) {
 440                     *q = *t;
 441                     ++q;
 442                 }
 443                 ++t;
 444             }
 445             *q = '\0';
 446         }
 447     }
 448
 449     return urlParseFinish(method, protocol, urlpath, host, login, port, request);
 450 }
 451
 452 /**
 453  * Update request with parsed URI data.  If the request arg is
 454  * non-NULL, put parsed values there instead of allocating a new
 455  * HttpRequest.
 456  */
 457 static HttpRequest *
 458 urlParseFinish(const HttpRequestMethod& method,
 459                const AnyP::ProtocolType protocol,
 460                const char *const urlpath,
 461                const char *const host,
 462                const char *const login,
 463                const int port,
 464                HttpRequest *request)
 465 {
 466     if (NULL == request)
 467         request = new HttpRequest(method, protocol, urlpath);
 468     else {
 469         request->initHTTP(method, protocol, urlpath);
 470         safe_free(request->canonical);
 471     }
 472
 473     request->SetHost(host);
 474     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 475     request->port = (unsigned short) port;
 476     return request;
 477 }
 478
 479 static HttpRequest *
 480 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
 481 {
 482     debugs(50, 5, "urnParse: " << urn);
 483     if (request) {
 484         request->initHTTP(method, AnyP::PROTO_URN, urn + 4);
 485         safe_free(request->canonical);
 486         return request;
 487     }
 488
 489     return new HttpRequest(method, AnyP::PROTO_URN, urn + 4);
 490 }
 491
 492 const char *
 493 urlCanonical(HttpRequest * request)
 494 {
 495     LOCAL_ARRAY(char, portbuf, 32);
 496     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 497
 498     if (request->canonical)
 499         return request->canonical;
 500
 501     if (request->url.getScheme() == AnyP::PROTO_URN) {
 502         snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
 503                  SQUIDSTRINGPRINT(request->urlpath));
 504     } else {
 505         switch (request->method.id()) {
 506
 507         case Http::METHOD_CONNECT:
 508             snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port);
 509             break;
 510
 511         default:
 512             {
 513                 portbuf[0] = '\0';
 514
 515                 if (request->port != urlDefaultPort(request->url.getScheme()))
 516                     snprintf(portbuf, 32, ":%d", request->port);
 517
 518                 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s" SQUIDSTRINGPH,
 519                          request->url.getScheme().c_str(),
 520                          request->login,
 521                          *request->login ? "@" : null_string,
 522                          request->GetHost(),
 523                          portbuf,
 524                          SQUIDSTRINGPRINT(request->urlpath));
 525             }
 526         }
 527     }
 528
 529     return (request->canonical = xstrdup(urlbuf));
 530 }
 531
 532 /** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string.
 533  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 534  *        and never copy the query-string part in the first place
 535  */
 536 char *
 537 urlCanonicalClean(const HttpRequest * request)
 538 {
 539     LOCAL_ARRAY(char, buf, MAX_URL);
 540     LOCAL_ARRAY(char, portbuf, 32);
 541     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 542     char *t;
 543
 544     if (request->url.getScheme() == AnyP::PROTO_URN) {
 545         snprintf(buf, MAX_URL, "urn:" SQUIDSTRINGPH,
 546                  SQUIDSTRINGPRINT(request->urlpath));
 547     } else {
 548         switch (request->method.id()) {
 549
 550         case Http::METHOD_CONNECT:
 551             snprintf(buf, MAX_URL, "%s:%d", request->GetHost(), request->port);
 552             break;
 553
 554         default:
 555             {
 556                 portbuf[0] = '\0';
 557
 558                 if (request->port != urlDefaultPort(request->url.getScheme()))
 559                     snprintf(portbuf, 32, ":%d", request->port);
 560
 561                 loginbuf[0] = '\0';
 562
 563                 if ((int) strlen(request->login) > 0) {
 564                     strcpy(loginbuf, request->login);
 565
 566                     if ((t = strchr(loginbuf, ':')))
 567                         *t = '\0';
 568
 569                     strcat(loginbuf, "@");
 570                 }
 571
 572                 snprintf(buf, MAX_URL, "%s://%s%s%s" SQUIDSTRINGPH,
 573                          request->url.getScheme().c_str(),
 574                          loginbuf,
 575                          request->GetHost(),
 576                          portbuf,
 577                          SQUIDSTRINGPRINT(request->urlpath));
 578
 579                 // strip arguments AFTER a question-mark
 580                 if (Config.onoff.strip_query_terms)
 581                     if ((t = strchr(buf, '?')))
 582                         *(++t) = '\0';
 583             }
 584         }
 585     }
 586
 587     if (stringHasCntl(buf))
 588         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 589
 590     return buf;
 591 }
 592
 593 /**
 594  * Yet another alternative to urlCanonical.
 595  * This one adds the https:// parts to Http::METHOD_CONNECT URL
 596  * for use in error page outputs.
 597  * Luckily we can leverage the others instead of duplicating.
 598  */
 599 const char *
 600 urlCanonicalFakeHttps(const HttpRequest * request)
 601 {
 602     LOCAL_ARRAY(char, buf, MAX_URL);
 603
 604     // method CONNECT and port HTTPS
 605     if (request->method == Http::METHOD_CONNECT && request->port == 443) {
 606         snprintf(buf, MAX_URL, "https://%s/*", request->GetHost());
 607         return buf;
 608     }
 609
 610     // else do the normal complete canonical thing.
 611     return urlCanonicalClean(request);
 612 }
 613
 614 /*
 615  * Test if a URL is relative.
 616  *
 617  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 618  * appear before a ':'.
 619  */
 620 bool
 621 urlIsRelative(const char *url)
 622 {
 623     const char *p;
 624
 625     if (url == NULL) {
 626         return (false);
 627     }
 628     if (*url == '\0') {
 629         return (false);
 630     }
 631
 632     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 633
 634     if (*p == ':') {
 635         return (false);
 636     }
 637     return (true);
 638 }
 639
 640 /*
 641  * Convert a relative URL to an absolute URL using the context of a given
 642  * request.
 643  *
 644  * It is assumed that you have already ensured that the URL is relative.
 645  *
 646  * If NULL is returned it is an indication that the method in use in the
 647  * request does not distinguish between relative and absolute and you should
 648  * use the url unchanged.
 649  *
 650  * If non-NULL is returned, it is up to the caller to free the resulting
 651  * memory using safe_free().
 652  */
 653 char *
 654 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 655 {
 656
 657     if (req->method.id() == Http::METHOD_CONNECT) {
 658         return (NULL);
 659     }
 660
 661     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 662
 663     if (req->url.getScheme() == AnyP::PROTO_URN) {
 664         snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
 665                  SQUIDSTRINGPRINT(req->urlpath));
 666         return (urlbuf);
 667     }
 668
 669     size_t urllen;
 670
 671     if (req->port != urlDefaultPort(req->url.getScheme())) {
 672         urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s:%d",
 673                           req->url.getScheme().c_str(),
 674                           req->login,
 675                           *req->login ? "@" : null_string,
 676                           req->GetHost(),
 677                           req->port
 678                          );
 679     } else {
 680         urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s",
 681                           req->url.getScheme().c_str(),
 682                           req->login,
 683                           *req->login ? "@" : null_string,
 684                           req->GetHost()
 685                          );
 686     }
 687
 688     if (relUrl[0] == '/') {
 689         strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 690     } else {
 691         const char *path = req->urlpath.termedBuf();
 692         const char *last_slash = strrchr(path, '/');
 693
 694         if (last_slash == NULL) {
 695             urlbuf[urllen] = '/';
 696             ++urllen;
 697             strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 698         } else {
 699             ++last_slash;
 700             size_t pathlen = last_slash - path;
 701             if (pathlen > MAX_URL - urllen - 1) {
 702                 pathlen = MAX_URL - urllen - 1;
 703             }
 704             strncpy(&urlbuf[urllen], path, pathlen);
 705             urllen += pathlen;
 706             if (urllen + 1 < MAX_URL) {
 707                 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 708             }
 709         }
 710     }
 711
 712     return (urlbuf);
 713 }
 714
 715 /*
 716  * matchDomainName() compares a hostname with a domainname according
 717  * to the following rules:
 718  *
 719  *    HOST          DOMAIN        MATCH?
 720  * ------------- -------------    ------
 721  *    foo.com       foo.com         YES
 722  *   .foo.com       foo.com         YES
 723  *  x.foo.com       foo.com          NO
 724  *    foo.com      .foo.com         YES
 725  *   .foo.com      .foo.com         YES
 726  *  x.foo.com      .foo.com         YES
 727  *
 728  *  We strip leading dots on hosts (but not domains!) so that
 729  *  ".foo.com" is is always the same as "foo.com".
 730  *
 731  *  Return values:
 732  *     0 means the host matches the domain
 733  *     1 means the host is greater than the domain
 734  *    -1 means the host is less than the domain
 735  */
 736
 737 int
 738 matchDomainName(const char *h, const char *d)
 739 {
 740     int dl;
 741     int hl;
 742
 743     while ('.' == *h)
 744         ++h;
 745
 746     hl = strlen(h);
 747
 748     dl = strlen(d);
 749
 750     /*
 751      * Start at the ends of the two strings and work towards the
 752      * beginning.
 753      */
 754     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 755         if (hl == 0 && dl == 0) {
 756             /*
 757              * We made it all the way to the beginning of both
 758              * strings without finding any difference.
 759              */
 760             return 0;
 761         }
 762
 763         if (0 == hl) {
 764             /*
 765              * The host string is shorter than the domain string.
 766              * There is only one case when this can be a match.
 767              * If the domain is just one character longer, and if
 768              * that character is a leading '.' then we call it a
 769              * match.
 770              */
 771
 772             if (1 == dl && '.' == d[0])
 773                 return 0;
 774             else
 775                 return -1;
 776         }
 777
 778         if (0 == dl) {
 779             /*
 780              * The domain string is shorter than the host string.
 781              * This is a match only if the first domain character
 782              * is a leading '.'.
 783              */
 784
 785             if ('.' == d[0])
 786                 return 0;
 787             else
 788                 return 1;
 789         }
 790     }
 791
 792     /*
 793      * We found different characters in the same position (from the end).
 794      */
 795     /*
 796      * If one of those character is '.' then its special.  In order
 797      * for splay tree sorting to work properly, "x-foo.com" must
 798      * be greater than ".foo.com" even though '-' is less than '.'.
 799      */
 800     if ('.' == d[dl])
 801         return 1;
 802
 803     if ('.' == h[hl])
 804         return -1;
 805
 806     return (xtolower(h[hl]) - xtolower(d[dl]));
 807 }
 808
 809 /*
 810  * return true if we can serve requests for this method.
 811  */
 812 int
 813 urlCheckRequest(const HttpRequest * r)
 814 {
 815     int rc = 0;
 816     /* protocol "independent" methods
 817      *
 818      * actually these methods are specific to HTTP:
 819      * they are methods we recieve on our HTTP port,
 820      * and if we had a FTP listener would not be relevant
 821      * there.
 822      *
 823      * So, we should delegate them to HTTP. The problem is that we
 824      * do not have a default protocol from the client side of HTTP.
 825      */
 826
 827     if (r->method == Http::METHOD_CONNECT)
 828         return 1;
 829
 830     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 831     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 832     if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
 833         return (r->header.getInt64(HDR_MAX_FORWARDS) == 0 || r->urlpath != "*");
 834
 835     if (r->method == Http::METHOD_PURGE)
 836         return 1;
 837
 838     /* does method match the protocol? */
 839     switch (r->url.getScheme()) {
 840
 841     case AnyP::PROTO_URN:
 842
 843     case AnyP::PROTO_HTTP:
 844
 845     case AnyP::PROTO_CACHE_OBJECT:
 846         rc = 1;
 847         break;
 848
 849     case AnyP::PROTO_FTP:
 850
 851         if (r->method == Http::METHOD_PUT)
 852             rc = 1;
 853
 854     case AnyP::PROTO_GOPHER:
 855
 856     case AnyP::PROTO_WAIS:
 857
 858     case AnyP::PROTO_WHOIS:
 859         if (r->method == Http::METHOD_GET)
 860             rc = 1;
 861         else if (r->method == Http::METHOD_HEAD)
 862             rc = 1;
 863
 864         break;
 865
 866     case AnyP::PROTO_HTTPS:
 867 #if USE_OPENSSL
 868
 869         rc = 1;
 870
 871         break;
 872
 873 #else
 874         /*
 875         * Squid can't originate an SSL connection, so it should
 876         * never receive an "https:" URL.  It should always be
 877         * CONNECT instead.
 878         */
 879         rc = 0;
 880
 881 #endif
 882
 883     default:
 884         break;
 885     }
 886
 887     return rc;
 888 }
 889
 890 /*
 891  * Quick-n-dirty host extraction from a URL.  Steps:
 892  *      Look for a colon
 893  *      Skip any '/' after the colon
 894  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 895  *      Look for an ending '/' or ':' and terminate
 896  *      Look for login info preceeded by '@'
 897  */
 898
 899 class URLHostName
 900 {
 901
 902 public:
 903     char * extract(char const *url);
 904
 905 private:
 906     static char Host [SQUIDHOSTNAMELEN];
 907     void init(char const *);
 908     void findHostStart();
 909     void trimTrailingChars();
 910     void trimAuth();
 911     char const *hostStart;
 912     char const *url;
 913 };
 914
 915 char *
 916 urlHostname(const char *url)
 917 {
 918     return URLHostName().extract(url);
 919 }
 920
 921 char URLHostName::Host[SQUIDHOSTNAMELEN];
 922
 923 void
 924 URLHostName::init(char const *aUrl)
 925 {
 926     Host[0] = '\0';
 927     url = aUrl;
 928 }
 929
 930 void
 931 URLHostName::findHostStart()
 932 {
 933     if (NULL == (hostStart = strchr(url, ':')))
 934         return;
 935
 936     ++hostStart;
 937
 938     while (*hostStart != '\0' && *hostStart == '/')
 939         ++hostStart;
 940
 941     if (*hostStart == ']')
 942         ++hostStart;
 943 }
 944
 945 void
 946 URLHostName::trimTrailingChars()
 947 {
 948     char *t;
 949
 950     if ((t = strchr(Host, '/')))
 951         *t = '\0';
 952
 953     if ((t = strrchr(Host, ':')))
 954         *t = '\0';
 955
 956     if ((t = strchr(Host, ']')))
 957         *t = '\0';
 958 }
 959
 960 void
 961 URLHostName::trimAuth()
 962 {
 963     char *t;
 964
 965     if ((t = strrchr(Host, '@'))) {
 966         ++t;
 967         memmove(Host, t, strlen(t) + 1);
 968     }
 969 }
 970
 971 char *
 972 URLHostName::extract(char const *aUrl)
 973 {
 974     init(aUrl);
 975     findHostStart();
 976
 977     if (hostStart == NULL)
 978         return NULL;
 979
 980     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 981
 982     trimTrailingChars();
 983
 984     trimAuth();
 985
 986     return Host;
 987 }