src/url.cc

   1
   2 /*
   3  * DEBUG: section 23    URL Parsing
   4  * AUTHOR: Duane Wessels
   5  *
   6  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   7  * ----------------------------------------------------------
   8  *
   9  *  Squid is the result of efforts by numerous individuals from
  10  *  the Internet community; see the CONTRIBUTORS file for full
  11  *  details.   Many organizations have provided support for Squid's
  12  *  development; see the SPONSORS file for full details.  Squid is
  13  *  Copyrighted (C) 2001 by the Regents of the University of
  14  *  California; see the COPYRIGHT file for full details.  Squid
  15  *  incorporates software developed and/or copyrighted by other
  16  *  sources; see the CREDITS file for full details.
  17  *
  18  *  This program is free software; you can redistribute it and/or modify
  19  *  it under the terms of the GNU General Public License as published by
  20  *  the Free Software Foundation; either version 2 of the License, or
  21  *  (at your option) any later version.
  22  *
  23  *  This program is distributed in the hope that it will be useful,
  24  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  25  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  26  *  GNU General Public License for more details.
  27  *
  28  *  You should have received a copy of the GNU General Public License
  29  *  along with this program; if not, write to the Free Software
  30  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  31  *
  32  */
  33
  34 #include "squid.h"
  35 #include "globals.h"
  36 #include "HttpRequest.h"
  37 #include "rfc1738.h"
  38 #include "SquidString.h"
  39 #include "URL.h"
  40 #include "URLScheme.h"
  41
/*
 * Forward declarations of the file-local helpers used by urlParse().
 * Both are defined further down in this file.
 */

/// Stores the already-split URL components in \p request, or in a newly
/// allocated HttpRequest when \p request is NULL, and returns that request.
static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
                                   const AnyP::ProtocolType protocol,
                                   const char *const urlpath,
                                   const char *const host,
                                   const char *const login,
                                   const int port,
                                   HttpRequest *request);
/// Builds (or re-initializes \p request as) a PROTO_URN request from a "urn:" URI.
static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
  50 static const char valid_hostname_chars_u[] =
  51     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  52     "abcdefghijklmnopqrstuvwxyz"
  53     "0123456789-._"
  54     "[:]"
  55     ;
  56 static const char valid_hostname_chars[] =
  57     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  58     "abcdefghijklmnopqrstuvwxyz"
  59     "0123456789-."
  60     "[:]"
  61     ;
  62
  63 void
  64 urlInitialize(void)
  65 {
  66     debugs(23, 5, "urlInitialize: Initializing...");
  67     /* this ensures that the number of protocol strings is the same as
  68      * the enum slots allocated because the last enum is always 'MAX'.
  69      */
  70     assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
  71     /*
  72      * These test that our matchDomainName() function works the
  73      * way we expect it to.
  74      */
  75     assert(0 == matchDomainName("foo.com", "foo.com"));
  76     assert(0 == matchDomainName(".foo.com", "foo.com"));
  77     assert(0 == matchDomainName("foo.com", ".foo.com"));
  78     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  79     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
  80     assert(0 != matchDomainName("x.foo.com", "foo.com"));
  81     assert(0 != matchDomainName("foo.com", "x.foo.com"));
  82     assert(0 != matchDomainName("bar.com", "foo.com"));
  83     assert(0 != matchDomainName(".bar.com", "foo.com"));
  84     assert(0 != matchDomainName(".bar.com", ".foo.com"));
  85     assert(0 != matchDomainName("bar.com", ".foo.com"));
  86     assert(0 < matchDomainName("zzz.com", "foo.com"));
  87     assert(0 > matchDomainName("aaa.com", "foo.com"));
  88     assert(0 == matchDomainName("FOO.com", "foo.COM"));
  89     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
  90     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
  91     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
  92     /* more cases? */
  93 }
  94
  95 /**
  96  * urlParseProtocol() takes begin (b) and end (e) pointers, but for
  97  * backwards compatibility, e defaults to NULL, in which case we
  98  * assume b is NULL-terminated.
  99  */
 100 AnyP::ProtocolType
 101 urlParseProtocol(const char *b, const char *e)
 102 {
 103     /*
 104      * if e is NULL, b must be NULL terminated and we
 105      * make e point to the first whitespace character
 106      * after b.
 107      */
 108
 109     if (NULL == e)
 110         e = b + strcspn(b, ":");
 111
 112     int len = e - b;
 113
 114     /* test common stuff first */
 115
 116     if (strncasecmp(b, "http", len) == 0)
 117         return AnyP::PROTO_HTTP;
 118
 119     if (strncasecmp(b, "ftp", len) == 0)
 120         return AnyP::PROTO_FTP;
 121
 122     if (strncasecmp(b, "https", len) == 0)
 123         return AnyP::PROTO_HTTPS;
 124
 125     if (strncasecmp(b, "file", len) == 0)
 126         return AnyP::PROTO_FTP;
 127
 128     if (strncasecmp(b, "coap", len) == 0)
 129         return AnyP::PROTO_COAP;
 130
 131     if (strncasecmp(b, "coaps", len) == 0)
 132         return AnyP::PROTO_COAPS;
 133
 134     if (strncasecmp(b, "gopher", len) == 0)
 135         return AnyP::PROTO_GOPHER;
 136
 137     if (strncasecmp(b, "wais", len) == 0)
 138         return AnyP::PROTO_WAIS;
 139
 140     if (strncasecmp(b, "cache_object", len) == 0)
 141         return AnyP::PROTO_CACHE_OBJECT;
 142
 143     if (strncasecmp(b, "urn", len) == 0)
 144         return AnyP::PROTO_URN;
 145
 146     if (strncasecmp(b, "whois", len) == 0)
 147         return AnyP::PROTO_WHOIS;
 148
 149     if (strncasecmp(b, "internal", len) == 0)
 150         return AnyP::PROTO_INTERNAL;
 151
 152     return AnyP::PROTO_NONE;
 153 }
 154
 155 int
 156 urlDefaultPort(AnyP::ProtocolType p)
 157 {
 158     switch (p) {
 159
 160     case AnyP::PROTO_HTTP:
 161         return 80;
 162
 163     case AnyP::PROTO_HTTPS:
 164         return 443;
 165
 166     case AnyP::PROTO_FTP:
 167         return 21;
 168
 169     case AnyP::PROTO_COAP:
 170     case AnyP::PROTO_COAPS:
 171         // coaps:// default is TBA as of draft-ietf-core-coap-08.
 172         // Assuming IANA policy of allocating same port for base and TLS protocol versions will occur.
 173         return 5683;
 174
 175     case AnyP::PROTO_GOPHER:
 176         return 70;
 177
 178     case AnyP::PROTO_WAIS:
 179         return 210;
 180
 181     case AnyP::PROTO_CACHE_OBJECT:
 182
 183     case AnyP::PROTO_INTERNAL:
 184         return CACHE_HTTP_PORT;
 185
 186     case AnyP::PROTO_WHOIS:
 187         return 43;
 188
 189     default:
 190         return 0;
 191     }
 192 }
 193
/*
 * Parse a URI/URL.
 *
 * If the 'request' arg is non-NULL, put parsed values there instead
 * of allocating a new HttpRequest.
 *
 * This abuses HttpRequest as a way of representing the parsed URL
 * and its components.
 * method is used to switch parsers and to init the HttpRequest.
 * If method is METHOD_CONNECT, then a hostname:port is looked for
 * rather than a URL.
 * The url is non-const so that, if it is too long, we can NUL-terminate
 * it in place.
 *
 * It is assumed that the URL is complete - i.e. the end of the string is
 * the end of the URL. Don't pass a partial URL here, as this routine has
 * no way of knowing whether it is partial or not (it treats a missing
 * trailing slash as "end of host with implied path of /").
 *
 * Returns the filled-in request, or NULL when the URL is rejected
 * (too long, malformed, illegal hostname, invalid port, or whitespace
 * in the path while that is configured to be denied).
 */
HttpRequest *
urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
{
    LOCAL_ARRAY(char, proto, MAX_URL);
    LOCAL_ARRAY(char, login, MAX_URL);
    LOCAL_ARRAY(char, host, MAX_URL);
    LOCAL_ARRAY(char, urlpath, MAX_URL);
    char *t = NULL;
    char *q = NULL;
    int port;
    AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
    int l;
    int i;
    const char *src;
    char *dst;
    proto[0] = host[0] = urlpath[0] = login[0] = '\0';

    /* the URL plus a possibly appended default domain must fit in MAX_URL */
    if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
        /* terminate so it doesn't overflow other buffers */
        *(url + (MAX_URL >> 1)) = '\0';
        debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
        return NULL;
    }
    if (method == METHOD_CONNECT) {
        /* CONNECT carries "host:port" (or "[ipv6]:port"), not a full URL */
        port = CONNECT_PORT;

        /* try the bracketed form first, then plain host:port; the port is
         * optional in both (sscanf only has to convert the host) */
        if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
            if (sscanf(url, "%[^:]:%d", host, &port) < 1)
                return NULL;

    } else if ((method == METHOD_OPTIONS || method == METHOD_TRACE) &&
               strcmp(url, "*") == 0) {
        /* "OPTIONS *" / "TRACE *": no host, the path is the literal "*" */
        protocol = AnyP::PROTO_HTTP;
        port = urlDefaultPort(protocol);
        return urlParseFinish(method, protocol, url, host, login, port, request);
    } else if (!strncmp(url, "urn:", 4)) {
        /* note: this prefix test is case-sensitive */
        return urnParse(method, url, request);
    } else {
        /* Parse the URL: */
        src = url;
        i = 0;
        /* Find first : - everything before is protocol */
        for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
            *dst = *src;
        }
        /* no ':' anywhere in the string */
        if (i >= l)
            return NULL;
        *dst = '\0';

        /* Then it must be followed by :// */
        if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
            return NULL;
        i += 3;
        src += 3;

        /* Then everything until first /; that's host (and port; which we'll look for here later) */
        // bug 1881: If we don't get a "/" then we imply it was there
        // bug 3074: We could just be given a "?" or "#". These also imply "/"
        // bug 3233: whitespace is also a hostname delimiter.
        for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
            *dst = *src;
        }

        /*
         * We can't check for "i >= l" here because we could be at the end of the line
         * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
         * been -given- a valid URL and the path is just '/'.
         */
        if (i > l)
            return NULL;
        *dst = '\0';

        // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
        if (*src == '?' || *src == '#' || *src == '\0') {
            urlpath[0] = '/';
            dst = &urlpath[1];
        } else {
            dst = urlpath;
        }
        /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
        for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
            *dst = *src;
        }

        /* We -could- be at the end of the buffer here */
        if (i > l)
            return NULL;
        /* If the URL path is empty we set it to be "/" */
        if (dst == urlpath) {
            *dst = '/';
            ++dst;
        }
        *dst = '\0';

        protocol = urlParseProtocol(proto);
        port = urlDefaultPort(protocol);

        /* Is there any login information? (we should eventually parse it above) */
        /* everything before the LAST '@' becomes login, the rest stays in host */
        if ((t = strrchr(host, '@'))) {
            strcpy((char *) login, (char *) host);
            t = strrchr(login, '@');
            *t = 0;
            strcpy((char *) host, t + 1);
        }

        /* Is there any host information? (we should eventually parse it above) */
        if (*host == '[') {
            /* strip any IPA brackets. valid under IPv6. */
            dst = host;
            /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
            src = host;
            ++src;
            l = strlen(host);
            i = 1;
            /* shift the bracketed address one byte left, over the '[' */
            for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* we moved in-place, so truncate the actual hostname found */
            *dst = '\0';
            ++dst;

            /* skip ahead to either start of port, or original EOS */
            while (*dst != '\0' && *dst != ':')
                ++dst;
            t = dst;
        } else {
            t = strrchr(host, ':');

            /* more than one ':' means this is not a host:port separator */
            if (t != strchr(host,':') ) {
                /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                /* therefore we MUST accept the case where they are not bracketed at all. */
                t = NULL;
            }
        }

        // Bug 3183 sanity check: If scheme is present, host must be too.
        // NOTE(review): host is declared with LOCAL_ARRAY above, so the
        // "host == NULL" half presumably can never be true - confirm.
        if (protocol != AnyP::PROTO_NONE && (host == NULL || *host == '\0')) {
            debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
            return NULL;
        }

        /* split off an explicit port; it overrides the scheme default.
         * atoi() yields 0 for empty or non-numeric text, which the range
         * check further down rejects. */
        if (t && *t == ':') {
            *t = '\0';
            ++t;
            port = atoi(t);
        }
    }

    /* hostnames are stored in lower case */
    for (t = host; *t; ++t)
        *t = xtolower(*t);

    /* whitespace inside the host is only removed in STRIP mode */
    if (stringHasWhitespace(host)) {
        if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
            t = q = host;
            while (*t) {
                if (!xisspace(*t)) {
                    *q = *t;
                    ++q;
                }
                ++t;
            }
            *q = '\0';
        }
    }

    debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");

    /* optional strict check: every host character must be in the allowed set */
    if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
        debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
        return NULL;
    }

    /* For IPV6 addresses also check for a colon */
    /* i.e. the default domain is only appended to single-label names */
    // NOTE(review): the bound uses SQUIDHOSTNAMELEN although host is
    // declared with MAX_URL; the subtraction would wrap if host were
    // already that long - confirm intended limit.
    if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);

    /* remove trailing dots from hostnames */
    while ((l = strlen(host)) > 0 && host[--l] == '.')
        host[l] = '\0';

    /* reject duplicate or leading dots */
    if (strstr(host, "..") || *host == '.') {
        debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
        return NULL;
    }

    if (port < 1 || port > 65535) {
        debugs(23, 3, "urlParse: Invalid port '" << port << "'");
        return NULL;
    }

#if HARDCODE_DENY_PORTS
    /* These ports are filtered in the default squid.conf, but
     * maybe someone wants them hardcoded... */
    if (port == 7 || port == 9 || port == 19) {
        debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
        return NULL;
    }
#endif

    /* apply the configured policy for whitespace found in the path */
    if (stringHasWhitespace(urlpath)) {
        debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");

        switch (Config.uri_whitespace) {

        case URI_WHITESPACE_DENY:
            return NULL;

        case URI_WHITESPACE_ALLOW:
            break;

        case URI_WHITESPACE_ENCODE:
            /* replace the path with its escaped form */
            t = rfc1738_escape_unescaped(urlpath);
            xstrncpy(urlpath, t, MAX_URL);
            break;

        case URI_WHITESPACE_CHOP:
            /* cut the path at the first whitespace character */
            *(urlpath + strcspn(urlpath, w_space)) = '\0';
            break;

        case URI_WHITESPACE_STRIP:
        default:
            /* squeeze all whitespace characters out of the path in place */
            t = q = urlpath;
            while (*t) {
                if (!xisspace(*t)) {
                    *q = *t;
                    ++q;
                }
                ++t;
            }
            *q = '\0';
        }
    }

    return urlParseFinish(method, protocol, urlpath, host, login, port, request);
}
 453
 454 /**
 455  * Update request with parsed URI data.  If the request arg is
 456  * non-NULL, put parsed values there instead of allocating a new
 457  * HttpRequest.
 458  */
 459 static HttpRequest *
 460 urlParseFinish(const HttpRequestMethod& method,
 461                const AnyP::ProtocolType protocol,
 462                const char *const urlpath,
 463                const char *const host,
 464                const char *const login,
 465                const int port,
 466                HttpRequest *request)
 467 {
 468     if (NULL == request)
 469         request = new HttpRequest(method, protocol, urlpath);
 470     else {
 471         request->initHTTP(method, protocol, urlpath);
 472         safe_free(request->canonical);
 473     }
 474
 475     request->SetHost(host);
 476     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 477     request->port = (unsigned short) port;
 478     return request;
 479 }
 480
 481 static HttpRequest *
 482 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
 483 {
 484     debugs(50, 5, "urnParse: " << urn);
 485     if (request) {
 486         request->initHTTP(method, AnyP::PROTO_URN, urn + 4);
 487         safe_free(request->canonical);
 488         return request;
 489     }
 490
 491     return new HttpRequest(method, AnyP::PROTO_URN, urn + 4);
 492 }
 493
 494 const char *
 495 urlCanonical(HttpRequest * request)
 496 {
 497     LOCAL_ARRAY(char, portbuf, 32);
 498 /// \todo AYJ: Performance: making this a ptr and allocating when needed will be better than a write and future xstrdup().
 499     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 500
 501     if (request->canonical)
 502         return request->canonical;
 503
 504     if (request->protocol == AnyP::PROTO_URN) {
 505         snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
 506                  SQUIDSTRINGPRINT(request->urlpath));
 507     } else {
 508 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
 509         switch (request->method.id()) {
 510
 511         case METHOD_CONNECT:
 512             snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port);
 513             break;
 514
 515         default:
 516             portbuf[0] = '\0';
 517
 518             if (request->port != urlDefaultPort(request->protocol))
 519                 snprintf(portbuf, 32, ":%d", request->port);
 520
 521             const URLScheme sch = request->protocol; // temporary, until bug 1961 URL handling is fixed.
 522             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s" SQUIDSTRINGPH,
 523                      sch.const_str(),
 524                      request->login,
 525                      *request->login ? "@" : null_string,
 526                      request->GetHost(),
 527                      portbuf,
 528                      SQUIDSTRINGPRINT(request->urlpath));
 529
 530             break;
 531         }
 532     }
 533
 534     return (request->canonical = xstrdup(urlbuf));
 535 }
 536
 537 /** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string.
 538  *        After copying it on in the first place! Would be less code to merge the two with a flag parameter.
 539  *        and never copy the query-string part in the first place
 540  */
 541 char *
 542 urlCanonicalClean(const HttpRequest * request)
 543 {
 544     LOCAL_ARRAY(char, buf, MAX_URL);
 545     LOCAL_ARRAY(char, portbuf, 32);
 546     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 547     char *t;
 548
 549     if (request->protocol == AnyP::PROTO_URN) {
 550         snprintf(buf, MAX_URL, "urn:" SQUIDSTRINGPH,
 551                  SQUIDSTRINGPRINT(request->urlpath));
 552     } else {
 553 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
 554         switch (request->method.id()) {
 555
 556         case METHOD_CONNECT:
 557             snprintf(buf, MAX_URL, "%s:%d",
 558                      request->GetHost(),
 559                      request->port);
 560             break;
 561
 562         default:
 563             portbuf[0] = '\0';
 564
 565             if (request->port != urlDefaultPort(request->protocol))
 566                 snprintf(portbuf, 32, ":%d", request->port);
 567
 568             loginbuf[0] = '\0';
 569
 570             if ((int) strlen(request->login) > 0) {
 571                 strcpy(loginbuf, request->login);
 572
 573                 if ((t = strchr(loginbuf, ':')))
 574                     *t = '\0';
 575
 576                 strcat(loginbuf, "@");
 577             }
 578
 579             const URLScheme sch = request->protocol; // temporary, until bug 1961 URL handling is fixed.
 580             snprintf(buf, MAX_URL, "%s://%s%s%s" SQUIDSTRINGPH,
 581                      sch.const_str(),
 582                      loginbuf,
 583                      request->GetHost(),
 584                      portbuf,
 585                      SQUIDSTRINGPRINT(request->urlpath));
 586             /*
 587              * strip arguments AFTER a question-mark
 588              */
 589
 590             if (Config.onoff.strip_query_terms)
 591                 if ((t = strchr(buf, '?'))) {
 592                     ++t;
 593                     *t = '\0';
 594                 }
 595
 596             break;
 597         }
 598     }
 599
 600     if (stringHasCntl(buf))
 601         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 602
 603     return buf;
 604 }
 605
 606 /**
 607  * Yet another alternative to urlCanonical.
 608  * This one addes the https:// parts to METHOD_CONNECT URL
 609  * for use in error page outputs.
 610  * Luckily we can leverage the others instead of duplicating.
 611  */
 612 const char *
 613 urlCanonicalFakeHttps(const HttpRequest * request)
 614 {
 615     LOCAL_ARRAY(char, buf, MAX_URL);
 616
 617     // method CONNECT and port HTTPS
 618     if (request->method == METHOD_CONNECT && request->port == 443) {
 619         snprintf(buf, MAX_URL, "https://%s/*", request->GetHost());
 620         return buf;
 621     }
 622
 623     // else do the normal complete canonical thing.
 624     return urlCanonicalClean(request);
 625 }
 626
 627 /*
 628  * Test if a URL is relative.
 629  *
 630  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 631  * appear before a ':'.
 632  */
 633 bool
 634 urlIsRelative(const char *url)
 635 {
 636     const char *p;
 637
 638     if (url == NULL) {
 639         return (false);
 640     }
 641     if (*url == '\0') {
 642         return (false);
 643     }
 644
 645     for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
 646
 647     if (*p == ':') {
 648         return (false);
 649     }
 650     return (true);
 651 }
 652
 653 /*
 654  * Convert a relative URL to an absolute URL using the context of a given
 655  * request.
 656  *
 657  * It is assumed that you have already ensured that the URL is relative.
 658  *
 659  * If NULL is returned it is an indication that the method in use in the
 660  * request does not distinguish between relative and absolute and you should
 661  * use the url unchanged.
 662  *
 663  * If non-NULL is returned, it is up to the caller to free the resulting
 664  * memory using safe_free().
 665  */
 666 char *
 667 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
 668 {
 669
 670     if (req->method.id() == METHOD_CONNECT) {
 671         return (NULL);
 672     }
 673
 674     char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
 675
 676     if (req->protocol == AnyP::PROTO_URN) {
 677         snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
 678                  SQUIDSTRINGPRINT(req->urlpath));
 679         return (urlbuf);
 680     }
 681
 682     size_t urllen;
 683
 684     const URLScheme sch = req->protocol; // temporary, until bug 1961 URL handling is fixed.
 685     if (req->port != urlDefaultPort(req->protocol)) {
 686         urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s:%d",
 687                           sch.const_str(),
 688                           req->login,
 689                           *req->login ? "@" : null_string,
 690                           req->GetHost(),
 691                           req->port
 692                          );
 693     } else {
 694         urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s",
 695                           sch.const_str(),
 696                           req->login,
 697                           *req->login ? "@" : null_string,
 698                           req->GetHost()
 699                          );
 700     }
 701
 702     if (relUrl[0] == '/') {
 703         strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 704     } else {
 705         const char *path = req->urlpath.termedBuf();
 706         const char *last_slash = strrchr(path, '/');
 707
 708         if (last_slash == NULL) {
 709             urlbuf[urllen] = '/';
 710             ++urllen;
 711             strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 712         } else {
 713             ++last_slash;
 714             size_t pathlen = last_slash - path;
 715             if (pathlen > MAX_URL - urllen - 1) {
 716                 pathlen = MAX_URL - urllen - 1;
 717             }
 718             strncpy(&urlbuf[urllen], path, pathlen);
 719             urllen += pathlen;
 720             if (urllen + 1 < MAX_URL) {
 721                 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
 722             }
 723         }
 724     }
 725
 726     return (urlbuf);
 727 }
 728
 729 /*
 730  * matchDomainName() compares a hostname with a domainname according
 731  * to the following rules:
 732  *
 733  *    HOST          DOMAIN        MATCH?
 734  * ------------- -------------    ------
 735  *    foo.com       foo.com         YES
 736  *   .foo.com       foo.com         YES
 737  *  x.foo.com       foo.com          NO
 738  *    foo.com      .foo.com         YES
 739  *   .foo.com      .foo.com         YES
 740  *  x.foo.com      .foo.com         YES
 741  *
 742  *  We strip leading dots on hosts (but not domains!) so that
 743  *  ".foo.com" is is always the same as "foo.com".
 744  *
 745  *  Return values:
 746  *     0 means the host matches the domain
 747  *     1 means the host is greater than the domain
 748  *    -1 means the host is less than the domain
 749  */
 750
 751 int
 752 matchDomainName(const char *h, const char *d)
 753 {
 754     int dl;
 755     int hl;
 756
 757     while ('.' == *h)
 758         ++h;
 759
 760     hl = strlen(h);
 761
 762     dl = strlen(d);
 763
 764     /*
 765      * Start at the ends of the two strings and work towards the
 766      * beginning.
 767      */
 768     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 769         if (hl == 0 && dl == 0) {
 770             /*
 771              * We made it all the way to the beginning of both
 772              * strings without finding any difference.
 773              */
 774             return 0;
 775         }
 776
 777         if (0 == hl) {
 778             /*
 779              * The host string is shorter than the domain string.
 780              * There is only one case when this can be a match.
 781              * If the domain is just one character longer, and if
 782              * that character is a leading '.' then we call it a
 783              * match.
 784              */
 785
 786             if (1 == dl && '.' == d[0])
 787                 return 0;
 788             else
 789                 return -1;
 790         }
 791
 792         if (0 == dl) {
 793             /*
 794              * The domain string is shorter than the host string.
 795              * This is a match only if the first domain character
 796              * is a leading '.'.
 797              */
 798
 799             if ('.' == d[0])
 800                 return 0;
 801             else
 802                 return 1;
 803         }
 804     }
 805
 806     /*
 807      * We found different characters in the same position (from the end).
 808      */
 809     /*
 810      * If one of those character is '.' then its special.  In order
 811      * for splay tree sorting to work properly, "x-foo.com" must
 812      * be greater than ".foo.com" even though '-' is less than '.'.
 813      */
 814     if ('.' == d[dl])
 815         return 1;
 816
 817     if ('.' == h[hl])
 818         return -1;
 819
 820     return (xtolower(h[hl]) - xtolower(d[dl]));
 821 }
 822
 823 /*
 824  * return true if we can serve requests for this method.
 825  */
 826 int
 827 urlCheckRequest(const HttpRequest * r)
 828 {
 829     int rc = 0;
 830     /* protocol "independent" methods
 831      *
 832      * actually these methods are specific to HTTP:
 833      * they are methods we recieve on our HTTP port,
 834      * and if we had a FTP listener would not be relevant
 835      * there.
 836      *
 837      * So, we should delegate them to HTTP. The problem is that we
 838      * do not have a default protocol from the client side of HTTP.
 839      */
 840
 841     if (r->method == METHOD_CONNECT)
 842         return 1;
 843
 844     // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
 845     // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
 846     if (r->method == METHOD_OPTIONS || r->method == METHOD_TRACE)
 847         return (r->header.getInt64(HDR_MAX_FORWARDS) == 0 || r->urlpath != "*");
 848
 849     if (r->method == METHOD_PURGE)
 850         return 1;
 851
 852     /* does method match the protocol? */
 853     switch (r->protocol) {
 854
 855     case AnyP::PROTO_URN:
 856
 857     case AnyP::PROTO_HTTP:
 858
 859     case AnyP::PROTO_CACHE_OBJECT:
 860         rc = 1;
 861         break;
 862
 863     case AnyP::PROTO_FTP:
 864
 865         if (r->method == METHOD_PUT)
 866             rc = 1;
 867
 868     case AnyP::PROTO_GOPHER:
 869
 870     case AnyP::PROTO_WAIS:
 871
 872     case AnyP::PROTO_WHOIS:
 873         if (r->method == METHOD_GET)
 874             rc = 1;
 875         else if (r->method == METHOD_HEAD)
 876             rc = 1;
 877
 878         break;
 879
 880     case AnyP::PROTO_HTTPS:
 881 #if USE_SSL
 882
 883         rc = 1;
 884
 885         break;
 886
 887 #else
 888         /*
 889         * Squid can't originate an SSL connection, so it should
 890         * never receive an "https:" URL.  It should always be
 891         * CONNECT instead.
 892         */
 893         rc = 0;
 894
 895 #endif
 896
 897     default:
 898         break;
 899     }
 900
 901     return rc;
 902 }
 903
 904 /*
 905  * Quick-n-dirty host extraction from a URL.  Steps:
 906  *      Look for a colon
 907  *      Skip any '/' after the colon
 908  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 909  *      Look for an ending '/' or ':' and terminate
 910  *      Look for login info preceeded by '@'
 911  */
 912
 913 class URLHostName
 914 {
 915
 916 public:
 917     char * extract(char const *url);
 918
 919 private:
 920     static char Host [SQUIDHOSTNAMELEN];
 921     void init(char const *);
 922     void findHostStart();
 923     void trimTrailingChars();
 924     void trimAuth();
 925     char const *hostStart;
 926     char const *url;
 927 };
 928
 929 char *
 930 urlHostname(const char *url)
 931 {
 932     return URLHostName().extract(url);
 933 }
 934
 935 char URLHostName::Host[SQUIDHOSTNAMELEN];
 936
 937 void
 938 URLHostName::init(char const *aUrl)
 939 {
 940     Host[0] = '\0';
 941     url = aUrl;
 942 }
 943
 944 void
 945 URLHostName::findHostStart()
 946 {
 947     if (NULL == (hostStart = strchr(url, ':')))
 948         return;
 949
 950     ++hostStart;
 951
 952     while (*hostStart != '\0' && *hostStart == '/')
 953         ++hostStart;
 954
 955     if (*hostStart == ']')
 956         ++hostStart;
 957 }
 958
 959 void
 960 URLHostName::trimTrailingChars()
 961 {
 962     char *t;
 963
 964     if ((t = strchr(Host, '/')))
 965         *t = '\0';
 966
 967     if ((t = strrchr(Host, ':')))
 968         *t = '\0';
 969
 970     if ((t = strchr(Host, ']')))
 971         *t = '\0';
 972 }
 973
 974 void
 975 URLHostName::trimAuth()
 976 {
 977     char *t;
 978
 979     if ((t = strrchr(Host, '@'))) {
 980         ++t;
 981         memmove(Host, t, strlen(t) + 1);
 982     }
 983 }
 984
 985 char *
 986 URLHostName::extract(char const *aUrl)
 987 {
 988     init(aUrl);
 989     findHostStart();
 990
 991     if (hostStart == NULL)
 992         return NULL;
 993
 994     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 995
 996     trimTrailingChars();
 997
 998     trimAuth();
 999
1000     return Host;
1001 }
1002
1003 URL::URL() : scheme()
1004 {}
1005
1006 URL::URL(URLScheme const &aScheme): scheme(aScheme)
1007 {}