src/url.cc

   1
   2 /*
   3  * $Id: url.cc,v 1.154 2006/05/08 23:38:33 robertc Exp $
   4  *
   5  * DEBUG: section 23    URL Parsing
   6  * AUTHOR: Duane Wessels
   7  *
   8  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   9  * ----------------------------------------------------------
  10  *
  11  *  Squid is the result of efforts by numerous individuals from
  12  *  the Internet community; see the CONTRIBUTORS file for full
  13  *  details.   Many organizations have provided support for Squid's
  14  *  development; see the SPONSORS file for full details.  Squid is
  15  *  Copyrighted (C) 2001 by the Regents of the University of
  16  *  California; see the COPYRIGHT file for full details.  Squid
  17  *  incorporates software developed and/or copyrighted by other
  18  *  sources; see the CREDITS file for full details.
  19  *
  20  *  This program is free software; you can redistribute it and/or modify
  21  *  it under the terms of the GNU General Public License as published by
  22  *  the Free Software Foundation; either version 2 of the License, or
  23  *  (at your option) any later version.
  24  *
  25  *  This program is distributed in the hope that it will be useful,
  26  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  *  GNU General Public License for more details.
  29  *
  30  *  You should have received a copy of the GNU General Public License
  31  *  along with this program; if not, write to the Free Software
  32  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  33  *
  34  */
  35
  36 #include "URL.h"
  37 #include "HttpRequest.h"
  38 #include "URLScheme.h"
  39
  40 static HttpRequest *urnParse(method_t method, char *urn);
  41 #if CHECK_HOSTNAMES
  42 static const char *const valid_hostname_chars =
  43 #if ALLOW_HOSTNAME_UNDERSCORES
  44     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  45     "abcdefghijklmnopqrstuvwxyz"
  46     "0123456789-._";
  47 #else
  48     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  49     "abcdefghijklmnopqrstuvwxyz"
  50     "0123456789-."
  51     ;
  52 #endif
  53 #endif /* CHECK_HOSTNAMES */
  54
  55 /* convert %xx in url string to a character
  56  * Allocate a new string and return a pointer to converted string */
  57
  58 char *
  59 url_convert_hex(char *org_url, int allocate)
  60 {
  61     static char code[] = "00";
  62     char *url = NULL;
  63     char *s = NULL;
  64     char *t = NULL;
  65     url = allocate ? (char *) xstrdup(org_url) : org_url;
  66
  67     if ((int) strlen(url) < 3 || !strchr(url, '%'))
  68         return url;
  69
  70     for (s = t = url; *s; s++) {
  71         if (*s == '%' && *(s + 1) && *(s + 2)) {
  72             code[0] = *(++s);
  73             code[1] = *(++s);
  74             *t++ = (char) strtol(code, NULL, 16);
  75         } else {
  76             *t++ = *s;
  77         }
  78     }
  79
  80     do {
  81         *t++ = *s;
  82     } while (*s++);
  83
  84     return url;
  85 }
  86
  87 void
  88 urlInitialize(void)
  89 {
  90     debug(23, 5) ("urlInitialize: Initializing...\n");
  91     /* this ensures that the number of protocol strings is the same as
  92      * the enum slots allocated because the last enum is always 'TOTAL'.
  93      */
  94     assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
  95     /*
  96      * These test that our matchDomainName() function works the
  97      * way we expect it to.
  98      */
  99     assert(0 == matchDomainName("foo.com", "foo.com"));
 100     assert(0 == matchDomainName(".foo.com", "foo.com"));
 101     assert(0 == matchDomainName("foo.com", ".foo.com"));
 102     assert(0 == matchDomainName(".foo.com", ".foo.com"));
 103     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
 104     assert(0 != matchDomainName("x.foo.com", "foo.com"));
 105     assert(0 != matchDomainName("foo.com", "x.foo.com"));
 106     assert(0 != matchDomainName("bar.com", "foo.com"));
 107     assert(0 != matchDomainName(".bar.com", "foo.com"));
 108     assert(0 != matchDomainName(".bar.com", ".foo.com"));
 109     assert(0 != matchDomainName("bar.com", ".foo.com"));
 110     assert(0 < matchDomainName("zzz.com", "foo.com"));
 111     assert(0 > matchDomainName("aaa.com", "foo.com"));
 112     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 113     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 114     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 115     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 116     /* more cases? */
 117 }
 118
 119 /*
 120  * urlParseProtocol() takes begin (b) and end (e) pointers, but for
 121  * backwards compatibility, e defaults to NULL, in which case we
 122  * assume b is NULL-terminated.
 123  */
 124 protocol_t
 125 urlParseProtocol(const char *b, const char *e)
 126 {
 127     /*
 128      * if e is NULL, b must be NULL terminated and we
 129      * make e point to the first whitespace character
 130      * after b.
 131      */
 132
 133     if (NULL == e)
 134         e = b + strcspn(b, ":");
 135
 136     int len = e - b;
 137
 138     /* test common stuff first */
 139
 140     if (strncasecmp(b, "http", len) == 0)
 141         return PROTO_HTTP;
 142
 143     if (strncasecmp(b, "ftp", len) == 0)
 144         return PROTO_FTP;
 145
 146     if (strncasecmp(b, "https", len) == 0)
 147         return PROTO_HTTPS;
 148
 149     if (strncasecmp(b, "file", len) == 0)
 150         return PROTO_FTP;
 151
 152     if (strncasecmp(b, "gopher", len) == 0)
 153         return PROTO_GOPHER;
 154
 155     if (strncasecmp(b, "wais", len) == 0)
 156         return PROTO_WAIS;
 157
 158     if (strncasecmp(b, "cache_object", len) == 0)
 159         return PROTO_CACHEOBJ;
 160
 161     if (strncasecmp(b, "urn", len) == 0)
 162         return PROTO_URN;
 163
 164     if (strncasecmp(b, "whois", len) == 0)
 165         return PROTO_WHOIS;
 166
 167     if (strncasecmp(b, "internal", len) == 0)
 168         return PROTO_INTERNAL;
 169
 170     return PROTO_NONE;
 171 }
 172
 173 int
 174 urlDefaultPort(protocol_t p)
 175 {
 176     switch (p) {
 177
 178     case PROTO_HTTP:
 179         return 80;
 180
 181     case PROTO_HTTPS:
 182         return 443;
 183
 184     case PROTO_FTP:
 185         return 21;
 186
 187     case PROTO_GOPHER:
 188         return 70;
 189
 190     case PROTO_WAIS:
 191         return 210;
 192
 193     case PROTO_CACHEOBJ:
 194
 195     case PROTO_INTERNAL:
 196         return CACHE_HTTP_PORT;
 197
 198     case PROTO_WHOIS:
 199         return 43;
 200
 201     default:
 202         return 0;
 203     }
 204 }
 205
 206 /*
 207  * Parse a URI/URL.
 208  *
 209  * If the 'request' arg is non-NULL, put parsed values there instead
 210  * of allocating a new HttpRequest.
 211  *
 212  * This abuses HttpRequest as a way of representing the parsed url
 213  * and its components.
 214  * method is used to switch parsers and to init the HttpRequest.
 215  * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
 216  * looked for.
 217  * The url is non const so that if its too long we can NULL-terminate it in place.
 218  */
 219 HttpRequest *
 220 urlParse(method_t method, char *url, HttpRequest *request)
 221 {
 222     LOCAL_ARRAY(char, proto, MAX_URL);
 223     LOCAL_ARRAY(char, login, MAX_URL);
 224     LOCAL_ARRAY(char, host, MAX_URL);
 225     LOCAL_ARRAY(char, urlpath, MAX_URL);
 226     char *t = NULL;
 227     char *q = NULL;
 228     int port;
 229     protocol_t protocol = PROTO_NONE;
 230     int l;
 231     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 232
 233     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 234         /* terminate so it doesn't overflow other buffers */
 235         *(url + (MAX_URL >> 1)) = '\0';
 236         debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
 237         return NULL;
 238     }
 239
 240     if (method == METHOD_CONNECT) {
 241         port = CONNECT_PORT;
 242
 243         if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 244             return NULL;
 245     } else if (!strncmp(url, "urn:", 4)) {
 246         return urnParse(method, url);
 247     } else {
 248         if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
 249             return NULL;
 250
 251         protocol = urlParseProtocol(proto);
 252
 253         port = urlDefaultPort(protocol);
 254
 255         /* Is there any login informaiton? */
 256         if ((t = strrchr(host, '@'))) {
 257             strcpy((char *) login, (char *) host);
 258             t = strrchr(login, '@');
 259             *t = 0;
 260             strcpy((char *) host, t + 1);
 261         }
 262
 263         if ((t = strrchr(host, ':'))) {
 264             *t++ = '\0';
 265
 266             if (*t != '\0')
 267                 port = atoi(t);
 268         }
 269     }
 270
 271     for (t = host; *t; t++)
 272         *t = xtolower(*t);
 273
 274     if (stringHasWhitespace(host)) {
 275         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 276             t = q = host;
 277
 278             while (*t) {
 279                 if (!xisspace(*t))
 280                     *q++ = *t;
 281
 282                 t++;
 283             }
 284
 285             *q = '\0';
 286         }
 287     }
 288
 289 #if CHECK_HOSTNAMES
 290     if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
 291         debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
 292         return NULL;
 293     }
 294
 295 #endif
 296 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
 297     /* remove trailing dots from hostnames */
 298     while ((l = strlen(host)) > 0 && host[--l] == '.')
 299         host[l] = '\0';
 300
 301     /* remove duplicate dots */
 302     while ((t = strstr(host, "..")))
 303         xmemmove(t, t + 1, strlen(t));
 304
 305 #endif
 306
 307     if (Config.appendDomain && !strchr(host, '.'))
 308         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 309
 310     if (port < 1 || port > 65535) {
 311         debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
 312         return NULL;
 313     }
 314
 315 #ifdef HARDCODE_DENY_PORTS
 316     /* These ports are filtered in the default squid.conf, but
 317      * maybe someone wants them hardcoded... */
 318     if (port == 7 || port == 9 || port == 19) {
 319         debug(23, 0) ("urlParse: Deny access to port %d\n", port);
 320         return NULL;
 321     }
 322
 323 #endif
 324     if (stringHasWhitespace(urlpath)) {
 325         debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
 326
 327         switch (Config.uri_whitespace) {
 328
 329         case URI_WHITESPACE_DENY:
 330             return NULL;
 331
 332         case URI_WHITESPACE_ALLOW:
 333             break;
 334
 335         case URI_WHITESPACE_ENCODE:
 336             t = rfc1738_escape_unescaped(urlpath);
 337             xstrncpy(urlpath, t, MAX_URL);
 338             break;
 339
 340         case URI_WHITESPACE_CHOP:
 341             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 342             break;
 343
 344         case URI_WHITESPACE_STRIP:
 345
 346         default:
 347             t = q = urlpath;
 348
 349             while (*t) {
 350                 if (!xisspace(*t))
 351                     *q++ = *t;
 352
 353                 t++;
 354             }
 355
 356             *q = '\0';
 357         }
 358     }
 359
 360     if (NULL == request)
 361         request = new HttpRequest(method, protocol, urlpath);
 362     else {
 363         request->initHTTP(method, protocol, urlpath);
 364     }
 365
 366     xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
 367     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 368     request->port = (u_short) port;
 369     return request;
 370 }
 371
 372 static HttpRequest *
 373 urnParse(method_t method, char *urn)
 374 {
 375     debug(50, 5) ("urnParse: %s\n", urn);
 376     return new HttpRequest(method, PROTO_URN, urn + 4);
 377 }
 378
 379 const char *
 380 urlCanonical(HttpRequest * request)
 381 {
 382     LOCAL_ARRAY(char, portbuf, 32);
 383     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 384
 385     if (request->canonical)
 386         return request->canonical;
 387
 388     if (request->protocol == PROTO_URN) {
 389         snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
 390     } else {
 391         switch (request->method) {
 392
 393         case METHOD_CONNECT:
 394             snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
 395             break;
 396
 397         default:
 398             portbuf[0] = '\0';
 399
 400             if (request->port != urlDefaultPort(request->protocol))
 401                 snprintf(portbuf, 32, ":%d", request->port);
 402
 403             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
 404                      ProtocolStr[request->protocol],
 405                      request->login,
 406                      *request->login ? "@" : null_string,
 407                      request->host,
 408                      portbuf,
 409                      request->urlpath.buf());
 410
 411             break;
 412         }
 413     }
 414
 415     return (request->canonical = xstrdup(urlbuf));
 416 }
 417
 418 char *
 419 urlCanonicalClean(const HttpRequest * request)
 420 {
 421     LOCAL_ARRAY(char, buf, MAX_URL);
 422     LOCAL_ARRAY(char, portbuf, 32);
 423     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 424     char *t;
 425
 426     if (request->protocol == PROTO_URN) {
 427         snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
 428     } else {
 429         switch (request->method) {
 430
 431         case METHOD_CONNECT:
 432             snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
 433             break;
 434
 435         default:
 436             portbuf[0] = '\0';
 437
 438             if (request->port != urlDefaultPort(request->protocol))
 439                 snprintf(portbuf, 32, ":%d", request->port);
 440
 441             loginbuf[0] = '\0';
 442
 443             if ((int) strlen(request->login) > 0) {
 444                 strcpy(loginbuf, request->login);
 445
 446                 if ((t = strchr(loginbuf, ':')))
 447                     *t = '\0';
 448
 449                 strcat(loginbuf, "@");
 450             }
 451
 452             snprintf(buf, MAX_URL, "%s://%s%s%s%s",
 453                      ProtocolStr[request->protocol],
 454                      loginbuf,
 455                      request->host,
 456                      portbuf,
 457                      request->urlpath.buf());
 458             /*
 459              * strip arguments AFTER a question-mark
 460              */
 461
 462             if (Config.onoff.strip_query_terms)
 463                 if ((t = strchr(buf, '?')))
 464                     *(++t) = '\0';
 465
 466             break;
 467         }
 468     }
 469
 470     if (stringHasCntl(buf))
 471         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 472
 473     return buf;
 474 }
 475
 476 /*
 477  * matchDomainName() compares a hostname with a domainname according
 478  * to the following rules:
 479  *
 480  *    HOST          DOMAIN        MATCH?
 481  * ------------- -------------    ------
 482  *    foo.com       foo.com         YES
 483  *   .foo.com       foo.com         YES
 484  *  x.foo.com       foo.com          NO
 485  *    foo.com      .foo.com         YES
 486  *   .foo.com      .foo.com         YES
 487  *  x.foo.com      .foo.com         YES
 488  *
 489  *  We strip leading dots on hosts (but not domains!) so that
 490  *  ".foo.com" is is always the same as "foo.com".
 491  *
 492  *  Return values:
 493  *     0 means the host matches the domain
 494  *     1 means the host is greater than the domain
 495  *    -1 means the host is less than the domain
 496  */
 497
 498 int
 499 matchDomainName(const char *h, const char *d)
 500 {
 501     int dl;
 502     int hl;
 503
 504     while ('.' == *h)
 505         h++;
 506
 507     hl = strlen(h);
 508
 509     dl = strlen(d);
 510
 511     /*
 512      * Start at the ends of the two strings and work towards the
 513      * beginning.
 514      */
 515     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 516         if (hl == 0 && dl == 0) {
 517             /*
 518              * We made it all the way to the beginning of both
 519              * strings without finding any difference.
 520              */
 521             return 0;
 522         }
 523
 524         if (0 == hl) {
 525             /*
 526              * The host string is shorter than the domain string.
 527              * There is only one case when this can be a match.
 528              * If the domain is just one character longer, and if
 529              * that character is a leading '.' then we call it a
 530              * match.
 531              */
 532
 533             if (1 == dl && '.' == d[0])
 534                 return 0;
 535             else
 536                 return -1;
 537         }
 538
 539         if (0 == dl) {
 540             /*
 541              * The domain string is shorter than the host string.
 542              * This is a match only if the first domain character
 543              * is a leading '.'.
 544              */
 545
 546             if ('.' == d[0])
 547                 return 0;
 548             else
 549                 return 1;
 550         }
 551     }
 552
 553     /*
 554      * We found different characters in the same position (from the end).
 555      */
 556     /*
 557      * If one of those character is '.' then its special.  In order
 558      * for splay tree sorting to work properly, "x-foo.com" must
 559      * be greater than ".foo.com" even though '-' is less than '.'.
 560      */
 561     if ('.' == d[dl])
 562         return 1;
 563
 564     if ('.' == h[hl])
 565         return -1;
 566
 567     return (xtolower(h[hl]) - xtolower(d[dl]));
 568 }
 569
 570
 571 /*
 572  * what does the return code of this mean ?
 573  */
 574 int
 575 urlCheckRequest(const HttpRequest * r)
 576 {
 577     int rc = 0;
 578     /* protocol "independent" methods */
 579
 580     if (r->method == METHOD_CONNECT)
 581         return 1;
 582
 583     if (r->method == METHOD_TRACE)
 584         return 1;
 585
 586     if (r->method == METHOD_PURGE)
 587         return 1;
 588
 589     /* does method match the protocol? */
 590     switch (r->protocol) {
 591
 592     case PROTO_URN:
 593
 594     case PROTO_HTTP:
 595
 596     case PROTO_CACHEOBJ:
 597         rc = 1;
 598         break;
 599
 600     case PROTO_FTP:
 601
 602         if (r->method == METHOD_PUT)
 603             rc = 1;
 604
 605     case PROTO_GOPHER:
 606
 607     case PROTO_WAIS:
 608
 609     case PROTO_WHOIS:
 610         if (r->method == METHOD_GET)
 611             rc = 1;
 612         else if (r->method == METHOD_HEAD)
 613             rc = 1;
 614
 615         break;
 616
 617     case PROTO_HTTPS:
 618 #ifdef USE_SSL
 619
 620         rc = 1;
 621
 622         break;
 623
 624 #else
 625         /*
 626         * Squid can't originate an SSL connection, so it should
 627         * never receive an "https:" URL.  It should always be
 628         * CONNECT instead.
 629         */
 630         rc = 0;
 631
 632 #endif
 633
 634     default:
 635         break;
 636     }
 637
 638     return rc;
 639 }
 640
 641 /*
 642  * Quick-n-dirty host extraction from a URL.  Steps:
 643  *      Look for a colon
 644  *      Skip any '/' after the colon
 645  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 646  *      Look for an ending '/' or ':' and terminate
 647  *      Look for login info preceeded by '@'
 648  */
 649
 650 class URLHostName
 651 {
 652
 653 public:
 654     char * extract(char const *url);
 655
 656 private:
 657     static char Host [SQUIDHOSTNAMELEN];
 658     void init(char const *);
 659     void findHostStart();
 660     void trimTrailingChars();
 661     void trimAuth();
 662     char const *hostStart;
 663     char const *url;
 664 };
 665
 666 char *
 667 urlHostname(const char *url)
 668 {
 669     return URLHostName().extract(url);
 670 }
 671
 672 char URLHostName::Host[SQUIDHOSTNAMELEN];
 673
 674 void
 675 URLHostName::init(char const *aUrl)
 676 {
 677     Host[0] = '\0';
 678     url = url;
 679 }
 680
 681 void
 682 URLHostName::findHostStart()
 683 {
 684     if (NULL == (hostStart = strchr(url, ':')))
 685         return;
 686
 687     ++hostStart;
 688
 689     while (*hostStart != '\0' && *hostStart == '/')
 690         ++hostStart;
 691 }
 692
 693 void
 694 URLHostName::trimTrailingChars()
 695 {
 696     char *t;
 697
 698     if ((t = strchr(Host, '/')))
 699         *t = '\0';
 700
 701     if ((t = strchr(Host, ':')))
 702         *t = '\0';
 703 }
 704
 705 void
 706 URLHostName::trimAuth()
 707 {
 708     char *t;
 709
 710     if ((t = strrchr(Host, '@'))) {
 711         t++;
 712         xmemmove(Host, t, strlen(t) + 1);
 713     }
 714 }
 715
 716 char *
 717 URLHostName::extract(char const *aUrl)
 718 {
 719     init(aUrl);
 720     findHostStart();
 721
 722     if (hostStart == NULL)
 723         return NULL;
 724
 725     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 726
 727     trimTrailingChars();
 728
 729     trimAuth();
 730
 731     return Host;
 732 }
 733
 734 URL::URL() : scheme()
 735 {}
 736
 737 URL::URL(URLScheme const &aScheme): scheme(aScheme)
 738 {}