src/url.cc

   1
   2 /*
   3  * $Id: url.cc,v 1.161 2007/05/23 21:10:07 hno Exp $
   4  *
   5  * DEBUG: section 23    URL Parsing
   6  * AUTHOR: Duane Wessels
   7  *
   8  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   9  * ----------------------------------------------------------
  10  *
  11  *  Squid is the result of efforts by numerous individuals from
  12  *  the Internet community; see the CONTRIBUTORS file for full
  13  *  details.   Many organizations have provided support for Squid's
  14  *  development; see the SPONSORS file for full details.  Squid is
  15  *  Copyrighted (C) 2001 by the Regents of the University of
  16  *  California; see the COPYRIGHT file for full details.  Squid
  17  *  incorporates software developed and/or copyrighted by other
  18  *  sources; see the CREDITS file for full details.
  19  *
  20  *  This program is free software; you can redistribute it and/or modify
  21  *  it under the terms of the GNU General Public License as published by
  22  *  the Free Software Foundation; either version 2 of the License, or
  23  *  (at your option) any later version.
  24  *
  25  *  This program is distributed in the hope that it will be useful,
  26  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  *  GNU General Public License for more details.
  29  *
  30  *  You should have received a copy of the GNU General Public License
  31  *  along with this program; if not, write to the Free Software
  32  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  33  *
  34  */
  35
  36 #include "URL.h"
  37 #include "HttpRequest.h"
  38 #include "URLScheme.h"
  39
  40 static HttpRequest *urnParse(method_t method, char *urn);
  41 static const char valid_hostname_chars_u[] =
  42     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  43     "abcdefghijklmnopqrstuvwxyz"
  44     "0123456789-._"
  45     ;
  46 static const char valid_hostname_chars[] =
  47     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  48     "abcdefghijklmnopqrstuvwxyz"
  49     "0123456789-."
  50     ;
  51
  52 void
  53 urlInitialize(void)
  54 {
  55     debugs(23, 5, "urlInitialize: Initializing...");
  56     /* this ensures that the number of protocol strings is the same as
  57      * the enum slots allocated because the last enum is always 'TOTAL'.
  58      */
  59     assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
  60     /*
  61      * These test that our matchDomainName() function works the
  62      * way we expect it to.
  63      */
  64     assert(0 == matchDomainName("foo.com", "foo.com"));
  65     assert(0 == matchDomainName(".foo.com", "foo.com"));
  66     assert(0 == matchDomainName("foo.com", ".foo.com"));
  67     assert(0 == matchDomainName(".foo.com", ".foo.com"));
  68     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
  69     assert(0 != matchDomainName("x.foo.com", "foo.com"));
  70     assert(0 != matchDomainName("foo.com", "x.foo.com"));
  71     assert(0 != matchDomainName("bar.com", "foo.com"));
  72     assert(0 != matchDomainName(".bar.com", "foo.com"));
  73     assert(0 != matchDomainName(".bar.com", ".foo.com"));
  74     assert(0 != matchDomainName("bar.com", ".foo.com"));
  75     assert(0 < matchDomainName("zzz.com", "foo.com"));
  76     assert(0 > matchDomainName("aaa.com", "foo.com"));
  77     assert(0 == matchDomainName("FOO.com", "foo.COM"));
  78     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
  79     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
  80     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
  81     /* more cases? */
  82 }
  83
  84 /*
  85  * urlParseProtocol() takes begin (b) and end (e) pointers, but for
  86  * backwards compatibility, e defaults to NULL, in which case we
  87  * assume b is NULL-terminated.
  88  */
  89 protocol_t
  90 urlParseProtocol(const char *b, const char *e)
  91 {
  92     /*
  93      * if e is NULL, b must be NULL terminated and we
  94      * make e point to the first whitespace character
  95      * after b.
  96      */
  97
  98     if (NULL == e)
  99         e = b + strcspn(b, ":");
 100
 101     int len = e - b;
 102
 103     /* test common stuff first */
 104
 105     if (strncasecmp(b, "http", len) == 0)
 106         return PROTO_HTTP;
 107
 108     if (strncasecmp(b, "ftp", len) == 0)
 109         return PROTO_FTP;
 110
 111     if (strncasecmp(b, "https", len) == 0)
 112         return PROTO_HTTPS;
 113
 114     if (strncasecmp(b, "file", len) == 0)
 115         return PROTO_FTP;
 116
 117     if (strncasecmp(b, "gopher", len) == 0)
 118         return PROTO_GOPHER;
 119
 120     if (strncasecmp(b, "wais", len) == 0)
 121         return PROTO_WAIS;
 122
 123     if (strncasecmp(b, "cache_object", len) == 0)
 124         return PROTO_CACHEOBJ;
 125
 126     if (strncasecmp(b, "urn", len) == 0)
 127         return PROTO_URN;
 128
 129     if (strncasecmp(b, "whois", len) == 0)
 130         return PROTO_WHOIS;
 131
 132     if (strncasecmp(b, "internal", len) == 0)
 133         return PROTO_INTERNAL;
 134
 135     return PROTO_NONE;
 136 }
 137
 138 int
 139 urlDefaultPort(protocol_t p)
 140 {
 141     switch (p) {
 142
 143     case PROTO_HTTP:
 144         return 80;
 145
 146     case PROTO_HTTPS:
 147         return 443;
 148
 149     case PROTO_FTP:
 150         return 21;
 151
 152     case PROTO_GOPHER:
 153         return 70;
 154
 155     case PROTO_WAIS:
 156         return 210;
 157
 158     case PROTO_CACHEOBJ:
 159
 160     case PROTO_INTERNAL:
 161         return CACHE_HTTP_PORT;
 162
 163     case PROTO_WHOIS:
 164         return 43;
 165
 166     default:
 167         return 0;
 168     }
 169 }
 170
 171 /*
 172  * Parse a URI/URL.
 173  *
 174  * If the 'request' arg is non-NULL, put parsed values there instead
 175  * of allocating a new HttpRequest.
 176  *
 177  * This abuses HttpRequest as a way of representing the parsed url
 178  * and its components.
 179  * method is used to switch parsers and to init the HttpRequest.
 180  * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
 181  * looked for.
 182  * The url is non const so that if its too long we can NULL-terminate it in place.
 183  */
 184 HttpRequest *
 185 urlParse(method_t method, char *url, HttpRequest *request)
 186 {
 187     LOCAL_ARRAY(char, proto, MAX_URL);
 188     LOCAL_ARRAY(char, login, MAX_URL);
 189     LOCAL_ARRAY(char, host, MAX_URL);
 190     LOCAL_ARRAY(char, urlpath, MAX_URL);
 191     char *t = NULL;
 192     char *q = NULL;
 193     int port;
 194     protocol_t protocol = PROTO_NONE;
 195     int l;
 196     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 197
 198     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 199         /* terminate so it doesn't overflow other buffers */
 200         *(url + (MAX_URL >> 1)) = '\0';
 201         debugs(23, 1, "urlParse: URL too large (" << l << " bytes)");
 202         return NULL;
 203     }
 204
 205     if (method == METHOD_CONNECT) {
 206         port = CONNECT_PORT;
 207
 208         if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 209             return NULL;
 210     } else if (!strncmp(url, "urn:", 4)) {
 211         return urnParse(method, url);
 212     } else {
 213         if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
 214             return NULL;
 215
 216         protocol = urlParseProtocol(proto);
 217
 218         port = urlDefaultPort(protocol);
 219
 220         /* Is there any login informaiton? */
 221         if ((t = strrchr(host, '@'))) {
 222             strcpy((char *) login, (char *) host);
 223             t = strrchr(login, '@');
 224             *t = 0;
 225             strcpy((char *) host, t + 1);
 226         }
 227
 228         if ((t = strrchr(host, ':'))) {
 229             *t++ = '\0';
 230
 231             if (*t != '\0')
 232                 port = atoi(t);
 233         }
 234     }
 235
 236     for (t = host; *t; t++)
 237         *t = xtolower(*t);
 238
 239     if (strpbrk(host, w_space) != NULL) {
 240         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 241             t = q = host;
 242
 243             while (*t) {
 244                 if (!xisspace(*t))
 245                     *q++ = *t;
 246
 247                 t++;
 248             }
 249
 250             *q = '\0';
 251         }
 252     }
 253
 254     if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
 255         debugs(23, 1, "urlParse: Illegal character in hostname '" << host << "'");
 256         return NULL;
 257     }
 258
 259 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
 260     /* remove trailing dots from hostnames */
 261     while ((l = strlen(host)) > 0 && host[--l] == '.')
 262         host[l] = '\0';
 263
 264     /* remove duplicate dots */
 265     while ((t = strstr(host, "..")))
 266         xmemmove(t, t + 1, strlen(t));
 267
 268 #endif
 269
 270     if (Config.appendDomain && !strchr(host, '.'))
 271         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 272
 273     if (port < 1 || port > 65535) {
 274         debugs(23, 3, "urlParse: Invalid port '" << port << "'");
 275         return NULL;
 276     }
 277
 278 #ifdef HARDCODE_DENY_PORTS
 279     /* These ports are filtered in the default squid.conf, but
 280      * maybe someone wants them hardcoded... */
 281     if (port == 7 || port == 9 || port == 19) {
 282         debugs(23, 0, "urlParse: Deny access to port " << port);
 283         return NULL;
 284     }
 285
 286 #endif
 287     if (strpbrk(urlpath, w_space) != NULL) {
 288         debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
 289
 290         switch (Config.uri_whitespace) {
 291
 292         case URI_WHITESPACE_DENY:
 293             return NULL;
 294
 295         case URI_WHITESPACE_ALLOW:
 296             break;
 297
 298         case URI_WHITESPACE_ENCODE:
 299             t = rfc1738_escape_unescaped(urlpath);
 300             xstrncpy(urlpath, t, MAX_URL);
 301             break;
 302
 303         case URI_WHITESPACE_CHOP:
 304             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 305             break;
 306
 307         case URI_WHITESPACE_STRIP:
 308
 309         default:
 310             t = q = urlpath;
 311
 312             while (*t) {
 313                 if (!xisspace(*t))
 314                     *q++ = *t;
 315
 316                 t++;
 317             }
 318
 319             *q = '\0';
 320         }
 321     }
 322
 323     if (NULL == request)
 324         request = new HttpRequest(method, protocol, urlpath);
 325     else {
 326         request->initHTTP(method, protocol, urlpath);
 327     }
 328
 329     xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
 330     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 331     request->port = (u_short) port;
 332     return request;
 333 }
 334
 335 static HttpRequest *
 336 urnParse(method_t method, char *urn)
 337 {
 338     debugs(50, 5, "urnParse: " << urn);
 339     return new HttpRequest(method, PROTO_URN, urn + 4);
 340 }
 341
 342 const char *
 343 urlCanonical(HttpRequest * request)
 344 {
 345     LOCAL_ARRAY(char, portbuf, 32);
 346     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 347
 348     if (request->canonical)
 349         return request->canonical;
 350
 351     if (request->protocol == PROTO_URN) {
 352         snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.c_str());
 353     } else {
 354         switch (request->method) {
 355
 356         case METHOD_CONNECT:
 357             snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
 358             break;
 359
 360         default:
 361             portbuf[0] = '\0';
 362
 363             if (request->port != urlDefaultPort(request->protocol))
 364                 snprintf(portbuf, 32, ":%d", request->port);
 365
 366             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
 367                      ProtocolStr[request->protocol],
 368                      request->login,
 369                      *request->login ? "@" : null_string,
 370                      request->host,
 371                      portbuf,
 372                      request->urlpath.c_str());
 373
 374             break;
 375         }
 376     }
 377
 378     return (request->canonical = xstrdup(urlbuf));
 379 }
 380
 381 int
 382 stringHasCntl(const char *s)
 383 {
 384     unsigned char c;
 385
 386     while ((c = (unsigned char) *s++) != '\0') {
 387         if (c <= 0x1f)
 388             return 1;
 389
 390         if (c >= 0x7f && c <= 0x9f)
 391             return 1;
 392     }
 393
 394     return 0;
 395 }
 396
 397 char *
 398 urlCanonicalClean(const HttpRequest * request)
 399 {
 400     LOCAL_ARRAY(char, buf, MAX_URL);
 401     LOCAL_ARRAY(char, portbuf, 32);
 402     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 403     char *t;
 404
 405     if (request->protocol == PROTO_URN) {
 406         snprintf(buf, MAX_URL, "urn:%s", request->urlpath.c_str());
 407     } else {
 408         switch (request->method) {
 409
 410         case METHOD_CONNECT:
 411             snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
 412             break;
 413
 414         default:
 415             portbuf[0] = '\0';
 416
 417             if (request->port != urlDefaultPort(request->protocol))
 418                 snprintf(portbuf, 32, ":%d", request->port);
 419
 420             loginbuf[0] = '\0';
 421
 422             if ((int) strlen(request->login) > 0) {
 423                 strcpy(loginbuf, request->login);
 424
 425                 if ((t = strchr(loginbuf, ':')))
 426                     *t = '\0';
 427
 428                 strcat(loginbuf, "@");
 429             }
 430
 431             snprintf(buf, MAX_URL, "%s://%s%s%s%s",
 432                      ProtocolStr[request->protocol],
 433                      loginbuf,
 434                      request->host,
 435                      portbuf,
 436                      request->urlpath.c_str());
 437             /*
 438              * strip arguments AFTER a question-mark
 439              */
 440
 441             if (Config.onoff.strip_query_terms)
 442                 if ((t = strchr(buf, '?')))
 443                     *(++t) = '\0';
 444
 445             break;
 446         }
 447     }
 448
 449     if (stringHasCntl(buf))
 450         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 451
 452     return buf;
 453 }
 454
 455 /*
 456  * matchDomainName() compares a hostname with a domainname according
 457  * to the following rules:
 458  *
 459  *    HOST          DOMAIN        MATCH?
 460  * ------------- -------------    ------
 461  *    foo.com       foo.com         YES
 462  *   .foo.com       foo.com         YES
 463  *  x.foo.com       foo.com          NO
 464  *    foo.com      .foo.com         YES
 465  *   .foo.com      .foo.com         YES
 466  *  x.foo.com      .foo.com         YES
 467  *
 468  *  We strip leading dots on hosts (but not domains!) so that
 469  *  ".foo.com" is is always the same as "foo.com".
 470  *
 471  *  Return values:
 472  *     0 means the host matches the domain
 473  *     1 means the host is greater than the domain
 474  *    -1 means the host is less than the domain
 475  */
 476
 477 int
 478 matchDomainName(const char *h, const char *d)
 479 {
 480     int dl;
 481     int hl;
 482
 483     while ('.' == *h)
 484         h++;
 485
 486     hl = strlen(h);
 487
 488     dl = strlen(d);
 489
 490     /*
 491      * Start at the ends of the two strings and work towards the
 492      * beginning.
 493      */
 494     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 495         if (hl == 0 && dl == 0) {
 496             /*
 497              * We made it all the way to the beginning of both
 498              * strings without finding any difference.
 499              */
 500             return 0;
 501         }
 502
 503         if (0 == hl) {
 504             /*
 505              * The host string is shorter than the domain string.
 506              * There is only one case when this can be a match.
 507              * If the domain is just one character longer, and if
 508              * that character is a leading '.' then we call it a
 509              * match.
 510              */
 511
 512             if (1 == dl && '.' == d[0])
 513                 return 0;
 514             else
 515                 return -1;
 516         }
 517
 518         if (0 == dl) {
 519             /*
 520              * The domain string is shorter than the host string.
 521              * This is a match only if the first domain character
 522              * is a leading '.'.
 523              */
 524
 525             if ('.' == d[0])
 526                 return 0;
 527             else
 528                 return 1;
 529         }
 530     }
 531
 532     /*
 533      * We found different characters in the same position (from the end).
 534      */
 535     /*
 536      * If one of those character is '.' then its special.  In order
 537      * for splay tree sorting to work properly, "x-foo.com" must
 538      * be greater than ".foo.com" even though '-' is less than '.'.
 539      */
 540     if ('.' == d[dl])
 541         return 1;
 542
 543     if ('.' == h[hl])
 544         return -1;
 545
 546     return (xtolower(h[hl]) - xtolower(d[dl]));
 547 }
 548
 549
 550 /*
 551  * return true if we can serve requests for this method.
 552  */
 553 int
 554 urlCheckRequest(const HttpRequest * r)
 555 {
 556     int rc = 0;
 557     /* protocol "independent" methods
 558      *
 559      * actually these methods are specific to HTTP:
 560      * they are methods we recieve on our HTTP port,
 561      * and if we had a FTP listener would not be relevant
 562      * there.
 563      *
 564      * So, we should delegate them to HTTP. The problem is that we
 565      * do not have a default protocol from the client side of HTTP.
 566      */
 567
 568     if (r->method == METHOD_CONNECT)
 569         return 1;
 570
 571     if (r->method == METHOD_TRACE)
 572         return 1;
 573
 574     if (r->method == METHOD_PURGE)
 575         return 1;
 576
 577     /* does method match the protocol? */
 578     switch (r->protocol) {
 579
 580     case PROTO_URN:
 581
 582     case PROTO_HTTP:
 583
 584     case PROTO_CACHEOBJ:
 585         rc = 1;
 586         break;
 587
 588     case PROTO_FTP:
 589
 590         if (r->method == METHOD_PUT)
 591             rc = 1;
 592
 593     case PROTO_GOPHER:
 594
 595     case PROTO_WAIS:
 596
 597     case PROTO_WHOIS:
 598         if (r->method == METHOD_GET)
 599             rc = 1;
 600         else if (r->method == METHOD_HEAD)
 601             rc = 1;
 602
 603         break;
 604
 605     case PROTO_HTTPS:
 606 #ifdef USE_SSL
 607
 608         rc = 1;
 609
 610         break;
 611
 612 #else
 613         /*
 614         * Squid can't originate an SSL connection, so it should
 615         * never receive an "https:" URL.  It should always be
 616         * CONNECT instead.
 617         */
 618         rc = 0;
 619
 620 #endif
 621
 622     default:
 623         break;
 624     }
 625
 626     return rc;
 627 }
 628
/*
 * Quick-n-dirty host extraction from a URL.  Operations performed
 * (extract() drives them):
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy up to SQUIDHOSTNAMELEN bytes from there to Host[]
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@' and drop it
 *
 * The result lives in the static Host[] buffer, which is shared by all
 * instances, so each extraction overwrites the previous result.
 */

class URLHostName
{

public:
    /* Returns the host part of url in Host[], or NULL if url has no ':' */
    char * extract(char const *url);

private:
    /* shared result buffer handed back to callers of extract() */
    static char Host [SQUIDHOSTNAMELEN];
    /* resets Host[] and records the URL to work on */
    void init(char const *);
    /* points hostStart past the first ':' and any '/' following it */
    void findHostStart();
    /* cuts Host[] at the first '/' and at the first ':' */
    void trimTrailingChars();
    /* drops everything up to and including the last '@' in Host[] */
    void trimAuth();
    /* start of the host inside url; NULL when url has no ':' */
    char const *hostStart;
    /* the URL being examined */
    char const *url;
};
 653
 654 char *
 655 urlHostname(const char *url)
 656 {
 657     return URLHostName().extract(url);
 658 }
 659
/* Storage for the static result buffer that URLHostName::extract() returns. */
char URLHostName::Host[SQUIDHOSTNAMELEN];
 661
 662 void
 663 URLHostName::init(char const *aUrl)
 664 {
 665     Host[0] = '\0';
 666     url = url;
 667 }
 668
 669 void
 670 URLHostName::findHostStart()
 671 {
 672     if (NULL == (hostStart = strchr(url, ':')))
 673         return;
 674
 675     ++hostStart;
 676
 677     while (*hostStart != '\0' && *hostStart == '/')
 678         ++hostStart;
 679 }
 680
 681 void
 682 URLHostName::trimTrailingChars()
 683 {
 684     char *t;
 685
 686     if ((t = strchr(Host, '/')))
 687         *t = '\0';
 688
 689     if ((t = strchr(Host, ':')))
 690         *t = '\0';
 691 }
 692
 693 void
 694 URLHostName::trimAuth()
 695 {
 696     char *t;
 697
 698     if ((t = strrchr(Host, '@'))) {
 699         t++;
 700         xmemmove(Host, t, strlen(t) + 1);
 701     }
 702 }
 703
 704 char *
 705 URLHostName::extract(char const *aUrl)
 706 {
 707     init(aUrl);
 708     findHostStart();
 709
 710     if (hostStart == NULL)
 711         return NULL;
 712
 713     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 714
 715     trimTrailingChars();
 716
 717     trimAuth();
 718
 719     return Host;
 720 }
 721
 722 URL::URL() : scheme()
 723 {}
 724
 725 URL::URL(URLScheme const &aScheme): scheme(aScheme)
 726 {}