src/url.cc

   1
   2 /*
   3  * $Id: url.cc,v 1.157 2007/04/28 22:26:38 hno Exp $
   4  *
   5  * DEBUG: section 23    URL Parsing
   6  * AUTHOR: Duane Wessels
   7  *
   8  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   9  * ----------------------------------------------------------
  10  *
  11  *  Squid is the result of efforts by numerous individuals from
  12  *  the Internet community; see the CONTRIBUTORS file for full
  13  *  details.   Many organizations have provided support for Squid's
  14  *  development; see the SPONSORS file for full details.  Squid is
  15  *  Copyrighted (C) 2001 by the Regents of the University of
  16  *  California; see the COPYRIGHT file for full details.  Squid
  17  *  incorporates software developed and/or copyrighted by other
  18  *  sources; see the CREDITS file for full details.
  19  *
  20  *  This program is free software; you can redistribute it and/or modify
  21  *  it under the terms of the GNU General Public License as published by
  22  *  the Free Software Foundation; either version 2 of the License, or
  23  *  (at your option) any later version.
  24  *
  25  *  This program is distributed in the hope that it will be useful,
  26  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  *  GNU General Public License for more details.
  29  *
  30  *  You should have received a copy of the GNU General Public License
  31  *  along with this program; if not, write to the Free Software
  32  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  33  *
  34  */
  35
#include "URL.h"
#include "HttpRequest.h"
#include "URLScheme.h"

#include <ctype.h>
  39
/* Forward declaration: urlParse() hands "urn:" strings to this helper. */
static HttpRequest *urnParse(method_t method, char *urn);
/*
 * Characters accepted in a hostname by urlParse() when
 * Config.onoff.allow_underscore is set: letters, digits, '-', '.' and '_'.
 */
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    ;
/*
 * Characters accepted in a hostname by urlParse() when underscores are
 * not allowed: letters, digits, '-' and '.'.
 */
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    ;
  51
  52 /* convert %xx in url string to a character
  53  * Allocate a new string and return a pointer to converted string */
  54
  55 char *
  56 url_convert_hex(char *org_url, int allocate)
  57 {
  58     static char code[] = "00";
  59     char *url = NULL;
  60     char *s = NULL;
  61     char *t = NULL;
  62     url = allocate ? (char *) xstrdup(org_url) : org_url;
  63
  64     if ((int) strlen(url) < 3 || !strchr(url, '%'))
  65         return url;
  66
  67     for (s = t = url; *s; s++) {
  68         if (*s == '%' && *(s + 1) && *(s + 2)) {
  69             code[0] = *(++s);
  70             code[1] = *(++s);
  71             *t++ = (char) strtol(code, NULL, 16);
  72         } else {
  73             *t++ = *s;
  74         }
  75     }
  76
  77     do {
  78         *t++ = *s;
  79     } while (*s++);
  80
  81     return url;
  82 }
  83
  84 void
  85 urlInitialize(void)
  86 {
  87     debugs(23, 5, "urlInitialize: Initializing...");
  88     /* this ensures that the number of protocol strings is the same as
  89      * the enum slots allocated because the last enum is always 'TOTAL'.
  90      */
  91     assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
  92     /*
  93      * These test that our matchDomainName() function works the
  94      * way we expect it to.
  95      */
  96     assert(0 == matchDomainName("foo.com", "foo.com"));
  97     assert(0 == matchDomainName(".foo.com", "foo.com"));
  98     assert(0 == matchDomainName("foo.com", ".foo.com"));
  99     assert(0 == matchDomainName(".foo.com", ".foo.com"));
 100     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
 101     assert(0 != matchDomainName("x.foo.com", "foo.com"));
 102     assert(0 != matchDomainName("foo.com", "x.foo.com"));
 103     assert(0 != matchDomainName("bar.com", "foo.com"));
 104     assert(0 != matchDomainName(".bar.com", "foo.com"));
 105     assert(0 != matchDomainName(".bar.com", ".foo.com"));
 106     assert(0 != matchDomainName("bar.com", ".foo.com"));
 107     assert(0 < matchDomainName("zzz.com", "foo.com"));
 108     assert(0 > matchDomainName("aaa.com", "foo.com"));
 109     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 110     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 111     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 112     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 113     /* more cases? */
 114 }
 115
 116 /*
 117  * urlParseProtocol() takes begin (b) and end (e) pointers, but for
 118  * backwards compatibility, e defaults to NULL, in which case we
 119  * assume b is NULL-terminated.
 120  */
 121 protocol_t
 122 urlParseProtocol(const char *b, const char *e)
 123 {
 124     /*
 125      * if e is NULL, b must be NULL terminated and we
 126      * make e point to the first whitespace character
 127      * after b.
 128      */
 129
 130     if (NULL == e)
 131         e = b + strcspn(b, ":");
 132
 133     int len = e - b;
 134
 135     /* test common stuff first */
 136
 137     if (strncasecmp(b, "http", len) == 0)
 138         return PROTO_HTTP;
 139
 140     if (strncasecmp(b, "ftp", len) == 0)
 141         return PROTO_FTP;
 142
 143     if (strncasecmp(b, "https", len) == 0)
 144         return PROTO_HTTPS;
 145
 146     if (strncasecmp(b, "file", len) == 0)
 147         return PROTO_FTP;
 148
 149     if (strncasecmp(b, "gopher", len) == 0)
 150         return PROTO_GOPHER;
 151
 152     if (strncasecmp(b, "wais", len) == 0)
 153         return PROTO_WAIS;
 154
 155     if (strncasecmp(b, "cache_object", len) == 0)
 156         return PROTO_CACHEOBJ;
 157
 158     if (strncasecmp(b, "urn", len) == 0)
 159         return PROTO_URN;
 160
 161     if (strncasecmp(b, "whois", len) == 0)
 162         return PROTO_WHOIS;
 163
 164     if (strncasecmp(b, "internal", len) == 0)
 165         return PROTO_INTERNAL;
 166
 167     return PROTO_NONE;
 168 }
 169
 170 int
 171 urlDefaultPort(protocol_t p)
 172 {
 173     switch (p) {
 174
 175     case PROTO_HTTP:
 176         return 80;
 177
 178     case PROTO_HTTPS:
 179         return 443;
 180
 181     case PROTO_FTP:
 182         return 21;
 183
 184     case PROTO_GOPHER:
 185         return 70;
 186
 187     case PROTO_WAIS:
 188         return 210;
 189
 190     case PROTO_CACHEOBJ:
 191
 192     case PROTO_INTERNAL:
 193         return CACHE_HTTP_PORT;
 194
 195     case PROTO_WHOIS:
 196         return 43;
 197
 198     default:
 199         return 0;
 200     }
 201 }
 202
 203 /*
 204  * Parse a URI/URL.
 205  *
 206  * If the 'request' arg is non-NULL, put parsed values there instead
 207  * of allocating a new HttpRequest.
 208  *
 209  * This abuses HttpRequest as a way of representing the parsed url
 210  * and its components.
 211  * method is used to switch parsers and to init the HttpRequest.
 212  * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
 213  * looked for.
 214  * The url is non const so that if its too long we can NULL-terminate it in place.
 215  */
 216 HttpRequest *
 217 urlParse(method_t method, char *url, HttpRequest *request)
 218 {
 219     LOCAL_ARRAY(char, proto, MAX_URL);
 220     LOCAL_ARRAY(char, login, MAX_URL);
 221     LOCAL_ARRAY(char, host, MAX_URL);
 222     LOCAL_ARRAY(char, urlpath, MAX_URL);
 223     char *t = NULL;
 224     char *q = NULL;
 225     int port;
 226     protocol_t protocol = PROTO_NONE;
 227     int l;
 228     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 229
 230     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 231         /* terminate so it doesn't overflow other buffers */
 232         *(url + (MAX_URL >> 1)) = '\0';
 233         debugs(23, 1, "urlParse: URL too large (" << l << " bytes)");
 234         return NULL;
 235     }
 236
 237     if (method == METHOD_CONNECT) {
 238         port = CONNECT_PORT;
 239
 240         if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 241             return NULL;
 242     } else if (!strncmp(url, "urn:", 4)) {
 243         return urnParse(method, url);
 244     } else {
 245         if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
 246             return NULL;
 247
 248         protocol = urlParseProtocol(proto);
 249
 250         port = urlDefaultPort(protocol);
 251
 252         /* Is there any login informaiton? */
 253         if ((t = strrchr(host, '@'))) {
 254             strcpy((char *) login, (char *) host);
 255             t = strrchr(login, '@');
 256             *t = 0;
 257             strcpy((char *) host, t + 1);
 258         }
 259
 260         if ((t = strrchr(host, ':'))) {
 261             *t++ = '\0';
 262
 263             if (*t != '\0')
 264                 port = atoi(t);
 265         }
 266     }
 267
 268     for (t = host; *t; t++)
 269         *t = xtolower(*t);
 270
 271     if (stringHasWhitespace(host)) {
 272         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 273             t = q = host;
 274
 275             while (*t) {
 276                 if (!xisspace(*t))
 277                     *q++ = *t;
 278
 279                 t++;
 280             }
 281
 282             *q = '\0';
 283         }
 284     }
 285
 286     if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
 287         debugs(23, 1, "urlParse: Illegal character in hostname '" << host << "'");
 288         return NULL;
 289     }
 290
 291 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
 292     /* remove trailing dots from hostnames */
 293     while ((l = strlen(host)) > 0 && host[--l] == '.')
 294         host[l] = '\0';
 295
 296     /* remove duplicate dots */
 297     while ((t = strstr(host, "..")))
 298         xmemmove(t, t + 1, strlen(t));
 299
 300 #endif
 301
 302     if (Config.appendDomain && !strchr(host, '.'))
 303         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
 304
 305     if (port < 1 || port > 65535) {
 306         debugs(23, 3, "urlParse: Invalid port '" << port << "'");
 307         return NULL;
 308     }
 309
 310 #ifdef HARDCODE_DENY_PORTS
 311     /* These ports are filtered in the default squid.conf, but
 312      * maybe someone wants them hardcoded... */
 313     if (port == 7 || port == 9 || port == 19) {
 314         debugs(23, 0, "urlParse: Deny access to port " << port);
 315         return NULL;
 316     }
 317
 318 #endif
 319     if (stringHasWhitespace(urlpath)) {
 320         debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
 321
 322         switch (Config.uri_whitespace) {
 323
 324         case URI_WHITESPACE_DENY:
 325             return NULL;
 326
 327         case URI_WHITESPACE_ALLOW:
 328             break;
 329
 330         case URI_WHITESPACE_ENCODE:
 331             t = rfc1738_escape_unescaped(urlpath);
 332             xstrncpy(urlpath, t, MAX_URL);
 333             break;
 334
 335         case URI_WHITESPACE_CHOP:
 336             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 337             break;
 338
 339         case URI_WHITESPACE_STRIP:
 340
 341         default:
 342             t = q = urlpath;
 343
 344             while (*t) {
 345                 if (!xisspace(*t))
 346                     *q++ = *t;
 347
 348                 t++;
 349             }
 350
 351             *q = '\0';
 352         }
 353     }
 354
 355     if (NULL == request)
 356         request = new HttpRequest(method, protocol, urlpath);
 357     else {
 358         request->initHTTP(method, protocol, urlpath);
 359     }
 360
 361     xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
 362     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 363     request->port = (u_short) port;
 364     return request;
 365 }
 366
 367 static HttpRequest *
 368 urnParse(method_t method, char *urn)
 369 {
 370     debugs(50, 5, "urnParse: " << urn);
 371     return new HttpRequest(method, PROTO_URN, urn + 4);
 372 }
 373
 374 const char *
 375 urlCanonical(HttpRequest * request)
 376 {
 377     LOCAL_ARRAY(char, portbuf, 32);
 378     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 379
 380     if (request->canonical)
 381         return request->canonical;
 382
 383     if (request->protocol == PROTO_URN) {
 384         snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
 385     } else {
 386         switch (request->method) {
 387
 388         case METHOD_CONNECT:
 389             snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
 390             break;
 391
 392         default:
 393             portbuf[0] = '\0';
 394
 395             if (request->port != urlDefaultPort(request->protocol))
 396                 snprintf(portbuf, 32, ":%d", request->port);
 397
 398             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
 399                      ProtocolStr[request->protocol],
 400                      request->login,
 401                      *request->login ? "@" : null_string,
 402                      request->host,
 403                      portbuf,
 404                      request->urlpath.buf());
 405
 406             break;
 407         }
 408     }
 409
 410     return (request->canonical = xstrdup(urlbuf));
 411 }
 412
 413 char *
 414 urlCanonicalClean(const HttpRequest * request)
 415 {
 416     LOCAL_ARRAY(char, buf, MAX_URL);
 417     LOCAL_ARRAY(char, portbuf, 32);
 418     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 419     char *t;
 420
 421     if (request->protocol == PROTO_URN) {
 422         snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
 423     } else {
 424         switch (request->method) {
 425
 426         case METHOD_CONNECT:
 427             snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
 428             break;
 429
 430         default:
 431             portbuf[0] = '\0';
 432
 433             if (request->port != urlDefaultPort(request->protocol))
 434                 snprintf(portbuf, 32, ":%d", request->port);
 435
 436             loginbuf[0] = '\0';
 437
 438             if ((int) strlen(request->login) > 0) {
 439                 strcpy(loginbuf, request->login);
 440
 441                 if ((t = strchr(loginbuf, ':')))
 442                     *t = '\0';
 443
 444                 strcat(loginbuf, "@");
 445             }
 446
 447             snprintf(buf, MAX_URL, "%s://%s%s%s%s",
 448                      ProtocolStr[request->protocol],
 449                      loginbuf,
 450                      request->host,
 451                      portbuf,
 452                      request->urlpath.buf());
 453             /*
 454              * strip arguments AFTER a question-mark
 455              */
 456
 457             if (Config.onoff.strip_query_terms)
 458                 if ((t = strchr(buf, '?')))
 459                     *(++t) = '\0';
 460
 461             break;
 462         }
 463     }
 464
 465     if (stringHasCntl(buf))
 466         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 467
 468     return buf;
 469 }
 470
 471 /*
 472  * matchDomainName() compares a hostname with a domainname according
 473  * to the following rules:
 474  *
 475  *    HOST          DOMAIN        MATCH?
 476  * ------------- -------------    ------
 477  *    foo.com       foo.com         YES
 478  *   .foo.com       foo.com         YES
 479  *  x.foo.com       foo.com          NO
 480  *    foo.com      .foo.com         YES
 481  *   .foo.com      .foo.com         YES
 482  *  x.foo.com      .foo.com         YES
 483  *
 484  *  We strip leading dots on hosts (but not domains!) so that
 485  *  ".foo.com" is is always the same as "foo.com".
 486  *
 487  *  Return values:
 488  *     0 means the host matches the domain
 489  *     1 means the host is greater than the domain
 490  *    -1 means the host is less than the domain
 491  */
 492
 493 int
 494 matchDomainName(const char *h, const char *d)
 495 {
 496     int dl;
 497     int hl;
 498
 499     while ('.' == *h)
 500         h++;
 501
 502     hl = strlen(h);
 503
 504     dl = strlen(d);
 505
 506     /*
 507      * Start at the ends of the two strings and work towards the
 508      * beginning.
 509      */
 510     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 511         if (hl == 0 && dl == 0) {
 512             /*
 513              * We made it all the way to the beginning of both
 514              * strings without finding any difference.
 515              */
 516             return 0;
 517         }
 518
 519         if (0 == hl) {
 520             /*
 521              * The host string is shorter than the domain string.
 522              * There is only one case when this can be a match.
 523              * If the domain is just one character longer, and if
 524              * that character is a leading '.' then we call it a
 525              * match.
 526              */
 527
 528             if (1 == dl && '.' == d[0])
 529                 return 0;
 530             else
 531                 return -1;
 532         }
 533
 534         if (0 == dl) {
 535             /*
 536              * The domain string is shorter than the host string.
 537              * This is a match only if the first domain character
 538              * is a leading '.'.
 539              */
 540
 541             if ('.' == d[0])
 542                 return 0;
 543             else
 544                 return 1;
 545         }
 546     }
 547
 548     /*
 549      * We found different characters in the same position (from the end).
 550      */
 551     /*
 552      * If one of those character is '.' then its special.  In order
 553      * for splay tree sorting to work properly, "x-foo.com" must
 554      * be greater than ".foo.com" even though '-' is less than '.'.
 555      */
 556     if ('.' == d[dl])
 557         return 1;
 558
 559     if ('.' == h[hl])
 560         return -1;
 561
 562     return (xtolower(h[hl]) - xtolower(d[dl]));
 563 }
 564
 565
 566 /*
 567  * return true if we can serve requests for this method.
 568  */
 569 int
 570 urlCheckRequest(const HttpRequest * r)
 571 {
 572     int rc = 0;
 573     /* protocol "independent" methods
 574      *
 575      * actually these methods are specific to HTTP:
 576      * they are methods we recieve on our HTTP port,
 577      * and if we had a FTP listener would not be relevant
 578      * there.
 579      *
 580      * So, we should delegate them to HTTP. The problem is that we
 581      * do not have a default protocol from the client side of HTTP.
 582      */
 583
 584     if (r->method == METHOD_CONNECT)
 585         return 1;
 586
 587     if (r->method == METHOD_TRACE)
 588         return 1;
 589
 590     if (r->method == METHOD_PURGE)
 591         return 1;
 592
 593     /* does method match the protocol? */
 594     switch (r->protocol) {
 595
 596     case PROTO_URN:
 597
 598     case PROTO_HTTP:
 599
 600     case PROTO_CACHEOBJ:
 601         rc = 1;
 602         break;
 603
 604     case PROTO_FTP:
 605
 606         if (r->method == METHOD_PUT)
 607             rc = 1;
 608
 609     case PROTO_GOPHER:
 610
 611     case PROTO_WAIS:
 612
 613     case PROTO_WHOIS:
 614         if (r->method == METHOD_GET)
 615             rc = 1;
 616         else if (r->method == METHOD_HEAD)
 617             rc = 1;
 618
 619         break;
 620
 621     case PROTO_HTTPS:
 622 #ifdef USE_SSL
 623
 624         rc = 1;
 625
 626         break;
 627
 628 #else
 629         /*
 630         * Squid can't originate an SSL connection, so it should
 631         * never receive an "https:" URL.  It should always be
 632         * CONNECT instead.
 633         */
 634         rc = 0;
 635
 636 #endif
 637
 638     default:
 639         break;
 640     }
 641
 642     return rc;
 643 }
 644
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy what follows into Host[] (bounded by SQUIDHOSTNAMELEN)
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@'
 *
 * The result lives in a single static buffer shared by every instance,
 * so the pointer returned by extract() is overwritten by the next call.
 */

class URLHostName
{

public:
    /* Returns the host part of url in Host[], or NULL if url has no ':'. */
    char * extract(char const *url);

private:
    /* shared result buffer; see the note above about its lifetime */
    static char Host [SQUIDHOSTNAMELEN];
    /* reset Host[] and remember the URL being parsed */
    void init(char const *);
    /* point hostStart just past the first ':' and any '/' after it */
    void findHostStart();
    /* cut Host[] at the first '/' and at the first ':' */
    void trimTrailingChars();
    /* drop everything up to and including the last '@' in Host[] */
    void trimAuth();
    /* where the host begins inside url; NULL when url has no ':' */
    char const *hostStart;
    /* the URL currently being parsed */
    char const *url;
};
 669
 670 char *
 671 urlHostname(const char *url)
 672 {
 673     return URLHostName().extract(url);
 674 }
 675
 676 char URLHostName::Host[SQUIDHOSTNAMELEN];
 677
 678 void
 679 URLHostName::init(char const *aUrl)
 680 {
 681     Host[0] = '\0';
 682     url = url;
 683 }
 684
 685 void
 686 URLHostName::findHostStart()
 687 {
 688     if (NULL == (hostStart = strchr(url, ':')))
 689         return;
 690
 691     ++hostStart;
 692
 693     while (*hostStart != '\0' && *hostStart == '/')
 694         ++hostStart;
 695 }
 696
 697 void
 698 URLHostName::trimTrailingChars()
 699 {
 700     char *t;
 701
 702     if ((t = strchr(Host, '/')))
 703         *t = '\0';
 704
 705     if ((t = strchr(Host, ':')))
 706         *t = '\0';
 707 }
 708
 709 void
 710 URLHostName::trimAuth()
 711 {
 712     char *t;
 713
 714     if ((t = strrchr(Host, '@'))) {
 715         t++;
 716         xmemmove(Host, t, strlen(t) + 1);
 717     }
 718 }
 719
 720 char *
 721 URLHostName::extract(char const *aUrl)
 722 {
 723     init(aUrl);
 724     findHostStart();
 725
 726     if (hostStart == NULL)
 727         return NULL;
 728
 729     xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
 730
 731     trimTrailingChars();
 732
 733     trimAuth();
 734
 735     return Host;
 736 }
 737
/* Default constructor: the scheme member is default-constructed. */
URL::URL() : scheme()
{}

/* Construct a URL whose scheme member is a copy of aScheme. */
URL::URL(URLScheme const &aScheme): scheme(aScheme)
{}