src/url.cc

   1
   2 /*
   3  * $Id: url.cc,v 1.135 2002/08/19 22:47:54 hno Exp $
   4  *
   5  * DEBUG: section 23    URL Parsing
   6  * AUTHOR: Duane Wessels
   7  *
   8  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   9  * ----------------------------------------------------------
  10  *
  11  *  Squid is the result of efforts by numerous individuals from
  12  *  the Internet community; see the CONTRIBUTORS file for full
  13  *  details.   Many organizations have provided support for Squid's
  14  *  development; see the SPONSORS file for full details.  Squid is
  15  *  Copyrighted (C) 2001 by the Regents of the University of
  16  *  California; see the COPYRIGHT file for full details.  Squid
  17  *  incorporates software developed and/or copyrighted by other
  18  *  sources; see the CREDITS file for full details.
  19  *
  20  *  This program is free software; you can redistribute it and/or modify
  21  *  it under the terms of the GNU General Public License as published by
  22  *  the Free Software Foundation; either version 2 of the License, or
  23  *  (at your option) any later version.
  24  *
  25  *  This program is distributed in the hope that it will be useful,
  26  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  *  GNU General Public License for more details.
  29  *
  30  *  You should have received a copy of the GNU General Public License
  31  *  along with this program; if not, write to the Free Software
  32  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  33  *
  34  */
  35
  36 #include "squid.h"
  37
  38 const char *RequestMethodStr[] =
  39 {
  40     "NONE",
  41     "GET",
  42     "POST",
  43     "PUT",
  44     "HEAD",
  45     "CONNECT",
  46     "TRACE",
  47     "PURGE",
  48     "OPTIONS",
  49     "DELETE",
  50     "PROPFIND",
  51     "PROPPATCH",
  52     "MKCOL",
  53     "COPY",
  54     "MOVE",
  55     "LOCK",
  56     "UNLOCK",
  57     "BMOVE",
  58     "BDELETE",
  59     "BPROPFIND",
  60     "BPROPPATCH",
  61     "BCOPY",
  62     "SEARCH",
  63     "SUBSCRIBE",
  64     "UNSUBSCRIBE",
  65     "POLL",
  66     "%EXT00",
  67     "%EXT01",
  68     "%EXT02",
  69     "%EXT03",
  70     "%EXT04",
  71     "%EXT05",
  72     "%EXT06",
  73     "%EXT07",
  74     "%EXT08",
  75     "%EXT09",
  76     "%EXT10",
  77     "%EXT11",
  78     "%EXT12",
  79     "%EXT13",
  80     "%EXT14",
  81     "%EXT15",
  82     "%EXT16",
  83     "%EXT17",
  84     "%EXT18",
  85     "%EXT19",
  86     "ERROR"
  87 };
  88
  89 const char *ProtocolStr[] =
  90 {
  91     "NONE",
  92     "http",
  93     "ftp",
  94     "gopher",
  95     "wais",
  96     "cache_object",
  97     "icp",
  98 #if USE_HTCP
  99     "htcp",
 100 #endif
 101     "urn",
 102     "whois",
 103     "internal",
 104     "https",
 105     "TOTAL"
 106 };
 107
 108 static request_t *urnParse(method_t method, char *urn);
 109 static const char *const valid_hostname_chars =
 110 #if ALLOW_HOSTNAME_UNDERSCORES
 111 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 112 "abcdefghijklmnopqrstuvwxyz"
 113 "0123456789-._";
 114 #else
 115 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 116 "abcdefghijklmnopqrstuvwxyz"
 117 "0123456789-.";
 118 #endif
 119
 120 /* convert %xx in url string to a character
 121  * Allocate a new string and return a pointer to converted string */
 122
 123 char *
 124 url_convert_hex(char *org_url, int allocate)
 125 {
 126     static char code[] = "00";
 127     char *url = NULL;
 128     char *s = NULL;
 129     char *t = NULL;
 130     url = allocate ? (char *) xstrdup(org_url) : org_url;
 131     if ((int) strlen(url) < 3 || !strchr(url, '%'))
 132         return url;
 133     for (s = t = url; *s; s++) {
 134         if (*s == '%' && *(s + 1) && *(s + 2)) {
 135             code[0] = *(++s);
 136             code[1] = *(++s);
 137             *t++ = (char) strtol(code, NULL, 16);
 138         } else {
 139             *t++ = *s;
 140         }
 141     }
 142     do {
 143         *t++ = *s;
 144     } while (*s++);
 145     return url;
 146 }
 147
 148 void
 149 urlInitialize(void)
 150 {
 151     debug(23, 5) ("urlInitialize: Initializing...\n");
 152     assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
 153     memset(&null_request_flags, '\0', sizeof(null_request_flags));
 154     /*
 155      * These test that our matchDomainName() function works the
 156      * way we expect it to.
 157      */
 158     assert(0 == matchDomainName("foo.com", "foo.com"));
 159     assert(0 == matchDomainName(".foo.com", "foo.com"));
 160     assert(0 == matchDomainName("foo.com", ".foo.com"));
 161     assert(0 == matchDomainName(".foo.com", ".foo.com"));
 162     assert(0 == matchDomainName("x.foo.com", ".foo.com"));
 163     assert(0 != matchDomainName("x.foo.com", "foo.com"));
 164     assert(0 != matchDomainName("foo.com", "x.foo.com"));
 165     assert(0 != matchDomainName("bar.com", "foo.com"));
 166     assert(0 != matchDomainName(".bar.com", "foo.com"));
 167     assert(0 != matchDomainName(".bar.com", ".foo.com"));
 168     assert(0 != matchDomainName("bar.com", ".foo.com"));
 169     assert(0 < matchDomainName("zzz.com", "foo.com"));
 170     assert(0 > matchDomainName("aaa.com", "foo.com"));
 171     assert(0 == matchDomainName("FOO.com", "foo.COM"));
 172     assert(0 < matchDomainName("bfoo.com", "afoo.com"));
 173     assert(0 > matchDomainName("afoo.com", "bfoo.com"));
 174     assert(0 < matchDomainName("x-foo.com", ".foo.com"));
 175     /* more cases? */
 176 }
 177
 178 method_t
 179 urlParseMethod(const char *s)
 180 {
 181     method_t method = METHOD_NONE;
 182     /*
 183      * This check for '%' makes sure that we don't
 184      * match one of the extension method placeholders,
 185      * which have the form %EXT[0-9][0-9]
 186      */
 187     if (*s == '%')
 188         return METHOD_NONE;
 189     for (method++; method < METHOD_ENUM_END; method++) {
 190         if (0 == strcasecmp(s, RequestMethodStr[method]))
 191             return method;
 192     }
 193     return METHOD_NONE;
 194 }
 195
 196
 197 protocol_t
 198 urlParseProtocol(const char *s)
 199 {
 200     /* test common stuff first */
 201     if (strcasecmp(s, "http") == 0)
 202         return PROTO_HTTP;
 203     if (strcasecmp(s, "ftp") == 0)
 204         return PROTO_FTP;
 205     if (strcasecmp(s, "https") == 0)
 206         return PROTO_HTTPS;
 207     if (strcasecmp(s, "file") == 0)
 208         return PROTO_FTP;
 209     if (strcasecmp(s, "gopher") == 0)
 210         return PROTO_GOPHER;
 211     if (strcasecmp(s, "wais") == 0)
 212         return PROTO_WAIS;
 213     if (strcasecmp(s, "cache_object") == 0)
 214         return PROTO_CACHEOBJ;
 215     if (strcasecmp(s, "urn") == 0)
 216         return PROTO_URN;
 217     if (strcasecmp(s, "whois") == 0)
 218         return PROTO_WHOIS;
 219     if (strcasecmp(s, "internal") == 0)
 220         return PROTO_INTERNAL;
 221     return PROTO_NONE;
 222 }
 223
 224
 225 int
 226 urlDefaultPort(protocol_t p)
 227 {
 228     switch (p) {
 229     case PROTO_HTTP:
 230         return 80;
 231     case PROTO_HTTPS:
 232         return 443;
 233     case PROTO_FTP:
 234         return 21;
 235     case PROTO_GOPHER:
 236         return 70;
 237     case PROTO_WAIS:
 238         return 210;
 239     case PROTO_CACHEOBJ:
 240     case PROTO_INTERNAL:
 241         return CACHE_HTTP_PORT;
 242     case PROTO_WHOIS:
 243         return 43;
 244     default:
 245         return 0;
 246     }
 247 }
 248
 249 request_t *
 250 urlParse(method_t method, char *url)
 251 {
 252     LOCAL_ARRAY(char, proto, MAX_URL);
 253     LOCAL_ARRAY(char, login, MAX_URL);
 254     LOCAL_ARRAY(char, host, MAX_URL);
 255     LOCAL_ARRAY(char, urlpath, MAX_URL);
 256     request_t *request = NULL;
 257     char *t = NULL;
 258     char *q = NULL;
 259     int port;
 260     protocol_t protocol = PROTO_NONE;
 261     int l;
 262     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 263
 264     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 265         /* terminate so it doesn't overflow other buffers */
 266         *(url + (MAX_URL >> 1)) = '\0';
 267         debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
 268         return NULL;
 269     }
 270     if (method == METHOD_CONNECT) {
 271         port = CONNECT_PORT;
 272         if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 273             return NULL;
 274     } else if (!strncmp(url, "urn:", 4)) {
 275         return urnParse(method, url);
 276     } else {
 277         if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
 278             return NULL;
 279         protocol = urlParseProtocol(proto);
 280         port = urlDefaultPort(protocol);
 281         /* Is there any login informaiton? */
 282         if ((t = strrchr(host, '@'))) {
 283             strcpy((char *) login, (char *) host);
 284             t = strrchr(login, '@');
 285             *t = 0;
 286             strcpy((char *) host, t + 1);
 287         }
 288         if ((t = strrchr(host, ':'))) {
 289             *t++ = '\0';
 290             if (*t != '\0')
 291                 port = atoi(t);
 292         }
 293     }
 294     for (t = host; *t; t++)
 295         *t = xtolower(*t);
 296     if (stringHasWhitespace(host)) {
 297         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 298             t = q = host;
 299             while (*t) {
 300                 if (!xisspace(*t))
 301                     *q++ = *t;
 302                 t++;
 303             }
 304             *q = '\0';
 305         }
 306     }
 307     if (strspn(host, valid_hostname_chars) != strlen(host)) {
 308         debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
 309         return NULL;
 310     }
 311     /* remove trailing dots from hostnames */
 312     while ((l = strlen(host)) > 0 && host[--l] == '.')
 313         host[l] = '\0';
 314     /* remove duplicate dots */
 315     while ((t = strstr(host, "..")))
 316         xmemmove(t, t + 1, strlen(t));
 317     if (Config.appendDomain && !strchr(host, '.'))
 318         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN);
 319     if (port == 0) {
 320         debug(23, 3) ("urlParse: Invalid port == 0\n");
 321         return NULL;
 322     }
 323 #ifdef HARDCODE_DENY_PORTS
 324     /* These ports are filtered in the default squid.conf, but
 325      * maybe someone wants them hardcoded... */
 326     if (port == 7 || port == 9 || port == 19) {
 327         debug(23, 0) ("urlParse: Deny access to port %d\n", port);
 328         return NULL;
 329     }
 330 #endif
 331     if (stringHasWhitespace(urlpath)) {
 332         debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
 333         switch (Config.uri_whitespace) {
 334         case URI_WHITESPACE_DENY:
 335             return NULL;
 336         case URI_WHITESPACE_ALLOW:
 337             break;
 338         case URI_WHITESPACE_ENCODE:
 339             t = rfc1738_escape_unescaped(urlpath);
 340             xstrncpy(urlpath, t, MAX_URL);
 341             break;
 342         case URI_WHITESPACE_CHOP:
 343             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 344             break;
 345         case URI_WHITESPACE_STRIP:
 346         default:
 347             t = q = urlpath;
 348             while (*t) {
 349                 if (!xisspace(*t))
 350                     *q++ = *t;
 351                 t++;
 352             }
 353             *q = '\0';
 354         }
 355     }
 356     request = requestCreate(method, protocol, urlpath);
 357     xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
 358     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 359     request->port = (u_short) port;
 360     return request;
 361 }
 362
 363 static request_t *
 364 urnParse(method_t method, char *urn)
 365 {
 366     debug(50, 5) ("urnParse: %s\n", urn);
 367     return requestCreate(method, PROTO_URN, urn + 4);
 368 }
 369
 370 const char *
 371 urlCanonical(request_t * request)
 372 {
 373     LOCAL_ARRAY(char, portbuf, 32);
 374     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 375     if (request->canonical)
 376         return request->canonical;
 377     if (request->protocol == PROTO_URN) {
 378         snprintf(urlbuf, MAX_URL, "urn:%s", strBuf(request->urlpath));
 379     } else {
 380         switch (request->method) {
 381         case METHOD_CONNECT:
 382             snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
 383             break;
 384         default:
 385             portbuf[0] = '\0';
 386             if (request->port != urlDefaultPort(request->protocol))
 387                 snprintf(portbuf, 32, ":%d", request->port);
 388             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
 389                 ProtocolStr[request->protocol],
 390                 request->login,
 391                 *request->login ? "@" : null_string,
 392                 request->host,
 393                 portbuf,
 394                 strBuf(request->urlpath));
 395             break;
 396         }
 397     }
 398     return (request->canonical = xstrdup(urlbuf));
 399 }
 400
 401 char *
 402 urlCanonicalClean(const request_t * request)
 403 {
 404     LOCAL_ARRAY(char, buf, MAX_URL);
 405     LOCAL_ARRAY(char, portbuf, 32);
 406     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 407     char *t;
 408     if (request->protocol == PROTO_URN) {
 409         snprintf(buf, MAX_URL, "urn:%s", strBuf(request->urlpath));
 410     } else {
 411         switch (request->method) {
 412         case METHOD_CONNECT:
 413             snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
 414             break;
 415         default:
 416             portbuf[0] = '\0';
 417             if (request->port != urlDefaultPort(request->protocol))
 418                 snprintf(portbuf, 32, ":%d", request->port);
 419             loginbuf[0] = '\0';
 420             if ((int) strlen(request->login) > 0) {
 421                 strcpy(loginbuf, request->login);
 422                 if ((t = strchr(loginbuf, ':')))
 423                     *t = '\0';
 424                 strcat(loginbuf, "@");
 425             }
 426             snprintf(buf, MAX_URL, "%s://%s%s%s%s",
 427                 ProtocolStr[request->protocol],
 428                 loginbuf,
 429                 request->host,
 430                 portbuf,
 431                 strBuf(request->urlpath));
 432             /*
 433              * strip arguments AFTER a question-mark
 434              */
 435             if (Config.onoff.strip_query_terms)
 436                 if ((t = strchr(buf, '?')))
 437                     *(++t) = '\0';
 438             break;
 439         }
 440     }
 441     if (stringHasCntl(buf))
 442         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 443     return buf;
 444 }
 445
 446 /*
 447  * matchDomainName() compares a hostname with a domainname according
 448  * to the following rules:
 449  *
 450  *    HOST          DOMAIN        MATCH?
 451  * ------------- -------------    ------
 452  *    foo.com       foo.com         YES
 453  *   .foo.com       foo.com         YES
 454  *  x.foo.com       foo.com          NO
 455  *    foo.com      .foo.com         YES
 456  *   .foo.com      .foo.com         YES
 457  *  x.foo.com      .foo.com         YES
 458  *
 459  *  We strip leading dots on hosts (but not domains!) so that
 460  *  ".foo.com" is is always the same as "foo.com".
 461  *
 462  *  Return values:
 463  *     0 means the host matches the domain
 464  *     1 means the host is greater than the domain
 465  *    -1 means the host is less than the domain
 466  */
 467
 468 int
 469 matchDomainName(const char *h, const char *d)
 470 {
 471     int dl;
 472     int hl;
 473     while ('.' == *h)
 474         h++;
 475     hl = strlen(h);
 476     dl = strlen(d);
 477     /*
 478      * Start at the ends of the two strings and work towards the
 479      * beginning.
 480      */
 481     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 482         if (hl == 0 && dl == 0) {
 483             /*
 484              * We made it all the way to the beginning of both
 485              * strings without finding any difference.
 486              */
 487             return 0;
 488         }
 489         if (0 == hl) {
 490             /*
 491              * The host string is shorter than the domain string.
 492              * There is only one case when this can be a match.
 493              * If the domain is just one character longer, and if
 494              * that character is a leading '.' then we call it a
 495              * match.
 496              */
 497             if (1 == dl && '.' == d[0])
 498                 return 0;
 499             else
 500                 return -1;
 501         }
 502         if (0 == dl) {
 503             /*
 504              * The domain string is shorter than the host string.
 505              * This is a match only if the first domain character
 506              * is a leading '.'.
 507              */
 508             if ('.' == d[0])
 509                 return 0;
 510             else
 511                 return 1;
 512         }
 513     }
 514     /*
 515      * We found different characters in the same position (from the end).
 516      */
 517     /*
 518      * If one of those character is '.' then its special.  In order
 519      * for splay tree sorting to work properly, "x-foo.com" must
 520      * be greater than ".foo.com" even though '-' is less than '.'.
 521      */
 522     if ('.' == d[dl])
 523         return 1;
 524     if ('.' == h[hl])
 525         return -1;
 526     return (xtolower(h[hl]) - xtolower(d[dl]));
 527 }
 528
 529 int
 530 urlCheckRequest(const request_t * r)
 531 {
 532     int rc = 0;
 533     /* protocol "independent" methods */
 534     if (r->method == METHOD_CONNECT)
 535         return 1;
 536     if (r->method == METHOD_TRACE)
 537         return 1;
 538     if (r->method == METHOD_PURGE)
 539         return 1;
 540     /* does method match the protocol? */
 541     switch (r->protocol) {
 542     case PROTO_URN:
 543     case PROTO_HTTP:
 544     case PROTO_CACHEOBJ:
 545         rc = 1;
 546         break;
 547     case PROTO_FTP:
 548         if (r->method == METHOD_PUT)
 549             rc = 1;
 550     case PROTO_GOPHER:
 551     case PROTO_WAIS:
 552     case PROTO_WHOIS:
 553         if (r->method == METHOD_GET)
 554             rc = 1;
 555         else if (r->method == METHOD_HEAD)
 556             rc = 1;
 557         break;
 558     case PROTO_HTTPS:
 559 #ifdef USE_SSL
 560         rc = 1;
 561         break;
 562 #else
 563         /*
 564          * Squid can't originate an SSL connection, so it should
 565          * never receive an "https:" URL.  It should always be
 566          * CONNECT instead.
 567          */
 568         rc = 0;
 569 #endif
 570     default:
 571         break;
 572     }
 573     return rc;
 574 }
 575
 576 /*
 577  * Quick-n-dirty host extraction from a URL.  Steps:
 578  *      Look for a colon
 579  *      Skip any '/' after the colon
 580  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 581  *      Look for an ending '/' or ':' and terminate
 582  *      Look for login info preceeded by '@'
 583  */
 584 char *
 585 urlHostname(const char *url)
 586 {
 587     LOCAL_ARRAY(char, host, SQUIDHOSTNAMELEN);
 588     char *t;
 589     host[0] = '\0';
 590     if (NULL == (t = strchr(url, ':')))
 591         return NULL;
 592     t++;
 593     while (*t != '\0' && *t == '/')
 594         t++;
 595     xstrncpy(host, t, SQUIDHOSTNAMELEN);
 596     if ((t = strchr(host, '/')))
 597         *t = '\0';
 598     if ((t = strchr(host, ':')))
 599         *t = '\0';
 600     if ((t = strrchr(host, '@'))) {
 601         t++;
 602         xmemmove(host, t, strlen(t) + 1);
 603     }
 604     return host;
 605 }
 606
 607 static void
 608 urlExtMethodAdd(const char *mstr)
 609 {
 610     method_t method = 0;
 611     for (method++; method < METHOD_ENUM_END; method++) {
 612         if (0 == strcmp(mstr, RequestMethodStr[method])) {
 613             debug(23, 2) ("Extension method '%s' already exists\n", mstr);
 614             return;
 615         }
 616         if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
 617             continue;
 618         /* Don't free statically allocated "%EXTnn" string */
 619         RequestMethodStr[method] = xstrdup(mstr);
 620         debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
 621         return;
 622     }
 623     debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
 624 }
 625
 626 void
 627 urlExtMethodConfigure(void)
 628 {
 629     wordlist *w = Config.ext_methods;
 630     while (w) {
 631         char *s;
 632         for (s = w->key; *s; s++)
 633             *s = xtoupper(*s);
 634         urlExtMethodAdd(w->key);
 635         w = w->next;
 636     }
 637 }