src/url.cc

   1
   2 /*
   3  * $Id: url.cc,v 1.142 2003/01/23 00:37:29 robertc Exp $
   4  *
   5  * DEBUG: section 23    URL Parsing
   6  * AUTHOR: Duane Wessels
   7  *
   8  * SQUID Web Proxy Cache          http://www.squid-cache.org/
   9  * ----------------------------------------------------------
  10  *
  11  *  Squid is the result of efforts by numerous individuals from
  12  *  the Internet community; see the CONTRIBUTORS file for full
  13  *  details.   Many organizations have provided support for Squid's
  14  *  development; see the SPONSORS file for full details.  Squid is
  15  *  Copyrighted (C) 2001 by the Regents of the University of
  16  *  California; see the COPYRIGHT file for full details.  Squid
  17  *  incorporates software developed and/or copyrighted by other
  18  *  sources; see the CREDITS file for full details.
  19  *
  20  *  This program is free software; you can redistribute it and/or modify
  21  *  it under the terms of the GNU General Public License as published by
  22  *  the Free Software Foundation; either version 2 of the License, or
  23  *  (at your option) any later version.
  24  *
  25  *  This program is distributed in the hope that it will be useful,
  26  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  *  GNU General Public License for more details.
  29  *
  30  *  You should have received a copy of the GNU General Public License
  31  *  along with this program; if not, write to the Free Software
  32  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  33  *
  34  */
  35
  36 #include "squid.h"
  37 #include "HttpRequest.h"
  38
/*
 * Printable names of the request methods.  Other functions in this file
 * index the table with method_t values (see urlParseMethod() and
 * urlExtMethodAdd()).
 *
 * The "%EXTnn" entries are placeholders: urlExtMethodAdd() overwrites
 * them, in order, with extension method names taken from the
 * configuration.  urlParseMethod() refuses any input starting with '%'
 * so that a placeholder can never be matched by a request.
 *
 * NOTE(review): the entry order presumably mirrors the method_t enum,
 * with "ERROR" sitting at METHOD_ENUM_END -- confirm against the enum
 * declaration before adding or removing entries.
 */
const char *RequestMethodStr[] =
{
    "NONE",
    "GET",
    "POST",
    "PUT",
    "HEAD",
    "CONNECT",
    "TRACE",
    "PURGE",
    "OPTIONS",
    "DELETE",
    "PROPFIND",
    "PROPPATCH",
    "MKCOL",
    "COPY",
    "MOVE",
    "LOCK",
    "UNLOCK",
    "BMOVE",
    "BDELETE",
    "BPROPFIND",
    "BPROPPATCH",
    "BCOPY",
    "SEARCH",
    "SUBSCRIBE",
    "UNSUBSCRIBE",
    "POLL",
    "%EXT00",
    "%EXT01",
    "%EXT02",
    "%EXT03",
    "%EXT04",
    "%EXT05",
    "%EXT06",
    "%EXT07",
    "%EXT08",
    "%EXT09",
    "%EXT10",
    "%EXT11",
    "%EXT12",
    "%EXT13",
    "%EXT14",
    "%EXT15",
    "%EXT16",
    "%EXT17",
    "%EXT18",
    "%EXT19",
    "ERROR"
};
  89
/*
 * Printable scheme names, indexed by protocol_t (urlCanonical() and
 * urlCanonicalClean() look entries up with request->protocol).
 *
 * urlInitialize() asserts that this table holds exactly PROTO_MAX + 1
 * entries, so it must be kept in step with the protocol_t enum; the
 * "htcp" entry is only present when USE_HTCP is enabled.
 */
const char *ProtocolStr[] =
{
    "NONE",
    "http",
    "ftp",
    "gopher",
    "wais",
    "cache_object",
    "icp",
#if USE_HTCP
    "htcp",
#endif
    "urn",
    "whois",
    "internal",
    "https",
    "TOTAL"
};
 108
 109 static request_t *urnParse(method_t method, char *urn);
 110 #if CHECK_HOSTNAMES
 111 static const char *const valid_hostname_chars =
 112 #if ALLOW_HOSTNAME_UNDERSCORES
 113 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 114 "abcdefghijklmnopqrstuvwxyz"
 115 "0123456789-._";
 116 #else
 117 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 118 "abcdefghijklmnopqrstuvwxyz"
 119 "0123456789-.";
 120 #endif
 121 #endif /* CHECK_HOSTNAMES */
 122
 123 /* convert %xx in url string to a character
 124  * Allocate a new string and return a pointer to converted string */
 125
 126 char *
 127 url_convert_hex(char *org_url, int allocate)
 128 {
 129     static char code[] = "00";
 130     char *url = NULL;
 131     char *s = NULL;
 132     char *t = NULL;
 133     url = allocate ? (char *) xstrdup(org_url) : org_url;
 134     if ((int) strlen(url) < 3 || !strchr(url, '%'))
 135         return url;
 136     for (s = t = url; *s; s++) {
 137         if (*s == '%' && *(s + 1) && *(s + 2)) {
 138             code[0] = *(++s);
 139             code[1] = *(++s);
 140             *t++ = (char) strtol(code, NULL, 16);
 141         } else {
 142             *t++ = *s;
 143         }
 144     }
 145     do {
 146         *t++ = *s;
 147     } while (*s++);
 148     return url;
 149 }
 150
/*
 * Startup self-checks for this module.
 *
 * Verifies that ProtocolStr[] has exactly PROTO_MAX + 1 entries and that
 * matchDomainName() produces the expected match / ordering results for
 * a set of known host/domain pairs.  All checks are assert()s, so a
 * failure aborts the process (and nothing is checked if assertions are
 * compiled out).
 */
void
urlInitialize(void)
{
    debug(23, 5) ("urlInitialize: Initializing...\n");
    assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    /* exact matches, with and without leading dots */
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    /* a sub-domain matches only a dot-prefixed domain */
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    /* unrelated names never match */
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    /* ordering and case-insensitivity */
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    /* '.' in the domain sorts below any other character in the host */
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));
    /* more cases? */
}
 179
 180 method_t &operator++ (method_t &aMethod)
 181 {
 182     aMethod = (method_t)(++(int)aMethod);
 183     return aMethod;
 184 }
 185
 186
 187 method_t
 188 urlParseMethod(const char *s)
 189 {
 190     method_t method = METHOD_NONE;
 191     /*
 192      * This check for '%' makes sure that we don't
 193      * match one of the extension method placeholders,
 194      * which have the form %EXT[0-9][0-9]
 195      */
 196     if (*s == '%')
 197         return METHOD_NONE;
 198     for (++method; method < METHOD_ENUM_END; ++method) {
 199         if (0 == strcasecmp(s, RequestMethodStr[method]))
 200             return method;
 201     }
 202     return METHOD_NONE;
 203 }
 204
 205
 206 protocol_t
 207 urlParseProtocol(const char *s)
 208 {
 209     /* test common stuff first */
 210     if (strcasecmp(s, "http") == 0)
 211         return PROTO_HTTP;
 212     if (strcasecmp(s, "ftp") == 0)
 213         return PROTO_FTP;
 214     if (strcasecmp(s, "https") == 0)
 215         return PROTO_HTTPS;
 216     if (strcasecmp(s, "file") == 0)
 217         return PROTO_FTP;
 218     if (strcasecmp(s, "gopher") == 0)
 219         return PROTO_GOPHER;
 220     if (strcasecmp(s, "wais") == 0)
 221         return PROTO_WAIS;
 222     if (strcasecmp(s, "cache_object") == 0)
 223         return PROTO_CACHEOBJ;
 224     if (strcasecmp(s, "urn") == 0)
 225         return PROTO_URN;
 226     if (strcasecmp(s, "whois") == 0)
 227         return PROTO_WHOIS;
 228     if (strcasecmp(s, "internal") == 0)
 229         return PROTO_INTERNAL;
 230     return PROTO_NONE;
 231 }
 232
 233
 234 int
 235 urlDefaultPort(protocol_t p)
 236 {
 237     switch (p) {
 238     case PROTO_HTTP:
 239         return 80;
 240     case PROTO_HTTPS:
 241         return 443;
 242     case PROTO_FTP:
 243         return 21;
 244     case PROTO_GOPHER:
 245         return 70;
 246     case PROTO_WAIS:
 247         return 210;
 248     case PROTO_CACHEOBJ:
 249     case PROTO_INTERNAL:
 250         return CACHE_HTTP_PORT;
 251     case PROTO_WHOIS:
 252         return 43;
 253     default:
 254         return 0;
 255     }
 256 }
 257
 258 request_t *
 259 urlParse(method_t method, char *url)
 260 {
 261     LOCAL_ARRAY(char, proto, MAX_URL);
 262     LOCAL_ARRAY(char, login, MAX_URL);
 263     LOCAL_ARRAY(char, host, MAX_URL);
 264     LOCAL_ARRAY(char, urlpath, MAX_URL);
 265     request_t *request = NULL;
 266     char *t = NULL;
 267     char *q = NULL;
 268     int port;
 269     protocol_t protocol = PROTO_NONE;
 270     int l;
 271     proto[0] = host[0] = urlpath[0] = login[0] = '\0';
 272
 273     if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
 274         /* terminate so it doesn't overflow other buffers */
 275         *(url + (MAX_URL >> 1)) = '\0';
 276         debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
 277         return NULL;
 278     }
 279     if (method == METHOD_CONNECT) {
 280         port = CONNECT_PORT;
 281         if (sscanf(url, "%[^:]:%d", host, &port) < 1)
 282             return NULL;
 283     } else if (!strncmp(url, "urn:", 4)) {
 284         return urnParse(method, url);
 285     } else {
 286         if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
 287             return NULL;
 288         protocol = urlParseProtocol(proto);
 289         port = urlDefaultPort(protocol);
 290         /* Is there any login informaiton? */
 291         if ((t = strrchr(host, '@'))) {
 292             strcpy((char *) login, (char *) host);
 293             t = strrchr(login, '@');
 294             *t = 0;
 295             strcpy((char *) host, t + 1);
 296         }
 297         if ((t = strrchr(host, ':'))) {
 298             *t++ = '\0';
 299             if (*t != '\0')
 300                 port = atoi(t);
 301         }
 302     }
 303     for (t = host; *t; t++)
 304         *t = xtolower(*t);
 305     if (stringHasWhitespace(host)) {
 306         if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
 307             t = q = host;
 308             while (*t) {
 309                 if (!xisspace(*t))
 310                     *q++ = *t;
 311                 t++;
 312             }
 313             *q = '\0';
 314         }
 315     }
 316 #if CHECK_HOSTNAMES
 317     if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
 318         debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
 319         return NULL;
 320     }
 321 #endif
 322 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
 323     /* remove trailing dots from hostnames */
 324     while ((l = strlen(host)) > 0 && host[--l] == '.')
 325         host[l] = '\0';
 326     /* remove duplicate dots */
 327     while ((t = strstr(host, "..")))
 328         xmemmove(t, t + 1, strlen(t));
 329 #endif
 330     if (Config.appendDomain && !strchr(host, '.'))
 331         strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN);
 332     if (port < 1 || port > 65535) {
 333         debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
 334         return NULL;
 335     }
 336 #ifdef HARDCODE_DENY_PORTS
 337     /* These ports are filtered in the default squid.conf, but
 338      * maybe someone wants them hardcoded... */
 339     if (port == 7 || port == 9 || port == 19) {
 340         debug(23, 0) ("urlParse: Deny access to port %d\n", port);
 341         return NULL;
 342     }
 343 #endif
 344     if (stringHasWhitespace(urlpath)) {
 345         debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
 346         switch (Config.uri_whitespace) {
 347         case URI_WHITESPACE_DENY:
 348             return NULL;
 349         case URI_WHITESPACE_ALLOW:
 350             break;
 351         case URI_WHITESPACE_ENCODE:
 352             t = rfc1738_escape_unescaped(urlpath);
 353             xstrncpy(urlpath, t, MAX_URL);
 354             break;
 355         case URI_WHITESPACE_CHOP:
 356             *(urlpath + strcspn(urlpath, w_space)) = '\0';
 357             break;
 358         case URI_WHITESPACE_STRIP:
 359         default:
 360             t = q = urlpath;
 361             while (*t) {
 362                 if (!xisspace(*t))
 363                     *q++ = *t;
 364                 t++;
 365             }
 366             *q = '\0';
 367         }
 368     }
 369     request = requestCreate(method, protocol, urlpath);
 370     xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
 371     xstrncpy(request->login, login, MAX_LOGIN_SZ);
 372     request->port = (u_short) port;
 373     return request;
 374 }
 375
 376 static request_t *
 377 urnParse(method_t method, char *urn)
 378 {
 379     debug(50, 5) ("urnParse: %s\n", urn);
 380     return requestCreate(method, PROTO_URN, urn + 4);
 381 }
 382
 383 const char *
 384 urlCanonical(request_t * request)
 385 {
 386     LOCAL_ARRAY(char, portbuf, 32);
 387     LOCAL_ARRAY(char, urlbuf, MAX_URL);
 388     if (request->canonical)
 389         return request->canonical;
 390     if (request->protocol == PROTO_URN) {
 391         snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
 392     } else {
 393         switch (request->method) {
 394         case METHOD_CONNECT:
 395             snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
 396             break;
 397         default:
 398             portbuf[0] = '\0';
 399             if (request->port != urlDefaultPort(request->protocol))
 400                 snprintf(portbuf, 32, ":%d", request->port);
 401             snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
 402                 ProtocolStr[request->protocol],
 403                 request->login,
 404                 *request->login ? "@" : null_string,
 405                 request->host,
 406                 portbuf,
 407                 request->urlpath.buf());
 408             break;
 409         }
 410     }
 411     return (request->canonical = xstrdup(urlbuf));
 412 }
 413
 414 char *
 415 urlCanonicalClean(const request_t * request)
 416 {
 417     LOCAL_ARRAY(char, buf, MAX_URL);
 418     LOCAL_ARRAY(char, portbuf, 32);
 419     LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
 420     char *t;
 421     if (request->protocol == PROTO_URN) {
 422         snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
 423     } else {
 424         switch (request->method) {
 425         case METHOD_CONNECT:
 426             snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
 427             break;
 428         default:
 429             portbuf[0] = '\0';
 430             if (request->port != urlDefaultPort(request->protocol))
 431                 snprintf(portbuf, 32, ":%d", request->port);
 432             loginbuf[0] = '\0';
 433             if ((int) strlen(request->login) > 0) {
 434                 strcpy(loginbuf, request->login);
 435                 if ((t = strchr(loginbuf, ':')))
 436                     *t = '\0';
 437                 strcat(loginbuf, "@");
 438             }
 439             snprintf(buf, MAX_URL, "%s://%s%s%s%s",
 440                 ProtocolStr[request->protocol],
 441                 loginbuf,
 442                 request->host,
 443                 portbuf,
 444                 request->urlpath.buf());
 445             /*
 446              * strip arguments AFTER a question-mark
 447              */
 448             if (Config.onoff.strip_query_terms)
 449                 if ((t = strchr(buf, '?')))
 450                     *(++t) = '\0';
 451             break;
 452         }
 453     }
 454     if (stringHasCntl(buf))
 455         xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
 456     return buf;
 457 }
 458
 459 /*
 460  * matchDomainName() compares a hostname with a domainname according
 461  * to the following rules:
 462  *
 463  *    HOST          DOMAIN        MATCH?
 464  * ------------- -------------    ------
 465  *    foo.com       foo.com         YES
 466  *   .foo.com       foo.com         YES
 467  *  x.foo.com       foo.com          NO
 468  *    foo.com      .foo.com         YES
 469  *   .foo.com      .foo.com         YES
 470  *  x.foo.com      .foo.com         YES
 471  *
 472  *  We strip leading dots on hosts (but not domains!) so that
 473  *  ".foo.com" is is always the same as "foo.com".
 474  *
 475  *  Return values:
 476  *     0 means the host matches the domain
 477  *     1 means the host is greater than the domain
 478  *    -1 means the host is less than the domain
 479  */
 480
 481 int
 482 matchDomainName(const char *h, const char *d)
 483 {
 484     int dl;
 485     int hl;
 486     while ('.' == *h)
 487         h++;
 488     hl = strlen(h);
 489     dl = strlen(d);
 490     /*
 491      * Start at the ends of the two strings and work towards the
 492      * beginning.
 493      */
 494     while (xtolower(h[--hl]) == xtolower(d[--dl])) {
 495         if (hl == 0 && dl == 0) {
 496             /*
 497              * We made it all the way to the beginning of both
 498              * strings without finding any difference.
 499              */
 500             return 0;
 501         }
 502         if (0 == hl) {
 503             /*
 504              * The host string is shorter than the domain string.
 505              * There is only one case when this can be a match.
 506              * If the domain is just one character longer, and if
 507              * that character is a leading '.' then we call it a
 508              * match.
 509              */
 510             if (1 == dl && '.' == d[0])
 511                 return 0;
 512             else
 513                 return -1;
 514         }
 515         if (0 == dl) {
 516             /*
 517              * The domain string is shorter than the host string.
 518              * This is a match only if the first domain character
 519              * is a leading '.'.
 520              */
 521             if ('.' == d[0])
 522                 return 0;
 523             else
 524                 return 1;
 525         }
 526     }
 527     /*
 528      * We found different characters in the same position (from the end).
 529      */
 530     /*
 531      * If one of those character is '.' then its special.  In order
 532      * for splay tree sorting to work properly, "x-foo.com" must
 533      * be greater than ".foo.com" even though '-' is less than '.'.
 534      */
 535     if ('.' == d[dl])
 536         return 1;
 537     if ('.' == h[hl])
 538         return -1;
 539     return (xtolower(h[hl]) - xtolower(d[dl]));
 540 }
 541
 542 int
 543 urlCheckRequest(const request_t * r)
 544 {
 545     int rc = 0;
 546     /* protocol "independent" methods */
 547     if (r->method == METHOD_CONNECT)
 548         return 1;
 549     if (r->method == METHOD_TRACE)
 550         return 1;
 551     if (r->method == METHOD_PURGE)
 552         return 1;
 553     /* does method match the protocol? */
 554     switch (r->protocol) {
 555     case PROTO_URN:
 556     case PROTO_HTTP:
 557     case PROTO_CACHEOBJ:
 558         rc = 1;
 559         break;
 560     case PROTO_FTP:
 561         if (r->method == METHOD_PUT)
 562             rc = 1;
 563     case PROTO_GOPHER:
 564     case PROTO_WAIS:
 565     case PROTO_WHOIS:
 566         if (r->method == METHOD_GET)
 567             rc = 1;
 568         else if (r->method == METHOD_HEAD)
 569             rc = 1;
 570         break;
 571     case PROTO_HTTPS:
 572 #ifdef USE_SSL
 573         rc = 1;
 574         break;
 575 #else
 576         /*
 577          * Squid can't originate an SSL connection, so it should
 578          * never receive an "https:" URL.  It should always be
 579          * CONNECT instead.
 580          */
 581         rc = 0;
 582 #endif
 583     default:
 584         break;
 585     }
 586     return rc;
 587 }
 588
 589 /*
 590  * Quick-n-dirty host extraction from a URL.  Steps:
 591  *      Look for a colon
 592  *      Skip any '/' after the colon
 593  *      Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
 594  *      Look for an ending '/' or ':' and terminate
 595  *      Look for login info preceeded by '@'
 596  */
 597 char *
 598 urlHostname(const char *url)
 599 {
 600     LOCAL_ARRAY(char, host, SQUIDHOSTNAMELEN);
 601     char *t;
 602     host[0] = '\0';
 603     if (NULL == (t = strchr(url, ':')))
 604         return NULL;
 605     t++;
 606     while (*t != '\0' && *t == '/')
 607         t++;
 608     xstrncpy(host, t, SQUIDHOSTNAMELEN);
 609     if ((t = strchr(host, '/')))
 610         *t = '\0';
 611     if ((t = strchr(host, ':')))
 612         *t = '\0';
 613     if ((t = strrchr(host, '@'))) {
 614         t++;
 615         xmemmove(host, t, strlen(t) + 1);
 616     }
 617     return host;
 618 }
 619
 620 static void
 621 urlExtMethodAdd(const char *mstr)
 622 {
 623     method_t method = METHOD_NONE;
 624     for (++method; method < METHOD_ENUM_END; ++method) {
 625         if (0 == strcmp(mstr, RequestMethodStr[method])) {
 626             debug(23, 2) ("Extension method '%s' already exists\n", mstr);
 627             return;
 628         }
 629         if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
 630             continue;
 631         /* Don't free statically allocated "%EXTnn" string */
 632         RequestMethodStr[method] = xstrdup(mstr);
 633         debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
 634         return;
 635     }
 636     debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
 637 }
 638
 639 void
 640 urlExtMethodConfigure(void)
 641 {
 642     wordlist *w = Config.ext_methods;
 643     while (w) {
 644         char *s;
 645         for (s = w->key; *s; s++)
 646             *s = xtoupper(*s);
 647         urlExtMethodAdd(w->key);
 648         w = w->next;
 649     }
 650 }