Cleanup: zap CVS Id tags

[thirdparty/squid.git] / src / url.cc
diff --git a/src/url.cc b/src/url.cc

index b50c867ba726d4f373e6cc504360c88de4119add..e78e4baf7fed383318bccaab5ff5baeaac166dfa 100644 (file)
--- a/src/url.cc
+++ b/src/url.cc
@@ -1,6 +1,6 @@
  
  /*
- * $Id: url.cc,v 1.155 2006/05/29 21:44:18 robertc Exp $
+ * $Id$
   *
   * DEBUG: section 23    URL Parsing
   * AUTHOR: Duane Wessels
@@ -21,12 +21,12 @@
   *  it under the terms of the GNU General Public License as published by
   *  the Free Software Foundation; either version 2 of the License, or
   *  (at your option) any later version.
- *  
+ *
   *  This program is distributed in the hope that it will be useful,
   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   *  GNU General Public License for more details.
- *  
+ *
   *  You should have received a copy of the GNU General Public License
   *  along with this program; if not, write to the Free Software
   *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
@@ -37,57 +37,28 @@
  #include "HttpRequest.h"
  #include "URLScheme.h"
  
-static HttpRequest *urnParse(method_t method, char *urn);
-#if CHECK_HOSTNAMES
-static const char *const valid_hostname_chars =
-#if ALLOW_HOSTNAME_UNDERSCORES
+static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn);
+static const char valid_hostname_chars_u[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
-    "0123456789-._";
-#else
+    "0123456789-._"
+#if USE_IPV6
+    "[:]"
+#endif
+    ;
+static const char valid_hostname_chars[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      "0123456789-."
-    ;
+#if USE_IPV6
+    "[:]"
  #endif
-#endif /* CHECK_HOSTNAMES */
-
-/* convert %xx in url string to a character
- * Allocate a new string and return a pointer to converted string */
-
-char *
-url_convert_hex(char *org_url, int allocate)
-{
-    static char code[] = "00";
-    char *url = NULL;
-    char *s = NULL;
-    char *t = NULL;
-    url = allocate ? (char *) xstrdup(org_url) : org_url;
-
-    if ((int) strlen(url) < 3 || !strchr(url, '%'))
-        return url;
-
-    for (s = t = url; *s; s++) {
-        if (*s == '%' && *(s + 1) && *(s + 2)) {
-            code[0] = *(++s);
-            code[1] = *(++s);
-            *t++ = (char) strtol(code, NULL, 16);
-        } else {
-            *t++ = *s;
-        }
-    }
-
-    do {
-        *t++ = *s;
-    } while (*s++);
-
-    return url;
-}
+    ;
  
  void
  urlInitialize(void)
  {
-    debug(23, 5) ("urlInitialize: Initializing...\n");
+    debugs(23, 5, "urlInitialize: Initializing...");
      /* this ensures that the number of protocol strings is the same as
       * the enum slots allocated because the last enum is always 'TOTAL'.
       */
@@ -116,7 +87,7 @@ urlInitialize(void)
      /* more cases? */
  }
  
-/*
+/**
   * urlParseProtocol() takes begin (b) and end (e) pointers, but for
   * backwards compatibility, e defaults to NULL, in which case we
   * assume b is NULL-terminated.
@@ -209,15 +180,23 @@ urlDefaultPort(protocol_t p)
   * If the 'request' arg is non-NULL, put parsed values there instead
   * of allocating a new HttpRequest.
   *
- * This abuses HttpRequest as a way of representing the parsed url 
+ * This abuses HttpRequest as a way of representing the parsed url
   * and its components.
   * method is used to switch parsers and to init the HttpRequest.
   * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
   * looked for.
   * The url is non const so that if its too long we can NULL-terminate it in place.
   */
+
+/*
+ * This routine parses a URL. It's assumed that the URL is complete -
+ * ie, the end of the string is the end of the URL. Don't pass a partial
+ * URL here as this routine doesn't have any way of knowing whether
+ * it's partial or not (ie, it handles the case of no trailing slash as
+ * being "end of host with implied path of /").
+ */
  HttpRequest *
-urlParse(method_t method, char *url, HttpRequest *request)
+urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
  {
      LOCAL_ARRAY(char, proto, MAX_URL);
      LOCAL_ARRAY(char, login, MAX_URL);
@@ -228,31 +207,78 @@ urlParse(method_t method, char *url, HttpRequest *request)
      int port;
      protocol_t protocol = PROTO_NONE;
      int l;
+    int i;
+    const char *src;
+    char *dst;
      proto[0] = host[0] = urlpath[0] = login[0] = '\0';
  
      if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
          /* terminate so it doesn't overflow other buffers */
          *(url + (MAX_URL >> 1)) = '\0';
-        debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
+        debugs(23, 1, "urlParse: URL too large (" << l << " bytes)");
          return NULL;
      }
-
      if (method == METHOD_CONNECT) {
          port = CONNECT_PORT;
  
-        if (sscanf(url, "%[^:]:%d", host, &port) < 1)
-            return NULL;
+        if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
+            if (sscanf(url, "%[^:]:%d", host, &port) < 1)
+                return NULL;
+
      } else if (!strncmp(url, "urn:", 4)) {
          return urnParse(method, url);
      } else {
-        if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
+        /* Parse the URL: */
+        src = url;
+        i = 0;
+        /* Find first : - everything before is protocol */
+        for (i = 0, dst = proto; i < l && *src != ':'; i++, src++, dst++) {
+            *dst = *src;
+        }
+        if (i >= l)
              return NULL;
+        *dst = '\0';
  
-        protocol = urlParseProtocol(proto);
+        /* Then its :// */
+        /* (XXX yah, I'm not checking we've got enough data left before checking the array..) */
+        if (*src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
+            return NULL;
+        i += 3;
+        src += 3;
  
+        /* Then everything until first /; thats host (and port; which we'll look for here later) */
+        /* bug 1881: If we don't get a "/" then we imply it was there */
+        for (dst = host; i < l && *src != '/' && *src != '\0'; i++, src++, dst++) {
+            *dst = *src;
+        }
+
+        /*
+         * We can't check for "i >= l" here because we could be at the end of the line
+         * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
+         * been -given- a valid URL and the path is just '/'.
+         */
+        if (i > l)
+            return NULL;
+        *dst = '\0';
+
+        /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
+        for (dst = urlpath; i < l && *src != '\r' && *src != '\n' && *src != '\0'; i++, src++, dst++) {
+            *dst = *src;
+        }
+
+        /* We -could- be at the end of the buffer here */
+        if (i > l)
+            return NULL;
+        /* If the URL path is empty we set it to be "/" */
+        if (dst == urlpath) {
+            *(dst++) = '/';
+        }
+        *dst = '\0';
+
+        protocol = urlParseProtocol(proto);
          port = urlDefaultPort(protocol);
  
-        /* Is there any login informaiton? */
+        /* Is there any login information? (we should eventually parse it above) */
          if ((t = strrchr(host, '@'))) {
              strcpy((char *) login, (char *) host);
              t = strrchr(login, '@');
@@ -260,11 +286,45 @@ urlParse(method_t method, char *url, HttpRequest *request)
              strcpy((char *) host, t + 1);
          }
  
-        if ((t = strrchr(host, ':'))) {
-            *t++ = '\0';
+        /* Is there any host information? (we should eventually parse it above) */
+        if (*host == '[') {
+            /* strip any IPA brackets. valid under IPv6. */
+            dst = host;
+#if USE_IPV6
+            /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
+            src = host;
+            src++;
+            l = strlen(host);
+            i = 1;
+            for (; i < l && *src != ']' && *src != '\0'; i++, src++, dst++) {
+                *dst = *src;
+            }
+
+            /* we moved in-place, so truncate the actual hostname found */
+            *(dst++) = '\0';
+#else
+            /* IPv4-pure needs to skip the whole hostname to ']' inclusive for now */
+            while (*dst != '\0' && *dst != ']') dst++;
+#endif
+
+            /* skip ahead to either start of port, or original EOS */
+            while (*dst != '\0' && *dst != ':') dst++;
+            t = dst;
+        } else {
+            t = strrchr(host, ':');
+
+            if (t != strchr(host,':') ) {
+                /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
+                /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
+                /* therefore we MUST accept the case where they are not bracketed at all. */
+                t = NULL;
+            }
+        }
  
-            if (*t != '\0')
-                port = atoi(t);
+        if (t && *t == ':') {
+            *t = '\0';
+            t++;
+            port = atoi(t);
          }
      }
  
@@ -274,41 +334,37 @@ urlParse(method_t method, char *url, HttpRequest *request)
      if (stringHasWhitespace(host)) {
          if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
              t = q = host;
-
              while (*t) {
                  if (!xisspace(*t))
                      *q++ = *t;
-
                  t++;
              }
-
              *q = '\0';
          }
      }
  
-#if CHECK_HOSTNAMES
-    if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
-        debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
+    debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
+
+    if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
+        debugs(23, 1, "urlParse: Illegal character in hostname '" << host << "'");
          return NULL;
      }
  
-#endif
-#if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
+    if (Config.appendDomain && !strchr(host, '.'))
+        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
+
      /* remove trailing dots from hostnames */
      while ((l = strlen(host)) > 0 && host[--l] == '.')
          host[l] = '\0';
  
-    /* remove duplicate dots */
-    while ((t = strstr(host, "..")))
-        xmemmove(t, t + 1, strlen(t));
-
-#endif
-
-    if (Config.appendDomain && !strchr(host, '.'))
-        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
+    /* reject duplicate or leading dots */
+    if (strstr(host, "..") || *host == '.') {
+        debugs(23, 1, "urlParse: Illegal hostname '" << host << "'");
+        return NULL;
+    }
  
      if (port < 1 || port > 65535) {
-        debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
+        debugs(23, 3, "urlParse: Invalid port '" << port << "'");
          return NULL;
      }
  
@@ -316,13 +372,13 @@ urlParse(method_t method, char *url, HttpRequest *request)
      /* These ports are filtered in the default squid.conf, but
       * maybe someone wants them hardcoded... */
      if (port == 7 || port == 9 || port == 19) {
-        debug(23, 0) ("urlParse: Deny access to port %d\n", port);
+        debugs(23, 0, "urlParse: Deny access to port " << port);
          return NULL;
      }
-
  #endif
+
      if (stringHasWhitespace(urlpath)) {
-        debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
+        debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
  
          switch (Config.uri_whitespace) {
  
@@ -342,17 +398,13 @@ urlParse(method_t method, char *url, HttpRequest *request)
              break;
  
          case URI_WHITESPACE_STRIP:
-
          default:
              t = q = urlpath;
-
              while (*t) {
                  if (!xisspace(*t))
                      *q++ = *t;
-
                  t++;
              }
-
              *q = '\0';
          }
      }
@@ -363,16 +415,16 @@ urlParse(method_t method, char *url, HttpRequest *request)
          request->initHTTP(method, protocol, urlpath);
      }
  
-    xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
+    request->SetHost(host);
      xstrncpy(request->login, login, MAX_LOGIN_SZ);
      request->port = (u_short) port;
      return request;
  }
  
  static HttpRequest *
-urnParse(method_t method, char *urn)
+urnParse(const HttpRequestMethod& method, char *urn)
  {
-    debug(50, 5) ("urnParse: %s\n", urn);
+    debugs(50, 5, "urnParse: " << urn);
      return new HttpRequest(method, PROTO_URN, urn + 4);
  }
  
@@ -380,6 +432,7 @@ const char *
  urlCanonical(HttpRequest * request)
  {
      LOCAL_ARRAY(char, portbuf, 32);
+/// \todo AYJ: Performance: making this a ptr and allocating when needed will be better than a write and future xstrdup().
      LOCAL_ARRAY(char, urlbuf, MAX_URL);
  
      if (request->canonical)
@@ -388,10 +441,11 @@ urlCanonical(HttpRequest * request)
      if (request->protocol == PROTO_URN) {
          snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
      } else {
-        switch (request->method) {
+/// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
+        switch (request->method.id()) {
  
          case METHOD_CONNECT:
-            snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
+            snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port);
              break;
  
          default:
@@ -404,7 +458,7 @@ urlCanonical(HttpRequest * request)
                       ProtocolStr[request->protocol],
                       request->login,
                       *request->login ? "@" : null_string,
-                     request->host,
+                     request->GetHost(),
                       portbuf,
                       request->urlpath.buf());
  
@@ -415,6 +469,10 @@ urlCanonical(HttpRequest * request)
      return (request->canonical = xstrdup(urlbuf));
  }
  
+/** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical, but it elides the query-string
+ *        after having copied it in the first place! It would be less code to merge the two with a flag
+ *        parameter and never copy the query-string part in the first place.
+ */
  char *
  urlCanonicalClean(const HttpRequest * request)
  {
@@ -426,10 +484,13 @@ urlCanonicalClean(const HttpRequest * request)
      if (request->protocol == PROTO_URN) {
          snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
      } else {
-        switch (request->method) {
+/// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
+        switch (request->method.id()) {
  
          case METHOD_CONNECT:
-            snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
+            snprintf(buf, MAX_URL, "%s:%d",
+                     request->GetHost(),
+                     request->port);
              break;
  
          default:
@@ -452,7 +513,7 @@ urlCanonicalClean(const HttpRequest * request)
              snprintf(buf, MAX_URL, "%s://%s%s%s%s",
                       ProtocolStr[request->protocol],
                       loginbuf,
-                     request->host,
+                     request->GetHost(),
                       portbuf,
                       request->urlpath.buf());
              /*
@@ -473,10 +534,109 @@ urlCanonicalClean(const HttpRequest * request)
      return buf;
  }
  
+/*
+ * Test if a URL is relative.
+ *
+ * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
+ * appear before a ':'.
+ */
+bool
+urlIsRelative(const char *url)
+{
+    const char *p;
+
+    if (url == NULL) {
+        return (false);
+    }
+    if (*url == '\0') {
+        return (false);
+    }
+
+    for (p = url; *p != '\0' && *p != ':' && *p != '/'; p++);
+
+    if (*p == ':') {
+        return (false);
+    }
+    return (true);
+}
+
+/*
+ * Convert a relative URL to an absolute URL using the context of a given
+ * request.
+ *
+ * It is assumed that you have already ensured that the URL is relative.
+ *
+ * If NULL is returned it is an indication that the method in use in the
+ * request does not distinguish between relative and absolute and you should
+ * use the url unchanged.
+ *
+ * If non-NULL is returned, it is up to the caller to free the resulting
+ * memory using safe_free().
+ */
+char *
+urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
+{
+
+    if (req->method.id() == METHOD_CONNECT) {
+        return (NULL);
+    }
+
+    char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
+
+    if (req->protocol == PROTO_URN) {
+        snprintf(urlbuf, MAX_URL, "urn:%s", req->urlpath.buf());
+        return (urlbuf);
+    }
+
+    size_t urllen;
+
+    if (req->port != urlDefaultPort(req->protocol)) {
+        urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s:%d",
+                          ProtocolStr[req->protocol],
+                          req->login,
+                          *req->login ? "@" : null_string,
+                          req->GetHost(),
+                          req->port
+                         );
+    } else {
+        urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s",
+                          ProtocolStr[req->protocol],
+                          req->login,
+                          *req->login ? "@" : null_string,
+                          req->GetHost()
+                         );
+    }
+
+    if (relUrl[0] == '/') {
+        strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
+    } else {
+        const char *path = req->urlpath.buf();
+        const char *last_slash = strrchr(path, '/');
+
+        if (last_slash == NULL) {
+            urlbuf[urllen++] = '/';
+            strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
+        } else {
+            last_slash++;
+            size_t pathlen = last_slash - path;
+            if (pathlen > MAX_URL - urllen - 1) {
+                pathlen = MAX_URL - urllen - 1;
+            }
+            strncpy(&urlbuf[urllen], path, pathlen);
+            urllen += pathlen;
+            if (urllen + 1 < MAX_URL) {
+                strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
+            }
+        }
+    }
+
+    return (urlbuf);
+}
+
  /*
   * matchDomainName() compares a hostname with a domainname according
   * to the following rules:
- * 
+ *
   *    HOST          DOMAIN        MATCH?
   * ------------- -------------    ------
   *    foo.com       foo.com         YES
@@ -684,7 +844,7 @@ void
  URLHostName::init(char const *aUrl)
  {
      Host[0] = '\0';
-    url = url;
+    url = aUrl;
  }
  
  void
@@ -697,6 +857,12 @@ URLHostName::findHostStart()
  
      while (*hostStart != '\0' && *hostStart == '/')
          ++hostStart;
+
+#if USE_IPV6
+    if (*hostStart == ']')
+        ++hostStart;
+#endif
+
  }
  
  void
@@ -707,8 +873,14 @@ URLHostName::trimTrailingChars()
      if ((t = strchr(Host, '/')))
          *t = '\0';
  
-    if ((t = strchr(Host, ':')))
+    if ((t = strrchr(Host, ':')))
          *t = '\0';
+
+#if USE_IPV6
+    if ((t = strchr(Host, ']')))
+        *t = '\0';
+#endif
+
  }
  
  void