From: Daniel Stenberg <daniel@haxx.se>
Date: Wed, 7 Jan 2026 08:26:14 +0000 (+0100)
Subject: urlapi: split parts of parseurl() into sub functions
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5f612acaa1936998677f5e664be706374de93efe;p=thirdparty%2Fcurl.git

urlapi: split parts of parseurl() into sub functions

- parse_file
- parse_scheme
- guess_scheme
- handle_fragment
- handle_query
- handle_path

Closes #20205
---

diff --git a/lib/urlapi.c b/lib/urlapi.c
index e974783d71..a67cf7a05f 100644
--- a/lib/urlapi.c
+++ b/lib/urlapi.c
@@ -898,314 +898,259 @@ end:
   return result ? 1 : 0; /* success */
 }
 
-static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
+static CURLUcode parse_file(const char *url, size_t urllen, CURLU *u,
+                            struct dynbuf *host, const char **pathp,
+                            size_t *pathlenp)
 {
   const char *path;
   size_t pathlen;
-  char *query = NULL;
-  char *fragment = NULL;
-  char schemebuf[MAX_SCHEME_LEN + 1];
-  size_t schemelen = 0;
-  size_t urllen;
-  CURLUcode result = CURLUE_OK;
-  size_t fraglen = 0;
-  struct dynbuf host;
+  bool uncpath = FALSE;
+  if(urllen <= 6)
+    /* file:/ is not enough to actually be a complete file: URL */
+    return CURLUE_BAD_FILE_URL;
 
-  DEBUGASSERT(url);
-
-  curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
-
-  result = Curl_junkscan(url, &urllen, !!(flags & CURLU_ALLOW_SPACE));
-  if(result)
-    goto fail;
+  /* skip the "file:" prefix; path points into the URL string itself */
+  path = &url[5];
+  pathlen = urllen - 5;
 
-  schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
-                                   flags & (CURLU_GUESS_SCHEME |
-                                            CURLU_DEFAULT_SCHEME));
-
-  /* handle the file: scheme */
-  if(schemelen && !strcmp(schemebuf, "file")) {
-    bool uncpath = FALSE;
-    if(urllen <= 6) {
-      /* file:/ is not enough to actually be a complete file: URL */
-      result = CURLUE_BAD_FILE_URL;
-      goto fail;
-    }
-
-    /* path has been allocated large enough to hold this */
-    path = &url[5];
-    pathlen = urllen - 5;
+  u->scheme = curlx_strdup("file");
+  if(!u->scheme)
+    return CURLUE_OUT_OF_MEMORY;
 
-    u->scheme = curlx_strdup("file");
-    if(!u->scheme) {
-      result = CURLUE_OUT_OF_MEMORY;
-      goto fail;
-    }
+  /* Extra handling of URLs with an authority component (i.e. that start with
+   * "file://")
+   *
+   * We allow omitted hostname (e.g. file:/<path>) -- valid according to
+   * RFC 8089, but not the (current) WHAT-WG URL spec.
+   */
+  if(path[0] == '/' && path[1] == '/') {
+    /* swallow the two slashes */
+    const char *ptr = &path[2];
 
-    /* Extra handling URLs with an authority component (i.e. that start with
-     * "file://")
+    /*
+     * According to RFC 8089, a file: URL can be reliably dereferenced if:
+     *
+     *  o it has no/blank hostname, or
+     *
+     *  o the hostname matches "localhost" (case-insensitively), or
+     *
+     *  o the hostname is a FQDN that resolves to this machine, or
      *
-     * We allow omitted hostname (e.g. file:/<path>) -- valid according to
-     * RFC 8089, but not the (current) WHAT-WG URL spec.
+     *  o it is a UNC String transformed to a URI (Windows only, RFC 8089
+     *    Appendix E.3).
+     *
+     * For brevity, we only consider URLs with empty, "localhost", or
+     * "127.0.0.1" hostnames as local, otherwise as a UNC String.
+     *
+     * Additionally, there is an exception for URLs with a Windows drive
+     * letter in the authority (which was accidentally omitted from RFC 8089
+     * Appendix E, but believe me, it was meant to be there. --MK)
      */
-    if(path[0] == '/' && path[1] == '/') {
-      /* swallow the two slashes */
-      const char *ptr = &path[2];
-
-      /*
-       * According to RFC 8089, a file: URL can be reliably dereferenced if:
-       *
-       *  o it has no/blank hostname, or
-       *
-       *  o the hostname matches "localhost" (case-insensitively), or
-       *
-       *  o the hostname is a FQDN that resolves to this machine, or
-       *
-       *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
-       *    Appendix E.3).
-       *
-       * For brevity, we only consider URLs with empty, "localhost", or
-       * "127.0.0.1" hostnames as local, otherwise as an UNC String.
-       *
-       * Additionally, there is an exception for URLs with a Windows drive
-       * letter in the authority (which was accidentally omitted from RFC 8089
-       * Appendix E, but believe me, it was meant to be there. --MK)
-       */
-      if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
-        /* the URL includes a hostname, it must match "localhost" or
-           "127.0.0.1" to be valid */
-        if(checkprefix("localhost/", ptr) ||
-           checkprefix("127.0.0.1/", ptr)) {
-          ptr += 9; /* now points to the slash after the host */
-        }
-        else {
+    if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
+      /* the URL includes a hostname, it must match "localhost" or
+         "127.0.0.1" to be valid */
+      if(checkprefix("localhost/", ptr) ||
+         checkprefix("127.0.0.1/", ptr)) {
+        ptr += 9; /* now points to the slash after the host */
+      }
+      else {
 #ifdef _WIN32
-          size_t len;
-
-          /* the hostname, NetBIOS computer name, can not contain disallowed
-             chars, and the delimiting slash character must be appended to the
-             hostname */
-          path = strpbrk(ptr, "/\\:*?\"<>|");
-          if(!path || *path != '/') {
-            result = CURLUE_BAD_FILE_URL;
-            goto fail;
-          }
-
-          len = path - ptr;
-          if(len) {
-            CURLcode code = curlx_dyn_addn(&host, ptr, len);
-            if(code) {
-              result = cc2cu(code);
-              goto fail;
-            }
-            uncpath = TRUE;
-          }
+        size_t len;
+
+        /* the hostname, NetBIOS computer name, can not contain disallowed
+           chars, and the delimiting slash character must be appended to the
+           hostname */
+        path = strpbrk(ptr, "/\\:*?\"<>|");
+        if(!path || *path != '/')
+          return CURLUE_BAD_FILE_URL;
+
+        len = path - ptr;
+        if(len) {
+          CURLcode code = curlx_dyn_addn(host, ptr, len);
+          if(code)
+            return cc2cu(code);
+          uncpath = TRUE;
+        }
 
-          ptr -= 2; /* now points to the // before the host in UNC */
+        ptr -= 2; /* now points to the // before the host in UNC */
 #else
-          /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
-             none */
-          result = CURLUE_BAD_FILE_URL;
-          goto fail;
+        /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
+           none */
+        return CURLUE_BAD_FILE_URL;
 #endif
-        }
       }
-
-      path = ptr;
-      pathlen = urllen - (ptr - url);
     }
 
-    if(!uncpath)
-      /* no host for file: URLs by default */
-      curlx_dyn_reset(&host);
+    path = ptr;
+    pathlen = urllen - (ptr - url);
+  }
+
+  if(!uncpath)
+    /* no host for file: URLs by default */
+    curlx_dyn_reset(host);
 
 #if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
-    /* Do not allow Windows drive letters when not in Windows.
-     * This catches both "file:/c:" and "file:c:" */
-    if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
-       STARTS_WITH_URL_DRIVE_PREFIX(path)) {
-      /* File drive letters are only accepted in MS-DOS/Windows */
-      result = CURLUE_BAD_FILE_URL;
-      goto fail;
-    }
+  /* Do not allow Windows drive letters when not in Windows.
+   * This catches both "file:/c:" and "file:c:" */
+  if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
+     STARTS_WITH_URL_DRIVE_PREFIX(path)) {
+    /* File drive letters are only accepted in MS-DOS/Windows */
+    return CURLUE_BAD_FILE_URL;
+  }
 #else
-    /* If the path starts with a slash and a drive letter, ditch the slash */
-    if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
-      /* This cannot be done with strcpy, as the memory chunks overlap! */
-      path++;
-      pathlen--;
-    }
+  /* If the path starts with a slash and a drive letter, ditch the slash */
+  if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
+    /* no copying needed: step the pointer past the slash instead */
+    path++;
+    pathlen--;
+  }
 #endif
+  *pathp = path;
+  *pathlenp = pathlen;
+  return CURLUE_OK;
+}
+
+static CURLUcode parse_scheme(const char *url, CURLU *u, char *schemebuf,
+                              size_t schemelen, unsigned int flags,
+                              const char **hostpp)
+{
+  /* the scheme to store in the handle; stays NULL if none is known */
+  const char *schemep = NULL;
+
+  if(schemelen) {
+    int i = 0;
+    const char *p = &url[schemelen + 1];
+    while((*p == '/') && (i < 4)) {
+      p++;
+      i++;
+    }
+
+    schemep = schemebuf;
+    if(!Curl_get_scheme_handler(schemep) &&
+       !(flags & CURLU_NON_SUPPORT_SCHEME))
+      return CURLUE_UNSUPPORTED_SCHEME;
+
+    if((i < 1) || (i > 3))
+      /* less than one or more than three slashes */
+      return CURLUE_BAD_SLASHES;
+
+    *hostpp = p; /* hostname starts here */
   }
   else {
-    /* clear path */
-    const char *schemep = NULL;
-    const char *hostp;
-    size_t hostlen;
-
-    if(schemelen) {
-      int i = 0;
-      const char *p = &url[schemelen + 1];
-      while((*p == '/') && (i < 4)) {
-        p++;
-        i++;
-      }
+    /* no scheme! */
 
-      schemep = schemebuf;
-      if(!Curl_get_scheme_handler(schemep) &&
-         !(flags & CURLU_NON_SUPPORT_SCHEME)) {
-        result = CURLUE_UNSUPPORTED_SCHEME;
-        goto fail;
-      }
+    if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME)))
+      return CURLUE_BAD_SCHEME;
 
-      if((i < 1) || (i > 3)) {
-        /* less than one or more than three slashes */
-        result = CURLUE_BAD_SLASHES;
-        goto fail;
-      }
-      hostp = p; /* hostname starts here */
-    }
-    else {
-      /* no scheme! */
+    if(flags & CURLU_DEFAULT_SCHEME)
+      schemep = DEFAULT_SCHEME;
 
-      if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME))) {
-        result = CURLUE_BAD_SCHEME;
-        goto fail;
-      }
-      if(flags & CURLU_DEFAULT_SCHEME)
-        schemep = DEFAULT_SCHEME;
+    /*
+     * No scheme in the URL: the hostname starts at its very first byte.
+     */
+    *hostpp = url;
+  }
 
-      /*
-       * The URL was badly formatted, let's try without scheme specified.
-       */
-      hostp = url;
-    }
+  if(schemep) {
+    u->scheme = curlx_strdup(schemep);
+    if(!u->scheme)
+      return CURLUE_OUT_OF_MEMORY;
+  }
+  return CURLUE_OK;
+}
 
-    if(schemep) {
-      u->scheme = curlx_strdup(schemep);
-      if(!u->scheme) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
-    }
+static CURLUcode guess_scheme(CURLU *u, struct dynbuf *host)
+{
+  const char *hostname = curlx_dyn_ptr(host);
+  const char *schemep = NULL;
+  /* legacy curl-style guess based on hostname */
+  if(checkprefix("ftp.", hostname))
+    schemep = "ftp";
+  else if(checkprefix("dict.", hostname))
+    schemep = "dict";
+  else if(checkprefix("ldap.", hostname))
+    schemep = "ldap";
+  else if(checkprefix("imap.", hostname))
+    schemep = "imap";
+  else if(checkprefix("smtp.", hostname))
+    schemep = "smtp";
+  else if(checkprefix("pop3.", hostname))
+    schemep = "pop3";
+  else
+    schemep = "http";
 
-    /* find the end of the hostname + port number */
-    hostlen = strcspn(hostp, "/?#");
-    path = &hostp[hostlen];
+  u->scheme = curlx_strdup(schemep);
+  if(!u->scheme)
+    return CURLUE_OUT_OF_MEMORY;
 
-    /* this pathlen also contains the query and the fragment */
-    pathlen = urllen - (path - url);
-    if(hostlen) {
+  u->guessed_scheme = TRUE;
+  return CURLUE_OK;
+}
 
-      result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
+static CURLUcode handle_fragment(CURLU *u, const char *fragment,
+                                 size_t fraglen, unsigned int flags)
+{
+  CURLUcode result;
+  u->fragment_present = TRUE;
+  if(fraglen > 1) {
+    /* skip the leading '#' in the copy but include the terminating null */
+    if(flags & CURLU_URLENCODE) {
+      struct dynbuf enc;
+      curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+      result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
       if(result)
-        goto fail;
-
-      if((flags & CURLU_GUESS_SCHEME) && !schemep) {
-        const char *hostname = curlx_dyn_ptr(&host);
-        /* legacy curl-style guess based on hostname */
-        if(checkprefix("ftp.", hostname))
-          schemep = "ftp";
-        else if(checkprefix("dict.", hostname))
-          schemep = "dict";
-        else if(checkprefix("ldap.", hostname))
-          schemep = "ldap";
-        else if(checkprefix("imap.", hostname))
-          schemep = "imap";
-        else if(checkprefix("smtp.", hostname))
-          schemep = "smtp";
-        else if(checkprefix("pop3.", hostname))
-          schemep = "pop3";
-        else
-          schemep = "http";
-
-        u->scheme = curlx_strdup(schemep);
-        if(!u->scheme) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-        u->guessed_scheme = TRUE;
-      }
-    }
-    else if(flags & CURLU_NO_AUTHORITY) {
-      /* allowed to be empty. */
-      if(curlx_dyn_add(&host, "")) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
+        return result;
+      u->fragment = curlx_dyn_ptr(&enc);
     }
     else {
-      result = CURLUE_NO_HOST;
-      goto fail;
-    }
-  }
-
-  fragment = strchr(path, '#');
-  if(fragment) {
-    fraglen = pathlen - (fragment - path);
-    u->fragment_present = TRUE;
-    if(fraglen > 1) {
-      /* skip the leading '#' in the copy but include the terminating null */
-      if(flags & CURLU_URLENCODE) {
-        struct dynbuf enc;
-        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
-        result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
-        if(result)
-          goto fail;
-        u->fragment = curlx_dyn_ptr(&enc);
-      }
-      else {
-        u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
-        if(!u->fragment) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-      }
+      u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
+      if(!u->fragment)
+        return CURLUE_OUT_OF_MEMORY;
     }
-    /* after this, pathlen still contains the query */
-    pathlen -= fraglen;
   }
+  return CURLUE_OK;
+}
 
-  query = memchr(path, '?', pathlen);
-  if(query) {
-    size_t qlen = fragment ? (size_t)(fragment - query) :
-      pathlen - (query - path);
-    pathlen -= qlen;
-    u->query_present = TRUE;
-    if(qlen > 1) {
-      if(flags & CURLU_URLENCODE) {
-        struct dynbuf enc;
-        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
-        /* skip the leading question mark */
-        result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
-        if(result)
-          goto fail;
-        u->query = curlx_dyn_ptr(&enc);
-      }
-      else {
-        u->query = Curl_memdup0(query + 1, qlen - 1);
-        if(!u->query) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-      }
+static CURLUcode handle_query(CURLU *u, const char *query,
+                              size_t qlen, unsigned int flags)
+{
+  u->query_present = TRUE;
+  if(qlen > 1) {
+    if(flags & CURLU_URLENCODE) {
+      struct dynbuf enc;
+      CURLUcode result;
+      curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+      /* skip the leading question mark */
+      result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
+      if(result)
+        return result;
+      u->query = curlx_dyn_ptr(&enc);
     }
     else {
-      /* single byte query */
-      u->query = curlx_strdup("");
-      if(!u->query) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
+      u->query = Curl_memdup0(query + 1, qlen - 1);
+      if(!u->query)
+        return CURLUE_OUT_OF_MEMORY;
     }
   }
+  else {
+    /* only the '?' is present: store an empty query string */
+    u->query = curlx_strdup("");
+    if(!u->query)
+      return CURLUE_OUT_OF_MEMORY;
+  }
+  return CURLUE_OK;
+}
 
+static CURLUcode handle_path(CURLU *u, const char *path,
+                             size_t pathlen, unsigned int flags)
+{
+  CURLUcode result;
   if(pathlen && (flags & CURLU_URLENCODE)) {
     struct dynbuf enc;
     curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
     result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
     if(result)
-      goto fail;
+      return result;
     pathlen = curlx_dyn_len(&enc);
     path = u->path = curlx_dyn_ptr(&enc);
   }
@@ -1217,10 +1162,8 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
   else {
     if(!u->path) {
       u->path = Curl_memdup0(path, pathlen);
-      if(!u->path) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
+      if(!u->path)
+        return CURLUE_OUT_OF_MEMORY;
       path = u->path;
     }
     else if(flags & CURLU_URLENCODE)
@@ -1231,20 +1174,95 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
       /* remove ../ and ./ sequences according to RFC3986 */
       char *dedot;
       int err = dedotdotify(path, pathlen, &dedot);
-      if(err) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
+      if(err)
+        return CURLUE_OUT_OF_MEMORY;
       if(dedot) {
         curlx_free(u->path);
         u->path = dedot;
       }
     }
   }
+  return CURLUE_OK;
+}
 
-  u->host = curlx_dyn_ptr(&host);
+static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
+{
+  const char *path;
+  size_t pathlen;
+  char schemebuf[MAX_SCHEME_LEN + 1];
+  size_t schemelen = 0;
+  size_t urllen;
+  CURLUcode result = CURLUE_OK;
+  struct dynbuf host;
 
-  return result;
+  DEBUGASSERT(url);
+
+  curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
+
+  result = Curl_junkscan(url, &urllen, !!(flags & CURLU_ALLOW_SPACE));
+  if(result)
+    goto fail;
+
+  schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
+                                   flags & (CURLU_GUESS_SCHEME |
+                                            CURLU_DEFAULT_SCHEME));
+
+  /* handle the file: scheme */
+  if(schemelen && !strcmp(schemebuf, "file"))
+    result = parse_file(url, urllen, u, &host, &path, &pathlen);
+  else {
+    const char *hostp = NULL;
+    size_t hostlen;
+    result = parse_scheme(url, u, schemebuf, schemelen, flags, &hostp);
+    if(result)
+      goto fail;
+
+    /* find the end of the hostname + port number */
+    hostlen = strcspn(hostp, "/?#");
+    path = &hostp[hostlen];
+
+    /* this pathlen also contains the query and the fragment */
+    pathlen = urllen - (path - url);
+    if(hostlen) {
+      result = parse_authority(u, hostp, hostlen, flags, &host,
+                               u->scheme != NULL);
+      if(!result && (flags & CURLU_GUESS_SCHEME) && !u->scheme)
+        result = guess_scheme(u, &host);
+    }
+    else if(flags & CURLU_NO_AUTHORITY) {
+      /* allowed to be empty. */
+      if(curlx_dyn_add(&host, ""))
+        result = CURLUE_OUT_OF_MEMORY;
+    }
+    else
+      result = CURLUE_NO_HOST;
+  }
+  if(!result) {
+    /* The path might at this point contain a fragment and/or a query to
+       handle */
+    const char *fragment = strchr(path, '#');
+    if(fragment) {
+      size_t fraglen = pathlen - (fragment - path);
+      result = handle_fragment(u, fragment, fraglen, flags);
+      /* after this, pathlen still contains the query */
+      pathlen -= fraglen;
+    }
+  }
+  if(!result) {
+    const char *query = memchr(path, '?', pathlen);
+    if(query) {
+      size_t qlen = pathlen - (query - path);
+      result = handle_query(u, query, qlen, flags);
+      pathlen -= qlen;
+    }
+  }
+  if(!result)
+    /* the fragment and query parts are trimmed off from the path */
+    result = handle_path(u, path, pathlen, flags);
+  if(!result) {
+    u->host = curlx_dyn_ptr(&host);
+    return CURLUE_OK;
+  }
 fail:
   curlx_dyn_free(&host);
   free_urlhandle(u);
diff --git a/tests/data/test1560 b/tests/data/test1560
index d766fa3a1a..4d129a871a 100644
--- a/tests/data/test1560
+++ b/tests/data/test1560
@@ -4,6 +4,7 @@
 <keywords>
 unittest
 urlapi
+url
 </keywords>
 </info>