From: Daniel Stenberg Date: Wed, 7 Jan 2026 08:26:14 +0000 (+0100) Subject: urlapi: split parts of parseurl() into sub functions X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5f612acaa1936998677f5e664be706374de93efe;p=thirdparty%2Fcurl.git urlapi: split parts of parseurl() into sub functions - parse_file - parse_scheme - guess_scheme - handle_fragment - handle_query - handle_path Closes #20205 --- diff --git a/lib/urlapi.c b/lib/urlapi.c index e974783d71..a67cf7a05f 100644 --- a/lib/urlapi.c +++ b/lib/urlapi.c @@ -898,314 +898,259 @@ end: return result ? 1 : 0; /* success */ } -static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags) +static CURLUcode parse_file(const char *url, size_t urllen, CURLU *u, + struct dynbuf *host, const char **pathp, + size_t *pathlenp) { const char *path; size_t pathlen; - char *query = NULL; - char *fragment = NULL; - char schemebuf[MAX_SCHEME_LEN + 1]; - size_t schemelen = 0; - size_t urllen; - CURLUcode result = CURLUE_OK; - size_t fraglen = 0; - struct dynbuf host; + bool uncpath = FALSE; + if(urllen <= 6) + /* file:/ is not enough to actually be a complete file: URL */ + return CURLUE_BAD_FILE_URL; - DEBUGASSERT(url); - - curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH); - - result = Curl_junkscan(url, &urllen, !!(flags & CURLU_ALLOW_SPACE)); - if(result) - goto fail; + /* path has been allocated large enough to hold this */ + path = &url[5]; + pathlen = urllen - 5; - schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf), - flags & (CURLU_GUESS_SCHEME | - CURLU_DEFAULT_SCHEME)); - - /* handle the file: scheme */ - if(schemelen && !strcmp(schemebuf, "file")) { - bool uncpath = FALSE; - if(urllen <= 6) { - /* file:/ is not enough to actually be a complete file: URL */ - result = CURLUE_BAD_FILE_URL; - goto fail; - } - - /* path has been allocated large enough to hold this */ - path = &url[5]; - pathlen = urllen - 5; + u->scheme = curlx_strdup("file"); + if(!u->scheme) + return CURLUE_OUT_OF_MEMORY; - u->scheme = curlx_strdup("file"); - if(!u->scheme) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } + /* Extra handling URLs with an authority component (i.e. that start with + * "file://") + * + * We allow omitted hostname (e.g. file:/) -- valid according to + * RFC 8089, but not the (current) WHAT-WG URL spec. + */ + if(path[0] == '/' && path[1] == '/') { + /* swallow the two slashes */ + const char *ptr = &path[2]; - /* Extra handling URLs with an authority component (i.e. that start with - * "file://") + /* + * According to RFC 8089, a file: URL can be reliably dereferenced if: + * + * o it has no/blank hostname, or + * + * o the hostname matches "localhost" (case-insensitively), or + * + * o the hostname is a FQDN that resolves to this machine, or * - * We allow omitted hostname (e.g. file:/) -- valid according to - * RFC 8089, but not the (current) WHAT-WG URL spec. + * o it is an UNC String transformed to an URI (Windows only, RFC 8089 + * Appendix E.3). + * + * For brevity, we only consider URLs with empty, "localhost", or + * "127.0.0.1" hostnames as local, otherwise as an UNC String. + * + * Additionally, there is an exception for URLs with a Windows drive + * letter in the authority (which was accidentally omitted from RFC 8089 + * Appendix E, but believe me, it was meant to be there. --MK) */ - if(path[0] == '/' && path[1] == '/') { - /* swallow the two slashes */ - const char *ptr = &path[2]; - - /* - * According to RFC 8089, a file: URL can be reliably dereferenced if: - * - * o it has no/blank hostname, or - * - * o the hostname matches "localhost" (case-insensitively), or - * - * o the hostname is a FQDN that resolves to this machine, or - * - * o it is an UNC String transformed to an URI (Windows only, RFC 8089 - * Appendix E.3). - * - * For brevity, we only consider URLs with empty, "localhost", or - * "127.0.0.1" hostnames as local, otherwise as an UNC String. - * - * Additionally, there is an exception for URLs with a Windows drive - * letter in the authority (which was accidentally omitted from RFC 8089 - * Appendix E, but believe me, it was meant to be there. --MK) - */ - if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) { - /* the URL includes a hostname, it must match "localhost" or - "127.0.0.1" to be valid */ - if(checkprefix("localhost/", ptr) || - checkprefix("127.0.0.1/", ptr)) { - ptr += 9; /* now points to the slash after the host */ - } - else { + if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) { + /* the URL includes a hostname, it must match "localhost" or + "127.0.0.1" to be valid */ + if(checkprefix("localhost/", ptr) || + checkprefix("127.0.0.1/", ptr)) { + ptr += 9; /* now points to the slash after the host */ + } + else { #ifdef _WIN32 - size_t len; - - /* the hostname, NetBIOS computer name, can not contain disallowed - chars, and the delimiting slash character must be appended to the - hostname */ - path = strpbrk(ptr, "/\\:*?\"<>|"); - if(!path || *path != '/') { - result = CURLUE_BAD_FILE_URL; - goto fail; - } - - len = path - ptr; - if(len) { - CURLcode code = curlx_dyn_addn(&host, ptr, len); - if(code) { - result = cc2cu(code); - goto fail; - } - uncpath = TRUE; - } + size_t len; + + /* the hostname, NetBIOS computer name, can not contain disallowed + chars, and the delimiting slash character must be appended to the + hostname */ + path = strpbrk(ptr, "/\\:*?\"<>|"); + if(!path || *path != '/') + return CURLUE_BAD_FILE_URL; + + len = path - ptr; + if(len) { + CURLcode code = curlx_dyn_addn(host, ptr, len); + if(code) + return cc2cu(code); + uncpath = TRUE; + } - ptr -= 2; /* now points to the // before the host in UNC */ + ptr -= 2; /* now points to the // before the host in UNC */ #else - /* Invalid file://hostname/, expected localhost or 127.0.0.1 or - none */ - result = CURLUE_BAD_FILE_URL; - goto fail; + /* Invalid file://hostname/, expected localhost or 127.0.0.1 or + none */ + return CURLUE_BAD_FILE_URL; #endif - } } - - path = ptr; - pathlen = urllen - (ptr - url); } - if(!uncpath) - /* no host for file: URLs by default */ - curlx_dyn_reset(&host); + path = ptr; + pathlen = urllen - (ptr - url); + } + + if(!uncpath) + /* no host for file: URLs by default */ + curlx_dyn_reset(host); #if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__) - /* Do not allow Windows drive letters when not in Windows. - * This catches both "file:/c:" and "file:c:" */ - if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) || - STARTS_WITH_URL_DRIVE_PREFIX(path)) { - /* File drive letters are only accepted in MS-DOS/Windows */ - result = CURLUE_BAD_FILE_URL; - goto fail; - } + /* Do not allow Windows drive letters when not in Windows. + * This catches both "file:/c:" and "file:c:" */ + if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) || + STARTS_WITH_URL_DRIVE_PREFIX(path)) { + /* File drive letters are only accepted in MS-DOS/Windows */ + return CURLUE_BAD_FILE_URL; + } #else - /* If the path starts with a slash and a drive letter, ditch the slash */ - if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) { - /* This cannot be done with strcpy, as the memory chunks overlap! */ - path++; - pathlen--; - } + /* If the path starts with a slash and a drive letter, ditch the slash */ + if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) { + /* This cannot be done with strcpy, as the memory chunks overlap! */ + path++; + pathlen--; + } #endif + *pathp = path; + *pathlenp = pathlen; + return CURLUE_OK; +} + +static CURLUcode parse_scheme(const char *url, CURLU *u, char *schemebuf, + size_t schemelen, unsigned int flags, + const char **hostpp) +{ + /* clear path */ + const char *schemep = NULL; + + if(schemelen) { + int i = 0; + const char *p = &url[schemelen + 1]; + while((*p == '/') && (i < 4)) { + p++; + i++; + } + + schemep = schemebuf; + if(!Curl_get_scheme_handler(schemep) && + !(flags & CURLU_NON_SUPPORT_SCHEME)) + return CURLUE_UNSUPPORTED_SCHEME; + + if((i < 1) || (i > 3)) + /* less than one or more than three slashes */ + return CURLUE_BAD_SLASHES; + + *hostpp = p; /* hostname starts here */ } else { - /* clear path */ - const char *schemep = NULL; - const char *hostp; - size_t hostlen; - - if(schemelen) { - int i = 0; - const char *p = &url[schemelen + 1]; - while((*p == '/') && (i < 4)) { - p++; - i++; - } + /* no scheme! */ - schemep = schemebuf; - if(!Curl_get_scheme_handler(schemep) && - !(flags & CURLU_NON_SUPPORT_SCHEME)) { - result = CURLUE_UNSUPPORTED_SCHEME; - goto fail; - } + if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME))) + return CURLUE_BAD_SCHEME; - if((i < 1) || (i > 3)) { - /* less than one or more than three slashes */ - result = CURLUE_BAD_SLASHES; - goto fail; - } - hostp = p; /* hostname starts here */ - } - else { - /* no scheme! */ + if(flags & CURLU_DEFAULT_SCHEME) + schemep = DEFAULT_SCHEME; - if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME))) { - result = CURLUE_BAD_SCHEME; - goto fail; - } - if(flags & CURLU_DEFAULT_SCHEME) - schemep = DEFAULT_SCHEME; + /* + * The URL was badly formatted, let's try without scheme specified. + */ + *hostpp = url; + } - /* - * The URL was badly formatted, let's try without scheme specified. - */ - hostp = url; - } + if(schemep) { + u->scheme = curlx_strdup(schemep); + if(!u->scheme) + return CURLUE_OUT_OF_MEMORY; + } + return CURLUE_OK; +} - if(schemep) { - u->scheme = curlx_strdup(schemep); - if(!u->scheme) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } - } +static CURLUcode guess_scheme(CURLU *u, struct dynbuf *host) +{ + const char *hostname = curlx_dyn_ptr(host); + const char *schemep = NULL; + /* legacy curl-style guess based on hostname */ + if(checkprefix("ftp.", hostname)) + schemep = "ftp"; + else if(checkprefix("dict.", hostname)) + schemep = "dict"; + else if(checkprefix("ldap.", hostname)) + schemep = "ldap"; + else if(checkprefix("imap.", hostname)) + schemep = "imap"; + else if(checkprefix("smtp.", hostname)) + schemep = "smtp"; + else if(checkprefix("pop3.", hostname)) + schemep = "pop3"; + else + schemep = "http"; - /* find the end of the hostname + port number */ - hostlen = strcspn(hostp, "/?#"); - path = &hostp[hostlen]; + u->scheme = curlx_strdup(schemep); + if(!u->scheme) + return CURLUE_OUT_OF_MEMORY; - /* this pathlen also contains the query and the fragment */ - pathlen = urllen - (path - url); - if(hostlen) { + u->guessed_scheme = TRUE; + return CURLUE_OK; +} - result = parse_authority(u, hostp, hostlen, flags, &host, schemelen); +static CURLUcode handle_fragment(CURLU *u, const char *fragment, + size_t fraglen, unsigned int flags) +{ + CURLUcode result; + u->fragment_present = TRUE; + if(fraglen > 1) { + /* skip the leading '#' in the copy but include the terminating null */ + if(flags & CURLU_URLENCODE) { + struct dynbuf enc; + curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); + result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE); if(result) - goto fail; - - if((flags & CURLU_GUESS_SCHEME) && !schemep) { - const char *hostname = curlx_dyn_ptr(&host); - /* legacy curl-style guess based on hostname */ - if(checkprefix("ftp.", hostname)) - schemep = "ftp"; - else if(checkprefix("dict.", hostname)) - schemep = "dict"; - else if(checkprefix("ldap.", hostname)) - schemep = "ldap"; - else if(checkprefix("imap.", hostname)) - schemep = "imap"; - else if(checkprefix("smtp.", hostname)) - schemep = "smtp"; - else if(checkprefix("pop3.", hostname)) - schemep = "pop3"; - else - schemep = "http"; - - u->scheme = curlx_strdup(schemep); - if(!u->scheme) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } - u->guessed_scheme = TRUE; - } - } - else if(flags & CURLU_NO_AUTHORITY) { - /* allowed to be empty. */ - if(curlx_dyn_add(&host, "")) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } + return result; + u->fragment = curlx_dyn_ptr(&enc); } else { - result = CURLUE_NO_HOST; - goto fail; - } - } - - fragment = strchr(path, '#'); - if(fragment) { - fraglen = pathlen - (fragment - path); - u->fragment_present = TRUE; - if(fraglen > 1) { - /* skip the leading '#' in the copy but include the terminating null */ - if(flags & CURLU_URLENCODE) { - struct dynbuf enc; - curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); - result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE); - if(result) - goto fail; - u->fragment = curlx_dyn_ptr(&enc); - } - else { - u->fragment = Curl_memdup0(fragment + 1, fraglen - 1); - if(!u->fragment) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } - } + u->fragment = Curl_memdup0(fragment + 1, fraglen - 1); + if(!u->fragment) + return CURLUE_OUT_OF_MEMORY; } - /* after this, pathlen still contains the query */ - pathlen -= fraglen; } + return CURLUE_OK; +} - query = memchr(path, '?', pathlen); - if(query) { - size_t qlen = fragment ? (size_t)(fragment - query) : - pathlen - (query - path); - pathlen -= qlen; - u->query_present = TRUE; - if(qlen > 1) { - if(flags & CURLU_URLENCODE) { - struct dynbuf enc; - curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); - /* skip the leading question mark */ - result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE); - if(result) - goto fail; - u->query = curlx_dyn_ptr(&enc); - } - else { - u->query = Curl_memdup0(query + 1, qlen - 1); - if(!u->query) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } - } +static CURLUcode handle_query(CURLU *u, const char *query, + size_t qlen, unsigned int flags) +{ + u->query_present = TRUE; + if(qlen > 1) { + if(flags & CURLU_URLENCODE) { + struct dynbuf enc; + CURLUcode result; + curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); + /* skip the leading question mark */ + result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE); + if(result) + return result; + u->query = curlx_dyn_ptr(&enc); } else { - /* single byte query */ - u->query = curlx_strdup(""); - if(!u->query) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } + u->query = Curl_memdup0(query + 1, qlen - 1); + if(!u->query) + return CURLUE_OUT_OF_MEMORY; } } + else { + /* single byte query */ + u->query = curlx_strdup(""); + if(!u->query) + return CURLUE_OUT_OF_MEMORY; + } + return CURLUE_OK; +} +static CURLUcode handle_path(CURLU *u, const char *path, + size_t pathlen, unsigned int flags) +{ + CURLUcode result; if(pathlen && (flags & CURLU_URLENCODE)) { struct dynbuf enc; curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); result = urlencode_str(&enc, path, pathlen, TRUE, FALSE); if(result) - goto fail; + return result; pathlen = curlx_dyn_len(&enc); path = u->path = curlx_dyn_ptr(&enc); } @@ -1217,10 +1162,8 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags) else { if(!u->path) { u->path = Curl_memdup0(path, pathlen); - if(!u->path) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } + if(!u->path) + return CURLUE_OUT_OF_MEMORY; path = u->path; } else if(flags & CURLU_URLENCODE) @@ -1231,20 +1174,95 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags) /* remove ../ and ./ sequences according to RFC3986 */ char *dedot; int err = dedotdotify(path, pathlen, &dedot); - if(err) { - result = CURLUE_OUT_OF_MEMORY; - goto fail; - } + if(err) + return CURLUE_OUT_OF_MEMORY; if(dedot) { curlx_free(u->path); u->path = dedot; } } } + return CURLUE_OK; +} - u->host = curlx_dyn_ptr(&host); +static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags) +{ + const char *path; + size_t pathlen; + char schemebuf[MAX_SCHEME_LEN + 1]; + size_t schemelen = 0; + size_t urllen; + CURLUcode result = CURLUE_OK; + struct dynbuf host; - return result; + DEBUGASSERT(url); + + curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH); + + result = Curl_junkscan(url, &urllen, !!(flags & CURLU_ALLOW_SPACE)); + if(result) + goto fail; + + schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf), + flags & (CURLU_GUESS_SCHEME | + CURLU_DEFAULT_SCHEME)); + + /* handle the file: scheme */ + if(schemelen && !strcmp(schemebuf, "file")) + result = parse_file(url, urllen, u, &host, &path, &pathlen); + else { + const char *hostp = NULL; + size_t hostlen; + result = parse_scheme(url, u, schemebuf, schemelen, flags, &hostp); + if(result) + goto fail; + + /* find the end of the hostname + port number */ + hostlen = strcspn(hostp, "/?#"); + path = &hostp[hostlen]; + + /* this pathlen also contains the query and the fragment */ + pathlen = urllen - (path - url); + if(hostlen) { + result = parse_authority(u, hostp, hostlen, flags, &host, + u->scheme != NULL); + if(!result && (flags & CURLU_GUESS_SCHEME) && !u->scheme) + result = guess_scheme(u, &host); + } + else if(flags & CURLU_NO_AUTHORITY) { + /* allowed to be empty. */ + if(curlx_dyn_add(&host, "")) + result = CURLUE_OUT_OF_MEMORY; + } + else + result = CURLUE_NO_HOST; + } + if(!result) { + /* The path might at this point contain a fragment and/or a query to + handle */ + const char *fragment = strchr(path, '#'); + if(fragment) { + size_t fraglen = pathlen - (fragment - path); + result = handle_fragment(u, fragment, fraglen, flags); + /* after this, pathlen still contains the query */ + pathlen -= fraglen; + } + } + if(!result) { + const char *query = memchr(path, '?', pathlen); + if(query) { + size_t qlen = pathlen - (query - path); + result = handle_query(u, query, qlen, flags); + pathlen -= qlen; + } + } + if(!result) + /* the fragment and query parts are trimmed off from the path */ + result = handle_path(u, path, pathlen, flags); + if(!result) { + u->host = curlx_dyn_ptr(&host); + return CURLUE_OK; + } fail: curlx_dyn_free(&host); free_urlhandle(u); diff --git a/tests/data/test1560 b/tests/data/test1560 index d766fa3a1a..4d129a871a 100644 --- a/tests/data/test1560 +++ b/tests/data/test1560 @@ -4,6 +4,7 @@ unittest urlapi +url