From: Alberto Leiva Popper Date: Thu, 3 Apr 2025 23:45:23 +0000 (-0600) Subject: Figure out URI normalization X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1fc9530782a07c8fd7c5cd54a96f3238d1f694b4;p=thirdparty%2FFORT-validator.git Figure out URI normalization I haven't actually found much incentive to justify the normalization, but libcurl provides a (still flawed as of 8.12.1, but workable) API to do it effortlessly. This is better than the previous implementation, and future-proof enough. --- diff --git a/src/types/url.c b/src/types/url.c index bfc5a720..4bf47d96 100644 --- a/src/types/url.c +++ b/src/types/url.c @@ -1,5 +1,7 @@ #include "types/url.h" +#include + #include "alloc.h" #include "common.h" #include "types/path.h" @@ -17,119 +19,82 @@ url_is_https(char const *url) } /* - * XXX use this: - * - * for (s = str; s[0] != '\0'; s++) { - error = validate_url_character(s[0]); - if (error) - return error; - } - * * @character is an integer because we sometimes receive signed chars, and other * times we get unsigned chars. * Casting a negative char into a unsigned char is undefined behavior. */ -//static int -//validate_url_character(int character) -//{ -// /* -// * RFCs 1738 and 3986 define a very specific range of allowed -// * characters, but I don't think we're that concerned about URL -// * correctness. Validating the URL properly is more involved than simply -// * checking legal characters, anyway. -// * -// * What I really need this validation for is ensure that we won't get -// * any trouble later, when we attempt to map the URL to a path. -// * -// * Sample trouble: Getting UTF-8 characters. Why are they trouble? -// * Because we don't have any guarantees that the system's file name -// * encoding is UTF-8. URIs are not supposed to contain UTF-8 in the -// * first place, so we have no reason to deal with encoding conversion. -// * -// * To be perfectly fair, we have no guarantees that the system's file -// * name encoding is ASCII-compatible either, but I need to hang onto -// * SOMETHING. -// * -// * (Asking users to use UTF-8 is fine, but asking users to use something -// * ASCII-compatible is a little better.) -// * -// * So just make sure that the character is printable ASCII. -// * -// * TODO (next iteration) Consider exhaustive URL validation. -// */ -// return (0x20 <= character && character <= 0x7E) -// ? 0 -// : pr_val_err("URL has non-printable character code '%d'.", character); -//} - -static char * -path_rewind(char const *root, char *cursor) +static int +validate_url_character(int character) { - for (cursor -= 2; root <= cursor; cursor--) - if (*cursor == '/') - return cursor + 1; - return NULL; + return (0x20 <= character && character <= 0x7E) + ? 0 + : pr_val_err("URL has non-printable character code '%d'.", character); } -static bool -has_bad_prefix(char const *url) +/* Not done by libcurl, apparently */ +static int +validate_url_characters(char const *str) { - // XXX what happens if code expects one but url is the other - if (strncmp(url, "rsync://", RPKI_SCHEMA_LEN) && - strncmp(url, "https://", RPKI_SCHEMA_LEN)) - return true; - - /* Disallow the root domain */ - url += RPKI_SCHEMA_LEN; - if (url[0] == '/') - return true; - if (url[0] == '.' && url[1] == '/') - return true; - - // XXX read the standard and reject more bad URLs - return false; + char const *s; + int error; + + for (s = str; s[0] != '\0'; s++) { + error = validate_url_character(s[0]); + if (error) + return error; + } + + return 0; } /* - * Collapses '//' (except from the schema), '.' and '..'. + * See RFC 3986. Basically, "rsync://%61.b/./c/.././%64/." -> "rsync://a.b/d" + * + * This is not actually a perfect normalization, because it's deferred to curl, + * whose implementation is somewhat flawed (at least until version 8.12.1): + * https://github.com/curl/curl/issues/16829 * - * "rsync://a.b/./c//.././/d/." -> "rsync://a.b/d" + * On the other hand, since Fort 2 no longer maps URI paths to literal local + * paths, all normalization does for us is prevent some theoretical redundant + * downloading, so it might not even be that necessary. */ char * url_normalize(char const *url) { - char *normal, *dst; - struct tokenizer tkn; + CURLU *curlu; + char *curl_normal; + char *normal; + CURLUcode err; - if (has_bad_prefix(url)) + if (validate_url_characters(url)) return NULL; - normal = pstrdup(url); - dst = normal + RPKI_SCHEMA_LEN; - token_init(&tkn, url + RPKI_SCHEMA_LEN); - - while (token_next(&tkn)) { - if (tkn.len == 1 && tkn.str[0] == '.') - continue; - if (tkn.len == 2 && tkn.str[0] == '.' && tkn.str[1] == '.') { - dst = path_rewind(normal + RPKI_SCHEMA_LEN, dst); - if (!dst) - goto fail; - continue; - } - strncpy(dst, tkn.str, tkn.len); - dst[tkn.len] = '/'; - dst += tkn.len + 1; - } + curlu = curl_url(); + if (!curlu) + enomem_panic(); + + /* The flag is needed by rsync */ + err = curl_url_set(curlu, CURLUPART_URL, url, CURLU_NON_SUPPORT_SCHEME); + if (err) + goto einval; + err = curl_url_get(curlu, CURLUPART_URL, &curl_normal, 0); + if (err) + goto einval; + + curl_url_cleanup(curlu); - /* Reject URL if there's nothing after the schema. Maybe unnecessary. */ - if (dst == normal + RPKI_SCHEMA_LEN) - goto fail; + if (strncmp(curl_normal, "rsync://", RPKI_SCHEMA_LEN) && + strncmp(curl_normal, "https://", RPKI_SCHEMA_LEN)) { + curl_free(curl_normal); + return NULL; + } - dst[-1] = '\0'; + normal = pstrdup(curl_normal); + curl_free(curl_normal); return normal; -fail: free(normal); +einval: pr_val_err("Error parsing URL: %s", curl_url_strerror(err)); + curl_url_cleanup(curlu); return NULL; } diff --git a/test/Makefile.am b/test/Makefile.am index b220fecf..84d4f18b 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -106,7 +106,7 @@ thread_pool_test_LDADD = ${CHECK_LIBS} check_PROGRAMS += url.test url_test_SOURCES = types/url_test.c -url_test_LDADD = ${CHECK_LIBS} +url_test_LDADD = ${CHECK_LIBS} ${CURL_LIBS} check_PROGRAMS += uthash.test uthash_test_SOURCES = types/uthash_test.c diff --git a/test/types/url_test.c b/test/types/url_test.c index f62bfec5..57bfeee5 100644 --- a/test/types/url_test.c +++ b/test/types/url_test.c @@ -16,18 +16,18 @@ START_TEST(test_normalize) { char *normal; - TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c"); - TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c"); + TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/"); TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d"); - TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c"); + TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////"); TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c"); + TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/"); + TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/"); + TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e/"); + TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/"); TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", "rsync://a.b.c/x/y/z"); - TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x/m/n/o"); + TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/"); + TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o"); ck_assert_ptr_eq(NULL, url_normalize("")); ck_assert_ptr_eq(NULL, url_normalize("h")); @@ -36,19 +36,32 @@ START_TEST(test_normalize) ck_assert_ptr_eq(NULL, url_normalize("https:")); ck_assert_ptr_eq(NULL, url_normalize("https:/")); ck_assert_ptr_eq(NULL, url_normalize("rsync://")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://.")); - ck_assert_ptr_eq(NULL, url_normalize("https://./.")); - ck_assert_ptr_eq(NULL, url_normalize("https://./d")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://..")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://../..")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://../d")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/..")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../..")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../x")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../x/y/z")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/d/e/../../..")); + ck_assert_ptr_eq(NULL, url_normalize("rsync://a.β.c/")); + + TEST_NORMALIZE("rsync://.", "rsync://./"); + TEST_NORMALIZE("https://./.", "https://./"); + TEST_NORMALIZE("https://./d", "https://./d"); + TEST_NORMALIZE("rsync://..", "rsync://../"); + TEST_NORMALIZE("rsync://../..", "rsync://../"); + TEST_NORMALIZE("rsync://../d", "rsync://../d"); + TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x"); + TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z"); + TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/"); ck_assert_ptr_eq(NULL, url_normalize("http://a.b.c/d")); ck_assert_ptr_eq(NULL, url_normalize("abcde://a.b.c/d")); + TEST_NORMALIZE("HTTPS://a.b.c/d", "https://a.b.c/d"); + TEST_NORMALIZE("rSyNc://a.b.c/d", "rsync://a.b.c/d"); + + TEST_NORMALIZE("https://a.b.c:80/d/e", "https://a.b.c:80/d/e"); + /* TEST_NORMALIZE("https://a.b.c:443/d/e", "https://a.b.c/d/e"); */ + TEST_NORMALIZE("https://a.b.c:/d/e", "https://a.b.c/d/e"); + + /* + * XXX make sure libcurl 8.12.2 implements lowercasing domains, + * defaulting 443, and maybe reject UTF-8. + */ } END_TEST