From: Alberto Leiva Popper Date: Thu, 8 May 2025 22:33:54 +0000 (-0600) Subject: URI normalization: Allow and normalize IP literals in host X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ad9fd04e63ee5f6a023f2c5d2782270995bd744d;p=thirdparty%2FFORT-validator.git URI normalization: Allow and normalize IP literals in host --- diff --git a/src/types/uri.c b/src/types/uri.c index 9d010c4e..c1070a1e 100644 --- a/src/types/uri.c +++ b/src/types/uri.c @@ -7,10 +7,6 @@ #include "log.h" #include "types/path.h" -/* - * XXX IPv6 addresses - */ - #define URI_ALLOW_UNKNOWN_SCHEME (1 << 0) static error_msg EM_SCHEME_EMPTY = "Scheme seems empty"; @@ -26,6 +22,7 @@ static error_msg EM_USERINFO_BADCHR = "Illegal character in userinfo component"; static error_msg EM_USERINFO_DISALLOWED = "Protocol disallows userinfo"; static error_msg EM_HOST_BADCHR = "Illegal character in host component"; static error_msg EM_HOST_EMPTY = "Protocol disallows empty host"; +static error_msg EM_HOST_LITERAL = "Unparseable IP literal in the host"; static error_msg EM_PORT_BADCHR = "Illegal non-digit character in port component"; static error_msg EM_PORT_RANGE = "Port value is out of range"; static error_msg EM_PATH_BADCHR = "Illegal character in path component"; @@ -119,6 +116,12 @@ is_digit(unsigned char chr) return '0' <= chr && chr <= '9'; } +static bool +is_hexdigit(unsigned char chr) +{ + return is_digit(chr) || is_lowercase_hex(chr) || is_uppercase_hex(chr); +} + static bool is_symbol(unsigned char chr, char const *symbols) { @@ -156,26 +159,44 @@ static void collect_authority(unsigned char const *auth, unsigned char const **at, unsigned char const **colon, unsigned char const **end) { + bool v6skip; + *at = NULL; *colon = NULL; + v6skip = false; for (; true; auth++) { - switch (auth[0]) { - case '/': - case '?': - case '#': - case '\0': - *end = auth; - return; - case '@': - if ((*at) == NULL) { - *colon = NULL; /* Was a password if not null */ - *at = auth; + if (v6skip) { + switch (auth[0]) { + case ']': + v6skip = false; + continue; + case '\0': + *end = auth; + return; + } + } else { + switch (auth[0]) { + case '/': + case '?': + case '#': + case '\0': + *end = auth; + return; + case '@': + if ((*at) == NULL) { + /* Was a password if not null */ + *colon = NULL; + *at = auth; + } + break; + case ':': + *colon = auth; + break; + case '[': + v6skip = true; + break; } - break; - case ':': - *colon = auth; - break; } } } @@ -211,7 +232,7 @@ collect_fragment(unsigned char const *fragment, unsigned char const **end) } static error_msg -normalize_scheme(struct uri_buffer *buf, struct sized_ustring *scheme) +normalize_scheme(struct uri_buffer *buf, struct sized_ustring const *scheme) { unsigned char chr; array_index c; @@ -275,7 +296,7 @@ uchar2hex(unsigned char chr, unsigned int *hex) } static error_msg -approve_pct_encoded(struct uri_buffer *buf, struct sized_ustring *sstr, +approve_pct_encoded(struct uri_buffer *buf, struct sized_ustring const *sstr, array_index *offset) { array_index off; @@ -332,7 +353,7 @@ approve_bin(struct uri_buffer *buf, unsigned char chr) } static error_msg -approve_utf8(struct uri_buffer *buf, struct sized_ustring *sstr, +approve_utf8(struct uri_buffer *buf, struct sized_ustring const *sstr, array_index *offset) { array_index off; @@ -385,7 +406,7 @@ approve_utf8(struct uri_buffer *buf, struct sized_ustring *sstr, } static error_msg -normalize_userinfo(struct uri_buffer *buf, struct sized_ustring *userinfo) +normalize_userinfo(struct uri_buffer *buf, struct sized_ustring const *userinfo) { array_index c; unsigned char chr; @@ -418,6 +439,94 @@ normalize_userinfo(struct uri_buffer *buf, struct sized_ustring *userinfo) return NULL; } +static error_msg +normalize_ipvfuture(struct uri_buffer *buf, struct sized_ustring const *ipf) +{ + array_index i; + unsigned char chr; + bool found_hex; + + approve_chara(buf, 'v'); + + found_hex = false; + for (i = 1; i < ipf->len; i++) { + chr = ipf->str[i]; + if (is_hexdigit(chr)) { + approve_chara(buf, chr); + found_hex = true; + } else if (chr == '.') + goto value; + else + return EM_HOST_LITERAL; + } + + return EM_HOST_LITERAL; + +value: if (!found_hex) + return EM_HOST_LITERAL; + approve_chara(buf, '.'); + i++; + if (i == ipf->len) + return EM_HOST_LITERAL; + for (; i < ipf->len; i++) { + chr = ipf->str[i]; + if (is_unreserved(chr) || is_subdelim(chr) || chr == ':') + approve_chara(buf, chr); + else + return EM_HOST_LITERAL; + } + + return NULL; +} + +static error_msg +normalize_ipv6(struct uri_buffer *buf, struct sized_ustring const *v6) +{ + char dirty[INET6_ADDRSTRLEN]; + struct in6_addr addr; + char clean[INET6_ADDRSTRLEN]; + array_index i; + + if (v6->len > (INET6_ADDRSTRLEN - 1)) + return EM_HOST_LITERAL; + + memcpy(dirty, v6->str, v6->len); + dirty[v6->len] = '\0'; + if (inet_pton(AF_INET6, dirty, &addr) != 1) + return EM_HOST_LITERAL; + + if (inet_ntop(AF_INET6, &addr, clean, INET6_ADDRSTRLEN) == NULL) + return EM_HOST_LITERAL; + + for (i = 0; clean[i] != '\0'; i++) + approve_chara(buf, clean[i]); + + return NULL; +} + +static error_msg +normalize_ip_literal(struct uri_buffer *buf, struct sized_ustring const *lit) +{ + struct sized_ustring content; + error_msg error; + + if (lit->len < 3) + return EM_HOST_LITERAL; + if (lit->str[lit->len - 1] != ']') + return EM_HOST_LITERAL; + + content.str = lit->str + 1; + content.len = lit->len - 2; + + approve_chara(buf, '['); + error = (content.str[0] == 'v') + ? normalize_ipvfuture(buf, &content) + : normalize_ipv6(buf, &content); + approve_chara(buf, ']'); + + return error; +} + static error_msg normalize_host(struct uri_buffer *buf, struct sized_ustring *host) { @@ -425,6 +534,12 @@ normalize_host(struct uri_buffer *buf, struct sized_ustring *host) unsigned char chr; error_msg error; + if (host->len == 0) + return NULL; + + if (host->str[0] == '[') + return normalize_ip_literal(buf, host); + for (c = 0; c < host->len; c++) { chr = host->str[c]; if (is_uppercase(chr)) @@ -449,7 +564,7 @@ normalize_host(struct uri_buffer *buf, struct sized_ustring *host) } static error_msg -normalize_port(struct uri_buffer *buf, struct sized_ustring *port, +normalize_port(struct uri_buffer *buf, struct sized_ustring const *port, struct schema_metadata const *schema) { array_index c; @@ -489,7 +604,7 @@ strnchr(unsigned char const *str, size_t n, unsigned char chr) } static bool -next_segment(struct sized_ustring *path, struct sized_ustring *segment) +next_segment(struct sized_ustring const *path, struct sized_ustring *segment) { segment->str += segment->len + 1; if (segment->str > (path->str + path->len)) @@ -508,7 +623,7 @@ rewind_buffer(struct uri_buffer *buf, size_t limit) } static error_msg -normalize_path(struct uri_buffer *buf, struct sized_ustring *path) +normalize_path(struct uri_buffer *buf, struct sized_ustring const *path) { struct sized_ustring segment; array_index i; @@ -562,7 +677,7 @@ normalize_path(struct uri_buffer *buf, struct sized_ustring *path) } static error_msg -normalize_post_path(struct uri_buffer *buf, struct sized_ustring *post, +normalize_post_path(struct uri_buffer *buf, struct sized_ustring const *post, char prefix) { array_index c; diff --git a/test/types/uri_test.c b/test/types/uri_test.c index 3278e609..bac7831e 100644 --- a/test/types/uri_test.c +++ b/test/types/uri_test.c @@ -54,7 +54,7 @@ END_TEST (unsigned char *)dirty, URI_ALLOW_UNKNOWN_SCHEME, &normal \ )); -START_TEST(awkward_dot_dotting) +START_TEST(test_awkward_dot_dotting) { char *normal; @@ -70,7 +70,46 @@ START_TEST(awkward_dot_dotting) } END_TEST -START_TEST(test_port) +START_TEST(test_3986_host) +{ + char *normal; + + printf("rfc3986#3.2.2: Host\n"); + + TEST_NORMALIZE("https://[2001:db8::1]/", "https://[2001:db8::1]/"); + TEST_NORMALIZE("https://[2001:0db8::1]/", "https://[2001:db8::1]/"); + TEST_NORMALIZE("https://[2001:db8::0001]:1234/", "https://[2001:db8::1]:1234/"); + TEST_NORMALIZE("https://[::]/", "https://[::]/"); + TEST_NORMALIZE("https://[0::]/", "https://[::]/"); + TEST_NORMALIZE("https://[2001:db8:0:0:0:0:0:1]/", "https://[2001:db8::1]/"); + + TEST_NORMALIZE_FAIL("https://[]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[2001:db8::/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[2001:db8::]a/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1g]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v.]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.]/", EM_HOST_LITERAL); + TEST_NORMALIZE("https://[v1.1]/", "https://[v1.1]/"); + TEST_NORMALIZE("https://[v19.a-z.A_Z~0:9!$&'()*+,;=]/", "https://[v19.a-z.A_Z~0:9!$&'()*+,;=]/"); + TEST_NORMALIZE_FAIL("https://[v1.%]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.%31]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1./]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.?]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.#]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.[]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.]]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[v1.@]/", EM_HOST_LITERAL); + TEST_NORMALIZE("https://[FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:255.255.255.255]/", "https://[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]/"); + TEST_NORMALIZE_FAIL("https://[FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:255.255.255.2555]/", EM_HOST_LITERAL); + TEST_NORMALIZE_FAIL("https://[potato]/", EM_HOST_LITERAL); + + TEST_NORMALIZE_FAIL("https://[2001:db8::1][2001:db8::1]/", EM_HOST_LITERAL); +} +END_TEST + +START_TEST(test_3986_port) { char *normal; @@ -84,7 +123,7 @@ START_TEST(test_port) } END_TEST -START_TEST(pct_encoding) +START_TEST(test_3986_pct_encoding) { char *normal; @@ -285,7 +324,7 @@ START_TEST(test_unknown_protocols) } END_TEST -START_TEST(reserved_unchanged) +START_TEST(test_3986_reserved) { char *normal; @@ -307,7 +346,7 @@ START_TEST(reserved_unchanged) } END_TEST -START_TEST(test_query) +START_TEST(test_3986_query) { char *normal; @@ -331,7 +370,7 @@ START_TEST(test_query) } END_TEST -START_TEST(test_fragment) +START_TEST(test_3986_fragment) { char *normal; @@ -356,7 +395,7 @@ START_TEST(test_fragment) } END_TEST -START_TEST(lowercase_scheme_and_host) +START_TEST(test_3986_lowercase_scheme_and_host) { char *normal; @@ -372,7 +411,7 @@ START_TEST(lowercase_scheme_and_host) } END_TEST -START_TEST(decode_unreserved_characters) +START_TEST(test_3986_unreserved_characters) { char *normal; @@ -383,7 +422,7 @@ START_TEST(decode_unreserved_characters) } END_TEST -START_TEST(path_segment_normalization) +START_TEST(test_3986_path_segment_normalization) { char *normal; @@ -415,7 +454,7 @@ START_TEST(path_segment_normalization) } END_TEST -START_TEST(all_the_above_combined) +START_TEST(test_3986_aggregated) { char *normal; @@ -426,7 +465,7 @@ START_TEST(all_the_above_combined) } END_TEST -START_TEST(scheme_based_normalization) +START_TEST(test_3986_scheme_based_normalization) { char *normal; @@ -437,7 +476,7 @@ START_TEST(scheme_based_normalization) } END_TEST -START_TEST(https_grammar) +START_TEST(test_https_grammar) { char *normal; @@ -458,7 +497,7 @@ START_TEST(https_grammar) } END_TEST -START_TEST(https_default_port) +START_TEST(test_https_default_port) { char *normal; @@ -477,7 +516,7 @@ START_TEST(https_default_port) } END_TEST -START_TEST(disallow_http_empty_host) +START_TEST(test_https_disallow_empty_host) { char *normal; @@ -497,7 +536,7 @@ START_TEST(disallow_http_empty_host) } END_TEST -START_TEST(provide_default_path) +START_TEST(test_https_provide_default_path) { char *normal; @@ -508,7 +547,7 @@ START_TEST(provide_default_path) } END_TEST -START_TEST(scheme_and_host_lowercase) +START_TEST(test_https_scheme_and_host_lowercase) { char *normal; @@ -520,7 +559,7 @@ START_TEST(scheme_and_host_lowercase) } END_TEST -START_TEST(not_reserved_not_pct_encoded) +START_TEST(test_https_not_reserved_not_pct_encoded) { char *normal; @@ -572,7 +611,7 @@ START_TEST(not_reserved_not_pct_encoded) } END_TEST -START_TEST(aggregated_423) +START_TEST(test_https_aggregated) { char *normal; @@ -584,7 +623,7 @@ START_TEST(aggregated_423) } END_TEST -START_TEST(disallow_https_userinfo) +START_TEST(test_https_disallow_userinfo) { char *normal; @@ -596,7 +635,7 @@ START_TEST(disallow_https_userinfo) } END_TEST -START_TEST(rsync_grammar) +START_TEST(test_rsync_grammar) { char *normal; @@ -621,7 +660,7 @@ START_TEST(rsync_grammar) TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r"); TEST_NORMALIZE("rsync://user@a.b.c:1234", "rsync://user@a.b.c:1234/"); TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/"); - TEST_NORMALIZE_FAIL("rsync://[@a.b.c", EM_USERINFO_BADCHR); + TEST_NORMALIZE_FAIL("rsync://]@a.b.c", EM_USERINFO_BADCHR); /* hier-part = path-absolute */ /* ie. "rsync:/" [ pchar+ ( "/" pchar* )* ] */ @@ -649,7 +688,7 @@ START_TEST(rsync_grammar) } END_TEST -START_TEST(rsync_default_port) +START_TEST(test_rsync_default_port) { char *normal; @@ -669,35 +708,36 @@ static Suite *create_suite(void) misc = tcase_create("Miscellaneous"); tcase_add_test(misc, test_rewind); tcase_add_test(misc, test_unknown_protocols); - tcase_add_test(misc, awkward_dot_dotting); + tcase_add_test(misc, test_awkward_dot_dotting); tcase_add_test(misc, test_same_origin); tcase_add_test(misc, test_utf8); generic = tcase_create("RFC 3986 (generic URI)"); - tcase_add_test(generic, pct_encoding); - tcase_add_test(generic, reserved_unchanged); - tcase_add_test(generic, test_port); - tcase_add_test(generic, test_query); - tcase_add_test(generic, test_fragment); - tcase_add_test(generic, lowercase_scheme_and_host); - tcase_add_test(generic, decode_unreserved_characters); - tcase_add_test(generic, path_segment_normalization); - tcase_add_test(generic, all_the_above_combined); - tcase_add_test(generic, scheme_based_normalization); + tcase_add_test(generic, test_3986_pct_encoding); + tcase_add_test(generic, test_3986_reserved); + tcase_add_test(generic, test_3986_host); + tcase_add_test(generic, test_3986_port); + tcase_add_test(generic, test_3986_query); + tcase_add_test(generic, test_3986_fragment); + tcase_add_test(generic, test_3986_lowercase_scheme_and_host); + tcase_add_test(generic, test_3986_unreserved_characters); + tcase_add_test(generic, test_3986_path_segment_normalization); + tcase_add_test(generic, test_3986_aggregated); + tcase_add_test(generic, test_3986_scheme_based_normalization); https = tcase_create("RFC 9110 (https)"); - tcase_add_test(https, https_grammar); - tcase_add_test(https, https_default_port); - tcase_add_test(https, disallow_http_empty_host); - tcase_add_test(https, provide_default_path); - tcase_add_test(https, scheme_and_host_lowercase); - tcase_add_test(https, not_reserved_not_pct_encoded); - tcase_add_test(https, aggregated_423); - tcase_add_test(https, disallow_https_userinfo); + tcase_add_test(https, test_https_grammar); + tcase_add_test(https, test_https_default_port); + tcase_add_test(https, test_https_disallow_empty_host); + tcase_add_test(https, test_https_provide_default_path); + tcase_add_test(https, test_https_scheme_and_host_lowercase); + tcase_add_test(https, test_https_not_reserved_not_pct_encoded); + tcase_add_test(https, test_https_aggregated); + tcase_add_test(https, test_https_disallow_userinfo); rsync = tcase_create("RFC 5781 (rsync)"); - tcase_add_test(rsync, rsync_grammar); - tcase_add_test(rsync, rsync_default_port); + tcase_add_test(rsync, test_rsync_grammar); + tcase_add_test(rsync, test_rsync_default_port); suite = suite_create("url"); suite_add_tcase(suite, misc);