From: Alberto Leiva Popper Date: Tue, 6 May 2025 23:43:47 +0000 (-0600) Subject: Implement URI normalization X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9dda124b5eb270382cdbc4fbe47992ff9bfa87c5;p=thirdparty%2FFORT-validator.git Implement URI normalization Stop deferring this to curl; it's not bound to come out soon. --- diff --git a/src/rrdp.c b/src/rrdp.c index 9972769d..3d99c3a0 100644 --- a/src/rrdp.c +++ b/src/rrdp.c @@ -554,7 +554,7 @@ handle_publish(xmlTextReaderPtr reader, struct parser_args *args) /* Parsing done */ - pr_clutter("Publish %s", logv_filename(tag.meta.uri)); + pr_clutter("Publish %s", logv_filename(uri_str(&tag.meta.uri))); file = state_find_file(args->state, &tag.meta.uri); @@ -629,7 +629,7 @@ handle_withdraw(xmlTextReaderPtr reader, struct parser_args *args) goto end; } - pr_clutter("Withdraw %s", logv_filename(tag.meta.uri)); + pr_clutter("Withdraw %s", logv_filename(uri_str(&tag.meta.uri))); file = state_find_file(args->state, &tag.meta.uri); diff --git a/src/types/uri.c b/src/types/uri.c index 98bb6422..886f91a0 100644 --- a/src/types/uri.c +++ b/src/types/uri.c @@ -1,6 +1,5 @@ #include "types/uri.h" -#include #include #include "alloc.h" @@ -8,106 +7,679 @@ #include "log.h" #include "types/path.h" -bool -uri_is_rsync(struct uri const *url) +#define URI_ALLOW_UNKNOWN_SCHEME (1 << 1) + +struct sized_string { + char const *str; + size_t len; +}; + +struct uri_buffer { + char *dst; + array_index d; + size_t capacity; +}; + +struct schema_metadata { + unsigned int default_port; + bool allow_userinfo; + bool allow_empty_host; + bool allow_query; + bool allow_fragment; +}; + +struct schema_metadata const HTTPS = { + .default_port = 443, + .allow_userinfo = false, + .allow_empty_host = false, + .allow_query = true, + .allow_fragment = true, +}; + +struct schema_metadata const RSYNC = { + .default_port = 873, + .allow_userinfo = true, + .allow_empty_host = true, + .allow_query = false, + .allow_fragment = false, +}; + 
+static bool +is_proto(struct sized_string *scheme, char const *proto) { - return str_starts_with(url->_str, "rsync://"); + return strncasecmp(scheme->str, proto, scheme->len) == 0; } -bool -uri_is_https(struct uri const *url) +static struct schema_metadata const * +get_metadata(struct sized_string *scheme) { - return str_starts_with(url->_str, "https://"); + if (scheme->len != 5) + return NULL; + + if (is_proto(scheme, "https")) + return &HTTPS; + if (is_proto(scheme, "rsync")) + return &RSYNC; + + return NULL; } -/* - * @character is an integer because we sometimes receive signed chars, and other - * times we get unsigned chars. - * Casting a negative char into a unsigned char is undefined behavior. - */ -static int -validate_url_character(int character) +static bool +is_lowercase(char chr) +{ + return 'a' <= chr && chr <= 'z'; +} + +static bool +is_uppercase(char chr) +{ + return 'A' <= chr && chr <= 'Z'; +} + +static bool +is_digit(char chr) +{ + return '0' <= chr && chr <= '9'; +} + +static bool +is_symbol(char chr, char const *symbols) +{ + for (; symbols[0] != '\0'; symbols++) + if (chr == symbols[0]) + return true; + return false; +} + +static char +to_lowercase(char uppercase) +{ + return uppercase - ('A' - 'a'); +} + +static char +to_uppercase(char chr) { - return (0x20 <= character && character <= 0x7E) - ? 0 - : pr_val_err("URL has non-printable character code '%d'.", character); + return is_lowercase(chr) ? (chr + ('A' - 'a')) : chr; } -/* Not done by libcurl, apparently */ -static int -validate_url_characters(char const *str) +static bool +invalid(char const *errmsg) { - char const *s; - int error; + printf("%s\n", errmsg); + return false; +} - for (s = str; s[0] != '\0'; s++) { - error = validate_url_character(s[0]); - if (error) - return error; +static void +approve_chara(struct uri_buffer *buf, char chr) +{ + if (buf->d >= buf->capacity) { + /* It seems this is dead code. 
*/ + buf->capacity += 16; + buf->dst = prealloc(buf->dst, buf->capacity); } - return 0; + buf->dst[buf->d++] = chr; +} + +static bool +collect_authority(char const *auth, char const **at, char const **colon, + char const **end) +{ + *at = NULL; + *colon = NULL; + + for (; true; auth++) { + switch (auth[0]) { + case '/': + case '?': + case '#': + case '\0': + *end = auth; + return true; + case '@': + if ((*at) == NULL) { + *colon = NULL; /* Was a password if not null */ + *at = auth; + } + break; + case ':': + *colon = auth; + break; + } + } +} + +static void +collect_path(char const *path, char const **end) +{ + for (; true; path++) + if (path[0] == '\0' || path[0] == '?' || path[0] == '#') { + *end = path; + return; + } +} + +static void +collect_query(char const *query, char const **end) +{ + for (; true; query++) + if (query[0] == '\0' || query[0] == '#') { + *end = query; + return; + } +} + +static void +collect_fragment(char const *fragment, char const **end) +{ + for (; true; fragment++) + if (fragment[0] == '\0') { + *end = fragment; + return; + } +} + +static bool +normalize_scheme(struct uri_buffer *buf, struct sized_string *scheme) +{ + char chr; + array_index c; + + if (scheme->len == 0) + return invalid("Scheme seems empty."); + + chr = scheme->str[0]; + if (is_lowercase(chr)) + approve_chara(buf, chr); + else if (is_uppercase(chr)) + approve_chara(buf, to_lowercase(chr)); + else + return invalid("First character is not a letter."); + + for (c = 1; c < scheme->len; c++) { + chr = scheme->str[c]; + if (is_lowercase(chr) || is_digit(chr) || is_symbol(chr, "+.-")) + approve_chara(buf, chr); + else if (is_uppercase(chr)) + approve_chara(buf, to_lowercase(chr)); + else + return invalid("Schema character is not letter, digit, plus, period or hyphen."); + } + + approve_chara(buf, ':'); + approve_chara(buf, '/'); + approve_chara(buf, '/'); + return true; +} + +static bool +is_unreserved(char chr) +{ + return is_lowercase(chr) + || is_uppercase(chr) + || 
is_digit(chr) + || is_symbol(chr, "-._~"); +} + +static bool +is_subdelim(char chr) +{ + return is_symbol(chr, "!$&'()*+,;="); +} + +static bool +char2hex(char chr, unsigned int *hex) +{ + if (is_digit(chr)) { + *hex = chr - '0'; + return true; + } + if (is_uppercase(chr)) { + *hex = chr - 'A' + 10; + return true; + } + if (is_lowercase(chr)) { + *hex = chr - 'a' + 10; + return true; + } + + printf("Invalid hex digit: %c\n", chr); + return invalid("Invalid hexadecimal digit."); +} + +static bool +approve_pct_encoded(struct uri_buffer *buf, struct sized_string *sstr, + array_index *offset) +{ + array_index off; + unsigned int hex1; + unsigned int hex2; + unsigned int val; + + off = *offset; + + if (sstr->len - off < 3) + return invalid("Unterminated %-encoding."); + + if (!char2hex(sstr->str[off + 1], &hex1)) + return false; + if (!char2hex(sstr->str[off + 2], &hex2)) + return false; + + val = (hex1 << 4) | hex2; + + if (is_unreserved(val)) { + approve_chara(buf, val); + *offset += 2; + return true; + } + + approve_chara(buf, '%'); + approve_chara(buf, to_uppercase(sstr->str[off + 1])); + approve_chara(buf, to_uppercase(sstr->str[off + 2])); + *offset += 2; + return true; +} + +static bool +handle_pchar(struct uri_buffer *buf, struct sized_string *sstr, + array_index *offset) +{ + char chr = sstr->str[*offset]; + + if (is_unreserved(chr)) + approve_chara(buf, chr); + else if (chr == '%') + approve_pct_encoded(buf, sstr, offset); + else if (is_subdelim(chr)) + approve_chara(buf, chr); + else if (chr == ':' || chr == '@') + approve_chara(buf, chr); + else + return false; + return true; +} + +static bool +normalize_userinfo(struct uri_buffer *buf, struct sized_string *userinfo) +{ + array_index c; + char chr; + + if (userinfo->len == 0) + return true; + + for (c = 0; c < userinfo->len; c++) { + chr = userinfo->str[c]; + if (is_unreserved(chr)) + approve_chara(buf, chr); + else if (chr == '%') { + if (!approve_pct_encoded(buf, userinfo, &c)) + return false; + } else if 
(is_subdelim(chr)) + approve_chara(buf, chr); + else if (chr == ':') + approve_chara(buf, chr); + else + return invalid("Illegal character in userinfo section."); + } + + approve_chara(buf, '@'); + return true; +} + +static bool +normalize_host(struct uri_buffer *buf, struct sized_string *host) +{ + array_index c; + char chr; + + for (c = 0; c < host->len; c++) { + chr = host->str[c]; + if (is_uppercase(chr)) + approve_chara(buf, to_lowercase(chr)); + else if (is_unreserved(chr)) + approve_chara(buf, chr); + else if (chr == '%') { + if (!approve_pct_encoded(buf, host, &c)) + return false; + } else if (is_subdelim(chr)) + approve_chara(buf, chr); + else + return invalid("Illegal character in host section."); + } + + return true; +} + +static bool +normalize_port(struct uri_buffer *buf, struct sized_string *port, + struct schema_metadata const *schema) +{ + array_index c; + char chr; + unsigned int portnum; + + if (port->len == 0) + return true; + + portnum = 0; + for (c = 0; c < port->len; c++) { + chr = port->str[c]; + if (!is_digit(chr)) + return invalid("Illegal non-digit character in port section."); + portnum = 10 * portnum + (chr - '0'); + if (portnum > 0xFFFF) + return invalid("Port value is too large."); + } + + if (schema && (portnum == schema->default_port)) + return true; + + approve_chara(buf, ':'); + for (c = 0; c < port->len; c++) + approve_chara(buf, port->str[c]); + return true; +} + +static char const * +strnchr(char const *str, size_t n, char chr) +{ + array_index s; + for (s = 0; s < n; s++) + if (str[s] == chr) + break; + return str + s; +} + +static bool +next_segment(struct sized_string *path, struct sized_string *segment) +{ + segment->str += segment->len + 1; + if (segment->str > (path->str + path->len)) + return false; + segment->len = strnchr(segment->str, + path->len - (segment->str - path->str), + '/') - segment->str; + return true; +} + +static void +rewind_buffer(struct uri_buffer *buf, size_t limit) +{ + while ((buf->d > limit) && 
(buf->dst[--buf->d] != '/')) + ; +} + +static bool +normalize_path(struct uri_buffer *buf, struct sized_string *path) +{ + struct sized_string segment; + array_index i; + char chr; + size_t limit; + + if (path->len == 0) { + approve_chara(buf, '/'); + return true; + } + + segment.str = path->str; + segment.len = 0; + limit = buf->d; + + while (next_segment(path, &segment)) { + approve_chara(buf, '/'); + for (i = 0; i < segment.len; i++) { + chr = segment.str[i]; + if (is_unreserved(chr)) + approve_chara(buf, chr); + else if (chr == '%') { + if (!approve_pct_encoded(buf, &segment, &i)) + return false; + } else if (is_subdelim(chr) || is_symbol(chr, ":@")) + approve_chara(buf, chr); + else + return invalid("Illegal character in path section."); + } + + if (buf->dst[buf->d - 2] == '/' && + buf->dst[buf->d - 1] == '.') + rewind_buffer(buf, limit); + if (buf->dst[buf->d - 3] == '/' && + buf->dst[buf->d - 2] == '.' && + buf->dst[buf->d - 1] == '.') { + rewind_buffer(buf, limit); + rewind_buffer(buf, limit); + } + } + + if (limit == buf->d) + approve_chara(buf, '/'); + return true; +} + +static bool +normalize_post_path(struct uri_buffer *buf, struct sized_string *post, + char prefix) +{ + array_index c; + char chr; + + if (post->len == 0) + return true; + + approve_chara(buf, prefix); + for (c = 1; c < post->len; c++) { + if (handle_pchar(buf, post, &c)) + continue; + chr = post->str[c]; + if (chr == ':' || chr == '@') + approve_chara(buf, chr); + else + return invalid("Illegal character in query section."); + } + + return true; } /* * See RFC 3986. Basically, "rsync://%61.b/./c/.././%64/." 
-> "rsync://a.b/d" - * - * This is not actually a perfect normalization, because it's deferred to curl, - * whose implementation is somewhat flawed (at least until version 8.12.1): - * https://github.com/curl/curl/issues/16829 - * - * That said, since Fort 2 no longer maps URI paths to literal local paths, all - * normalization does for us is prevent some theoretical redundant downloading, - * so it's fine. */ static char * -url_normalize(char const *url) +url_normalize(char const *url, int flags) { - CURLU *curlu; - char *curl_normal; - char *normal; - CURLUcode err; + struct sized_string scheme; + struct sized_string authority; + struct sized_string userinfo; + struct sized_string host; + struct sized_string port; + struct sized_string path; + struct sized_string query; + struct sized_string fragment; + + char const *cursor; + char const *at; + char const *colon; + + struct schema_metadata const *meta; + struct uri_buffer buf; + + pr_clutter("-----------------------"); + pr_clutter("input: %s", url); + + cursor = strchr(url, ':'); + if (!cursor) { + printf("Schema not terminated\n"); + return NULL; + } - if (validate_url_characters(url)) + scheme.str = url; + scheme.len = cursor - url; + pr_clutter(" scheme: %.*s (len:%zu)", (int)scheme.len, scheme.str, scheme.len); + meta = get_metadata(&scheme); + if (!(flags & URI_ALLOW_UNKNOWN_SCHEME) && !meta) { + printf("Unknown scheme\n"); return NULL; + } - curlu = curl_url(); - if (!curlu) - enomem_panic(); + if (cursor[1] != '/' || cursor[2] != '/') { + printf("Missing \"://\"\n"); + return NULL; + } - /* The flag is needed by rsync */ - err = curl_url_set(curlu, CURLUPART_URL, url, CURLU_NON_SUPPORT_SCHEME); - if (err) - goto einval; - err = curl_url_get(curlu, CURLUPART_URL, &curl_normal, 0); - if (err) - goto einval; + authority.str = cursor + 3; + if (!collect_authority(authority.str, &at, &colon, &cursor)) + return NULL; + authority.len = cursor - authority.str; + pr_clutter(" authority: %.*s (len:%zu)", 
(int)authority.len, authority.str, authority.len); + if (authority.len == 0) + return NULL; - curl_url_cleanup(curlu); + if (at != NULL) { + if (meta && !meta->allow_userinfo) { + printf("Protocol disallows userinfo.\n"); + return NULL; + } + + userinfo.str = authority.str; + userinfo.len = at - authority.str; + host.str = at + 1; + } else { + userinfo.str = NULL; + userinfo.len = 0; + host.str = authority.str; + } - if (strncmp(curl_normal, "rsync://", RPKI_SCHEMA_LEN) && - strncmp(curl_normal, "https://", RPKI_SCHEMA_LEN)) { - curl_free(curl_normal); + if (colon != NULL) { + host.len = colon - host.str; + port.str = colon + 1; + port.len = cursor - port.str; + } else { + host.len = cursor - host.str; + port.str = NULL; + port.len = 0; + } + + if (host.len == 0 && meta && !meta->allow_empty_host) { + printf("Protocol disallows empty host.\n"); return NULL; } - normal = pstrdup(curl_normal); - curl_free(curl_normal); - return normal; + pr_clutter(" userinfo: %.*s (len:%zu)", (int)userinfo.len, userinfo.str, userinfo.len); + pr_clutter(" host: %.*s (len:%zu)", (int)host.len, host.str, host.len); + pr_clutter(" port: %.*s (len:%zu)", (int)port.len, port.str, port.len); + + if (cursor[0] == '\0') { + memset(&path, 0, sizeof(path)); + memset(&query, 0, sizeof(query)); + memset(&fragment, 0, sizeof(fragment)); + + } else { /* '/' */ + path.str = cursor; + collect_path(path.str, &cursor); + path.len = cursor - path.str; + + switch (cursor[0]) { + case '\0': + memset(&query, 0, sizeof(query)); + memset(&fragment, 0, sizeof(fragment)); + break; + + case '?': + if (meta && !meta->allow_query) { + printf("Protocol disallows query.\n"); + return NULL; + } + + query.str = cursor; + collect_query(query.str + 1, &cursor); + query.len = cursor - query.str; + switch (cursor[0]) { + case '\0': + memset(&fragment, 0, sizeof(fragment)); + break; + case '#': + goto frag; + default: + pr_crit("Unhandled character after query: %c", + cursor[0]); + } + break; + + case '#': + 
memset(&query, 0, sizeof(query)); + +frag: if (meta && !meta->allow_fragment) { + printf("Protocol disallows fragment.\n"); + return NULL; + } + fragment.str = cursor; + collect_fragment(fragment.str + 1, &cursor); + fragment.len = cursor - fragment.str; + break; + + default: + pr_crit("Unhandled character after path: %c", + cursor[0]); + } + } -einval: pr_val_err("Error parsing URL: %s", curl_url_strerror(err)); - curl_url_cleanup(curlu); + pr_clutter(" path: %.*s (len:%zu)", (int)path.len, path.str, path.len); + pr_clutter(" query: %.*s (len:%zu)", (int)query.len, query.str, query.len); + pr_clutter(" fragment: %.*s (len:%zu)", (int)fragment.len, fragment.str, fragment.len); + + buf.capacity = scheme.len + authority.len + path.len + + query.len + fragment.len + 5; /* "://" + maybe '/' + '\0' */ + buf.dst = pmalloc(buf.capacity); + buf.d = 0; + + pr_clutter("-> Normalizing scheme."); + if (!normalize_scheme(&buf, &scheme)) + goto cancel; + pr_clutter("-> Normalizing userinfo."); + if (!normalize_userinfo(&buf, &userinfo)) + goto cancel; + pr_clutter("-> Normalizing host."); + if (!normalize_host(&buf, &host)) + goto cancel; + pr_clutter("-> Normalizing port."); + if (!normalize_port(&buf, &port, meta)) + goto cancel; + pr_clutter("-> Normalizing path."); + if (!normalize_path(&buf, &path)) + goto cancel; + pr_clutter("-> Normalizing query."); + if (!normalize_post_path(&buf, &query, '?')) + goto cancel; + pr_clutter("-> Normalizing fragment."); + if (!normalize_post_path(&buf, &fragment, '#')) + goto cancel; + + approve_chara(&buf, '\0'); + return buf.dst; + +cancel: free(buf.dst); return NULL; } int uri_init(struct uri *url, char const *str) { - str = url_normalize(str); - if (!str) + char *normal; + + normal = url_normalize(str, 0); + if (!normal) return EINVAL; - __URI_INIT(url, str); + __URI_INIT(url, normal); + + if (!uri_is_https(url) && !uri_is_rsync(url)) { + free(normal); + return ENOTSUP; + } + return 0; } @@ -133,6 +705,18 @@ uri_cleanup(struct uri 
*url) url->_str = NULL; } +bool +uri_is_rsync(struct uri const *url) +{ + return str_starts_with(url->_str, "rsync:"); +} + +bool +uri_is_https(struct uri const *url) +{ + return str_starts_with(url->_str, "https:"); +} + bool uri_equals(struct uri const *u1, struct uri const *u2) { @@ -162,6 +746,23 @@ uri_parent(struct uri const *child, struct uri *parent) return 0; } +void +uri_child(struct uri const *parent, char const *name, size_t len, + struct uri *child) +{ + size_t slash; + + slash = parent->_str[parent->_len - 1] != '/'; + + child->_len = parent->_len + slash + len; + child->_str = pmalloc(child->_len + 1); + strncpy(child->_str, parent->_str, parent->_len); + if (slash) + child->_str[parent->_len] = '/'; + strncpy(child->_str + parent->_len + slash, name, len); + child->_str[child->_len] = '\0'; +} + bool uri_same_origin(struct uri const *uri1, struct uri const *uri2) { @@ -192,21 +793,4 @@ uri_same_origin(struct uri const *uri1, struct uri const *uri2) return false; } -void -uri_child(struct uri const *parent, char const *name, size_t len, - struct uri *child) -{ - size_t slash; - - slash = parent->_str[parent->_len - 1] != '/'; - - child->_len = parent->_len + slash + len; - child->_str = pmalloc(child->_len + 1); - strncpy(child->_str, parent->_str, parent->_len); - if (slash) - child->_str[parent->_len] = '/'; - strncpy(child->_str + parent->_len + slash, name, len); - child->_str[child->_len] = '\0'; -} - DEFINE_ARRAY_LIST_FUNCTIONS(uris, struct uri, ) diff --git a/src/types/uri.h b/src/types/uri.h index bc2b11ea..601e0490 100644 --- a/src/types/uri.h +++ b/src/types/uri.h @@ -20,7 +20,7 @@ void uri_copy(struct uri *, struct uri const *); void uri_cleanup(struct uri *); #define uri_str(u) ((char const *)((u)->_str)) -#define uri_len(u) ((u)->_len) +#define uri_len(u) ((size_t const)((u)->_len)) bool uri_is_rsync(struct uri const *); bool uri_is_https(struct uri const *); diff --git a/test/Makefile.am b/test/Makefile.am index dc056357..1f878875 
100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -45,7 +45,6 @@ base64_test_LDADD = ${CHECK_LIBS} check_PROGRAMS += cache.test cache_test_SOURCES = cache_test.c cache_test_LDADD = ${CHECK_LIBS} -cache_test_LDADD += ${CURL_LIBS} cache_test_LDADD += ${XML2_LIBS} cache_test_LDADD += ${JANSSON_LIBS} @@ -80,21 +79,18 @@ pdu_stream_test_LDADD = ${CHECK_LIBS} check_PROGRAMS += rrdp.test rrdp_test_SOURCES = rrdp_test.c rrdp_test_LDADD = ${CHECK_LIBS} -rrdp_test_LDADD += ${CURL_LIBS} rrdp_test_LDADD += ${XML2_LIBS} rrdp_test_LDADD += ${JANSSON_LIBS} check_PROGRAMS += rrdp_update.test rrdp_update_test_SOURCES = rrdp_update_test.c rrdp_update_test_LDADD = ${CHECK_LIBS} -rrdp_update_test_LDADD += ${CURL_LIBS} rrdp_update_test_LDADD += ${XML2_LIBS} rrdp_update_test_LDADD += ${JANSSON_LIBS} check_PROGRAMS += rsync.test rsync_test_SOURCES = rsync_test.c rsync_test_LDADD = ${CHECK_LIBS} -rsync_test_LDADD += ${CURL_LIBS} check_PROGRAMS += serial.test serial_test_SOURCES = types/serial_test.c @@ -107,7 +103,6 @@ serial_test_LDADD = ${CHECK_LIBS} check_PROGRAMS += tal.test tal_test_SOURCES = object/tal_test.c tal_test_LDADD = ${CHECK_LIBS} -tal_test_LDADD += ${CURL_LIBS} check_PROGRAMS += task.test task_test_SOURCES = task_test.c @@ -120,7 +115,6 @@ thread_pool_test_LDADD = ${CHECK_LIBS} check_PROGRAMS += uri.test uri_test_SOURCES = types/uri_test.c uri_test_LDADD = ${CHECK_LIBS} -uri_test_LDADD += ${CURL_LIBS} check_PROGRAMS += uthash.test uthash_test_SOURCES = types/uthash_test.c diff --git a/test/types/uri_test.c b/test/types/uri_test.c index bad9e15c..e19b0482 100644 --- a/test/types/uri_test.c +++ b/test/types/uri_test.c @@ -7,61 +7,55 @@ #include "types/path.c" #include "types/uri.c" -#define TEST_NORMALIZE(dirty, clean) \ - normal = url_normalize(dirty); \ - ck_assert_str_eq(clean, normal); \ - free(normal) +#define TEST_REWIND(expected, test, limit) \ + parser.dst = test; \ + parser.d = strlen(test); \ + rewind_buffer(&parser, limit); \ + 
ck_assert_uint_eq(strlen(expected), parser.d) -START_TEST(test_normalize) +START_TEST(test_rewind) { - char *normal; + struct uri_buffer parser; - TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/"); - TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/"); - TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d"); - TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////"); - TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e"); - TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/"); - TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/"); - TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e/"); - TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/"); - TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", "rsync://a.b.c/x/y/z"); - TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/"); - TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o"); + TEST_REWIND("/a/b", "/a/b/c", 0); + TEST_REWIND("/a/b", "/a/b/cdefg", 0); - ck_assert_ptr_eq(NULL, url_normalize("")); - ck_assert_ptr_eq(NULL, url_normalize("h")); - ck_assert_ptr_eq(NULL, url_normalize("http")); - ck_assert_ptr_eq(NULL, url_normalize("https")); - ck_assert_ptr_eq(NULL, url_normalize("https:")); - ck_assert_ptr_eq(NULL, url_normalize("https:/")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://")); - ck_assert_ptr_eq(NULL, url_normalize("rsync://a.β.c/")); + TEST_REWIND("/a/b", "/a/b/c", 2); + TEST_REWIND("/a/b", "/a/b/cdefg", 2); - TEST_NORMALIZE("rsync://.", "rsync://./"); - TEST_NORMALIZE("https://./.", "https://./"); - TEST_NORMALIZE("https://./d", "https://./d"); - TEST_NORMALIZE("rsync://..", "rsync://../"); - TEST_NORMALIZE("rsync://../..", "rsync://../"); - TEST_NORMALIZE("rsync://../d", "rsync://../d"); - TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/"); - TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/"); - TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x"); - TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z"); - 
TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/"); - ck_assert_ptr_eq(NULL, url_normalize("http://a.b.c/d")); - ck_assert_ptr_eq(NULL, url_normalize("abcde://a.b.c/d")); - TEST_NORMALIZE("HTTPS://a.b.c/d", "https://a.b.c/d"); - TEST_NORMALIZE("rSyNc://a.b.c/d", "rsync://a.b.c/d"); + TEST_REWIND("/a/b", "/a/b/c", 4); + TEST_REWIND("/a/b", "/a/b/cdefg", 4); - TEST_NORMALIZE("https://a.b.c:80/d/e", "https://a.b.c:80/d/e"); - /* TEST_NORMALIZE("https://a.b.c:443/d/e", "https://a.b.c/d/e"); */ - TEST_NORMALIZE("https://a.b.c:/d/e", "https://a.b.c/d/e"); + TEST_REWIND("/a/b", "/a/b", 4); +} +END_TEST + +#define TEST_NORMALIZE(dirty, clean) \ + normal = url_normalize(dirty, 0); \ + ck_assert_str_eq(clean, normal); \ + free(normal) + +#define TEST_NORMALIZE_AUS(dirty, clean) \ + normal = url_normalize(dirty, URI_ALLOW_UNKNOWN_SCHEME); \ + ck_assert_str_eq(clean, normal); \ + free(normal) + +#define TEST_NORMALIZE_FAIL(dirty) \ + ck_assert_ptr_eq(NULL, url_normalize(dirty, 0)); + +START_TEST(awkward_dot_dotting) +{ + char *normal; /* - * XXX make sure libcurl 8.12.2 implements lowercasing domains, - * defaulting 443, and maybe reject UTF-8. + * Additional, tricky: RFC 3986 never states that `//` should be + * normalized as `/`, which is seemingly implying that `/d//..` equals + * `/d/`, not `/` (as Unix would lead one to believe). 
*/ + printf("Extra\n"); + + TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////"); + TEST_NORMALIZE_AUS("http://a.b.c/d//..", "http://a.b.c/d"); } END_TEST @@ -96,17 +90,385 @@ START_TEST(test_same_origin) } END_TEST +START_TEST(test_unknown_protocols) +{ + char *normal; + + printf("Unknown protocols\n"); + + TEST_NORMALIZE_FAIL("httpz://a.b.c/d"); + TEST_NORMALIZE_FAIL("abcde://a.b.c/d"); + TEST_NORMALIZE_AUS("httpz://a.b.c/d", "httpz://a.b.c/d"); + TEST_NORMALIZE_AUS("abcde://a.b.c/d", "abcde://a.b.c/d"); +} +END_TEST + +START_TEST(reserved_unchanged) +{ + char *normal; + + printf("3986#2.2: \"characters in the reserved set are protected from normalization\"\n"); + printf("3986#6.2.2.1: Percent-encoding should always be uppercase\n"); + +#define RESERVED_PCT "%3A%2F%3F%23%5B%5D%40%21%24%26%27%28%29%2A%2B%2C%3B%3D" +#define SUBDELIMS "!$&'()*+,;=" + + TEST_NORMALIZE("https://" RESERVED_PCT ":1234/" RESERVED_PCT "?" RESERVED_PCT "#" RESERVED_PCT, + "https://" RESERVED_PCT ":1234/" RESERVED_PCT "?" RESERVED_PCT "#" RESERVED_PCT); + TEST_NORMALIZE("https://" SUBDELIMS ":1234/" SUBDELIMS "?" SUBDELIMS "#" SUBDELIMS, + "https://" SUBDELIMS ":1234/" SUBDELIMS "?" 
SUBDELIMS "#" SUBDELIMS); + + TEST_NORMALIZE("rsync://" RESERVED_PCT "@" RESERVED_PCT ":1234/" RESERVED_PCT, + "rsync://" RESERVED_PCT "@" RESERVED_PCT ":1234/" RESERVED_PCT); + TEST_NORMALIZE("rsync://" SUBDELIMS "@" SUBDELIMS ":1234/" SUBDELIMS, + "rsync://" SUBDELIMS "@" SUBDELIMS ":1234/" SUBDELIMS); +} +END_TEST + +START_TEST(lowercase_scheme_and_host) +{ + char *normal; + + printf("3986#6.2.2.1, 9110#4.2.3c: Lowercase scheme and host\n"); + + TEST_NORMALIZE_AUS("http://a.b.c/d", "http://a.b.c/d"); + TEST_NORMALIZE_AUS("abcde://a.b.c/d", "abcde://a.b.c/d"); + TEST_NORMALIZE_AUS("HTTPS://a.b.c/d", "https://a.b.c/d"); + TEST_NORMALIZE_AUS("rSyNc://a.b.c/d", "rsync://a.b.c/d"); + TEST_NORMALIZE_AUS("HTTPS://A.B.C/d", "https://a.b.c/d"); + TEST_NORMALIZE_AUS("HTTP://WWW.EXAMPLE.COM/aBc/dEf", "http://www.example.com/aBc/dEf"); + TEST_NORMALIZE_AUS("HTTP://WWW.EXAMPLE.COM/aBc/dEf?gHi#jKl", "http://www.example.com/aBc/dEf?gHi#jKl"); +} +END_TEST + +START_TEST(decode_unreserved_characters) +{ + char *normal; + + printf("3986#6.2.2.2, 9110#4.2.3d: Decode unreserved characters\n"); + + TEST_NORMALIZE_AUS("http://%61%7A.%41%5A.%30%39/%61%7A%41%5A%30%39", "http://az.AZ.09/azAZ09"); + TEST_NORMALIZE_AUS("http://%2D%2E%5F%7E/%2D%2E%5F%7E", "http://-._~/-._~"); +} +END_TEST + +START_TEST(path_segment_normalization) +{ + char *normal; + + printf("3986#6.2.2.3: Path segment normalization\n"); + + TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d"); + TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////"); + TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e"); + TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e"); + TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e"); + TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e"); + TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", 
"rsync://a.b.c/x/y/z"); + TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/"); + TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o"); + TEST_NORMALIZE("rsync://.", "rsync://./"); + TEST_NORMALIZE("https://./.", "https://./"); + TEST_NORMALIZE("https://./d", "https://./d"); + TEST_NORMALIZE("rsync://..", "rsync://../"); + TEST_NORMALIZE("rsync://../..", "rsync://../"); + TEST_NORMALIZE("rsync://../d", "rsync://../d"); + TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/"); + TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x"); + TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z"); + TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/"); +} +END_TEST + +START_TEST(all_the_above_combined) +{ + char *normal; + + printf("3986#6.2.2: All the above, combined\n"); + + TEST_NORMALIZE_AUS("example://a/b/c/%5Bfoo%5D", "example://a/b/c/%5Bfoo%5D"); + TEST_NORMALIZE_AUS("eXAMPLE://a/./b/../b/%63/%5bfoo%5d", "example://a/b/c/%5Bfoo%5D"); +} +END_TEST + +START_TEST(scheme_based_normalization) +{ + char *normal; + + printf("3986#6.2.3: Scheme-based normalization\n"); + + TEST_NORMALIZE_AUS("http://example.com/?", "http://example.com/?"); + TEST_NORMALIZE_AUS("http://example.com/#", "http://example.com/#"); +} +END_TEST + +START_TEST(https_grammar) +{ + printf("9110#4.2.2: https-URI = \"https\" \"://\" authority path-abempty [ \"?\" query ]\n"); + printf(" authority = host [ \":\" port ]\n"); + printf(" path-abempty = *( \"/\" segment )\n"); + printf(" segment = *pchar\n"); + + TEST_NORMALIZE_FAIL(""); + TEST_NORMALIZE_FAIL("h"); + TEST_NORMALIZE_FAIL("http"); + TEST_NORMALIZE_FAIL("https"); + TEST_NORMALIZE_FAIL("https:"); + TEST_NORMALIZE_FAIL("https:/"); + TEST_NORMALIZE_FAIL("https://"); + TEST_NORMALIZE_FAIL("https://a.β.c/"); + TEST_NORMALIZE_FAIL("https://a.b.c/β"); + + /* I think everything else is already tested below. 
*/ +} +END_TEST + +START_TEST(https_default_port) +{ + char *normal; + + printf("9110#4.2.2: Default https port is 443\n"); + printf("(Also 9110#4.2.3: Omit default port)\n"); + + TEST_NORMALIZE("https://a.b.c/", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:/", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:443/", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:873/", "https://a.b.c:873/"); + + TEST_NORMALIZE("https://a.b.c", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:443", "https://a.b.c/"); + TEST_NORMALIZE("https://a.b.c:873", "https://a.b.c:873/"); +} +END_TEST + +START_TEST(disallow_http_empty_host) +{ + char *normal; + + printf("9110#4.2.2: Disallow https empty host\n"); + printf("(Also 9110#4.2.3: Empty path normalizes to '/')\n"); + + TEST_NORMALIZE("https://a", "https://a/"); + TEST_NORMALIZE_FAIL("https://"); + TEST_NORMALIZE("https://a/f/g", "https://a/f/g"); + TEST_NORMALIZE_FAIL("https:///f/g"); + TEST_NORMALIZE("https://a:1234/f/g", "https://a:1234/f/g"); + TEST_NORMALIZE_FAIL("https://:1234/f/g"); + TEST_NORMALIZE("https://a?123", "https://a/?123"); + TEST_NORMALIZE_FAIL("https://?123"); + TEST_NORMALIZE("https://a#123", "https://a/#123"); + TEST_NORMALIZE_FAIL("https://#123"); +} +END_TEST + +START_TEST(provide_default_path) +{ + char *normal; + + printf("9110#4.2.3: Empty path normalizes to '/'\n"); + + TEST_NORMALIZE("https://example.com/", "https://example.com/"); + TEST_NORMALIZE("https://example.com", "https://example.com/"); +} +END_TEST + +START_TEST(scheme_and_host_lowercase) +{ + char *normal; + + printf("9110#4.2.3: Scheme and host normalize to lowercase\n"); + + TEST_NORMALIZE("https://c.d.e:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU"); + TEST_NORMALIZE("HTTPS://C.D.E:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU"); + TEST_NORMALIZE("hTtPs://C.d.E:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU"); +} 
+END_TEST
+
+/*
+ * not_reserved_not_pct_encoded: feeds every octet from 0x00 to 0x7F in
+ * percent-encoded form, once with uppercase hex digits (ACEU) and once with
+ * lowercase ones (ACEL), in the host, path, query and fragment positions.
+ * Both inputs share one expectation (ACD): letters, digits, '-', '.', '_'
+ * and '~' appear decoded, everything else stays percent-encoded with
+ * uppercase hex digits.
+ * NOTE(review): TEST_NORMALIZE is defined outside this chunk; presumably it
+ * normalizes its first argument, compares against the second, and is the
+ * user of the otherwise unreferenced `normal` local -- confirm.
+ */
+START_TEST(not_reserved_not_pct_encoded)
+{
+	char *normal;
+
+	/*
+	 * Note: It seems "not in the reserved set" means "unreserved
+	 * characters," not "any character, except those in the reserved set."
+	 *
+	 * Otherwise there are too many exceptions: Non-printables, whitespace,
+	 * quotes, percent, less/greater than, backslash, caret, backtick,
+	 * curlies and pipe.
+	 *
+	 * That being said, we're going to cover all characters in the same
+	 * test.
+	 */
+	printf("9110#4.2.3: \"Characters other than those in the 'reserved' set\" normalize to not percent-encoded\n");
+
+/* "All Characters, Encoded Uppercase" */
+#define ACEU "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" \
+		"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F" \
+		"%20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F" \
+		"%30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F" \
+		"%40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F" \
+		"%50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F" \
+		"%60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F" \
+		"%70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F"
+/* "All Characters, Encoded Lowercase" */
+#define ACEL "%00%01%02%03%04%05%06%07%08%09%0a%0b%0c%0d%0e%0f" \
+		"%10%11%12%13%14%15%16%17%18%19%1a%1b%1c%1d%1e%1f" \
+		"%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f" \
+		"%30%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f" \
+		"%40%41%42%43%44%45%46%47%48%49%4a%4b%4c%4d%4e%4f" \
+		"%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e%5f" \
+		"%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f" \
+		"%70%71%72%73%74%75%76%77%78%79%7a%7b%7c%7d%7e%7f"
+/* "All Characters, Decoded" */
+#define ACD "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" \
+		"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F" \
+		"%20%21%22%23%24%25%26%27%28%29%2A%2B%2C-.%2F" \
+		"0123456789%3A%3B%3C%3D%3E%3F" \
+		"%40ABCDEFGHIJKLMNO" \
+		"PQRSTUVWXYZ%5B%5C%5D%5E_" \
+		"%60abcdefghijklmno" \
+		"pqrstuvwxyz%7B%7C%7D~%7F"
+
+	TEST_NORMALIZE("https://" ACEU "/" ACEU "?" ACEU "#" ACEU,
+	    "https://" ACD "/" ACD "?" ACD "#" ACD);
+	TEST_NORMALIZE("https://" ACEL "/" ACEL "?" ACEL "#" ACEL,
+	    "https://" ACD "/" ACD "?" ACD "#" ACD);
+
+/* The tables are only meaningful to this test; do not leak them file-wide. */
+#undef ACEU
+#undef ACEL
+#undef ACD
+}
+END_TEST
+
+/*
+ * aggregated_423: three spellings of one URI (explicit default port,
+ * uppercase host with an encoded '~', and empty port with a lowercase-hex
+ * encoded '~') are all expected to normalize to the same string.
+ */
+START_TEST(aggregated_423)
+{
+	char *normal;
+
+	printf("9110#4.2.3: Aggregated example\n");
+
+	TEST_NORMALIZE("https://example.com:443/~smith/home.html", "https://example.com/~smith/home.html");
+	TEST_NORMALIZE("https://EXAMPLE.com/%7Esmith/home.html", "https://example.com/~smith/home.html");
+	TEST_NORMALIZE("https://EXAMPLE.com:/%7esmith/home.html", "https://example.com/~smith/home.html");
+}
+END_TEST
+
+/*
+ * disallow_https_userinfo: the URI without userinfo is expected to come back
+ * unchanged; the "user@" and "user:password@" variants are handed to
+ * TEST_NORMALIZE_FAIL.
+ * NOTE(review): TEST_NORMALIZE_FAIL is defined outside this chunk;
+ * presumably it asserts that normalization rejects the input -- confirm.
+ */
+START_TEST(disallow_https_userinfo)
+{
+	char *normal;
+
+	printf("9110#4.2.4: Disallow https userinfo\n");
+
+	TEST_NORMALIZE("https://c.d.e/f/g", "https://c.d.e/f/g");
+	TEST_NORMALIZE_FAIL("https://a@c.d.e/f/g");
+	TEST_NORMALIZE_FAIL("https://a:b@c.d.e/f/g");
+}
+END_TEST
+
+/*
+ * rsync_grammar: walks the rsync URI forms one hier-part production at a
+ * time. Expected to fail: truncated prefixes up to "rsync://", a non-ASCII
+ * character in host or path, a query or fragment, and every form without
+ * "//" (path-absolute, path-rootless, path-empty). Expected to succeed: the
+ * authority form, with or without userinfo and port; an empty path gains a
+ * trailing "/". A few inputs are listed under more than one heading.
+ */
+START_TEST(rsync_grammar)
+{
+	char *normal;
+
+	printf("5781#2: rsync://[user@]host[:PORT]/Source\n");
+	printf("rsyncuri = \"rsync:\" hier-part\n");
+
+	TEST_NORMALIZE_FAIL("");
+	TEST_NORMALIZE_FAIL("r");
+	TEST_NORMALIZE_FAIL("rsyn");
+	TEST_NORMALIZE_FAIL("rsync");
+	TEST_NORMALIZE_FAIL("rsync:");
+	TEST_NORMALIZE_FAIL("rsync:/");
+	TEST_NORMALIZE_FAIL("rsync://");
+	TEST_NORMALIZE_FAIL("rsync://a.β.c/");
+	TEST_NORMALIZE_FAIL("rsync://a.b.c/β");
+
+	TEST_NORMALIZE("rsync://a.b.c/m", "rsync://a.b.c/m");
+	TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r");
+	TEST_NORMALIZE_FAIL("rsync://a.b.c/m/r?query");
+	TEST_NORMALIZE_FAIL("rsync://a.b.c/m/r#fragment");
+
+	/* hier-part = "//" authority path-abempty */
+	TEST_NORMALIZE("rsync://user@a.b.c:1234/m/r", "rsync://user@a.b.c:1234/m/r");
+	TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r");
+	TEST_NORMALIZE("rsync://user@a.b.c:1234", "rsync://user@a.b.c:1234/");
+	TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/");
+
+	/* hier-part = path-absolute */
+	/* ie. "rsync:/" [ pchar+ ( "/" pchar* )* ] */
+	/* (These refer to local files. The RFC allows them, but Fort shouldn't.) */
+	TEST_NORMALIZE_FAIL("rsync:/");
+	TEST_NORMALIZE_FAIL("rsync:/a");
+	TEST_NORMALIZE_FAIL("rsync:/a/");
+	TEST_NORMALIZE_FAIL("rsync:/a/a");
+	TEST_NORMALIZE_FAIL("rsync:/a/a/a");
+	TEST_NORMALIZE_FAIL("rsync:/abc/def/xyz");
+	TEST_NORMALIZE_FAIL("rsync:/abc////def//xyz");
+
+	/* hier-part = path-rootless */
+	/* ie. "rsync:" pchar+ ( "/" pchar* )* */
+	/* (Also local paths. Disallowed by Fort needs.) */
+	TEST_NORMALIZE_FAIL("rsync:a");
+	TEST_NORMALIZE_FAIL("rsync:aa");
+	TEST_NORMALIZE_FAIL("rsync:aa/");
+	TEST_NORMALIZE_FAIL("rsync:aa/a");
+	TEST_NORMALIZE_FAIL("rsync:aa/aa");
+	TEST_NORMALIZE_FAIL("rsync:aa///aa");
+
+	/* hier-part = path-empty */
+	TEST_NORMALIZE_FAIL("rsync:");
+}
+END_TEST
+
+/*
+ * rsync_default_port: for rsync URIs, a missing port, an empty port and an
+ * explicit ":873" are all expected to normalize to the port-less form, while
+ * ":443" is expected to be kept.
+ */
+START_TEST(rsync_default_port)
+{
+	char *normal;
+
+	printf("5781#2: Default rsync port is 873\n");
+	TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/");
+	TEST_NORMALIZE("rsync://a.b.c:/", "rsync://a.b.c/");
+	TEST_NORMALIZE("rsync://a.b.c:873/", "rsync://a.b.c/");
+	TEST_NORMALIZE("rsync://a.b.c:443/", "rsync://a.b.c:443/");
+}
+END_TEST
+
+/*
+ * Builds the "url" suite: one test case per specification (miscellaneous,
+ * RFC 3986, RFC 9110, RFC 5781), each registered with the tests above it in
+ * this file. Returns the suite that was created.
+ */
 static Suite *create_suite(void)
 {
 	Suite *suite;
-	TCase *misc;
+	TCase *misc, *generic, *https, *rsync;
 
-	misc = tcase_create("misc");
-	tcase_add_test(misc, test_normalize);
+	misc = tcase_create("Miscellaneous");
+	tcase_add_test(misc, test_rewind);
+	tcase_add_test(misc, test_unknown_protocols);
+	tcase_add_test(misc, awkward_dot_dotting);
 	tcase_add_test(misc, test_same_origin);
 
+	generic = tcase_create("RFC 3986 (generic URI)");
+	tcase_add_test(generic, reserved_unchanged);
+	tcase_add_test(generic, lowercase_scheme_and_host);
+	tcase_add_test(generic, decode_unreserved_characters);
+	tcase_add_test(generic, path_segment_normalization);
+	tcase_add_test(generic, all_the_above_combined);
+	tcase_add_test(generic, scheme_based_normalization);
+
+	https = tcase_create("RFC 9110 (https)");
+	tcase_add_test(https, https_grammar);
+	tcase_add_test(https, https_default_port);
+	tcase_add_test(https, disallow_http_empty_host);
+	tcase_add_test(https, provide_default_path);
+	tcase_add_test(https, scheme_and_host_lowercase);
+	tcase_add_test(https, not_reserved_not_pct_encoded);
+	tcase_add_test(https, aggregated_423);
+	tcase_add_test(https, disallow_https_userinfo);
+
+	rsync = tcase_create("RFC 5781 (rsync)");
+	tcase_add_test(rsync, rsync_grammar);
+	tcase_add_test(rsync, rsync_default_port);
+
 	suite = suite_create("url");
 	suite_add_tcase(suite, misc);
+	suite_add_tcase(suite, generic);
+	suite_add_tcase(suite, https);
+	suite_add_tcase(suite, rsync);
 
 	return suite;
 }