git.ipfire.org Git - thirdparty/FORT-validator.git/commitdiff
Implement URI normalization
author: Alberto Leiva Popper <ydahhrk@gmail.com>
Tue, 6 May 2025 23:43:47 +0000 (17:43 -0600)
committer: Alberto Leiva Popper <ydahhrk@gmail.com>
Tue, 6 May 2025 23:43:47 +0000 (17:43 -0600)
Stop deferring this to curl; it's not bound to come out soon.

src/rrdp.c
src/types/uri.c
src/types/uri.h
test/Makefile.am
test/types/uri_test.c

index 9972769d6b6a93ce4eee4bd0188dbe546f4a3e2c..3d99c3a01eb6c20bd4a4cdc1a2749814b079e3f2 100644 (file)
@@ -554,7 +554,7 @@ handle_publish(xmlTextReaderPtr reader, struct parser_args *args)
 
        /* Parsing done */
 
-       pr_clutter("Publish %s", logv_filename(tag.meta.uri));
+       pr_clutter("Publish %s", logv_filename(uri_str(&tag.meta.uri)));
 
        file = state_find_file(args->state, &tag.meta.uri);
 
@@ -629,7 +629,7 @@ handle_withdraw(xmlTextReaderPtr reader, struct parser_args *args)
                goto end;
        }
 
-       pr_clutter("Withdraw %s", logv_filename(tag.meta.uri));
+       pr_clutter("Withdraw %s", logv_filename(uri_str(&tag.meta.uri)));
 
        file = state_find_file(args->state, &tag.meta.uri);
 
index 98bb6422eff53e3d03fad6b914b3c99b4f09c539..886f91a065b7e16326868648e57d05be93e4a4fa 100644 (file)
@@ -1,6 +1,5 @@
 #include "types/uri.h"
 
-#include <curl/curl.h>
 #include <errno.h>
 
 #include "alloc.h"
 #include "log.h"
 #include "types/path.h"
 
-bool
-uri_is_rsync(struct uri const *url)
+#define URI_ALLOW_UNKNOWN_SCHEME (1 << 1)
+
+struct sized_string { /* View of a substring: start pointer plus length */
+       char const *str; /* First character; url_normalize() sets it to NULL for absent parts */
+       size_t len; /* Character count; the span need not be NUL-terminated */
+};
+
+struct uri_buffer { /* Output buffer the normalized URI is written into */
+       char *dst; /* Storage; approve_chara() may replace it through prealloc() */
+       array_index d; /* Index in dst where the next character will be stored */
+       size_t capacity; /* Size handed to pmalloc()/prealloc() for dst */
+};
+
+struct schema_metadata { /* Per-scheme rules consulted by url_normalize() and normalize_port() */
+       unsigned int default_port; /* normalize_port() omits the port when it equals this */
+       bool allow_userinfo; /* false: a '@' in the authority makes url_normalize() fail */
+       bool allow_empty_host; /* false: a zero-length host makes url_normalize() fail */
+       bool allow_query; /* false: a '?' after the path makes url_normalize() fail */
+       bool allow_fragment; /* false: a '#' after the path/query makes url_normalize() fail */
+};
+
+struct schema_metadata const HTTPS = { /* Returned by get_metadata() for the "https" scheme */
+       .default_port = 443,
+       .allow_userinfo = false,
+       .allow_empty_host = false,
+       .allow_query = true,
+       .allow_fragment = true,
+};
+
+struct schema_metadata const RSYNC = { /* Returned by get_metadata() for the "rsync" scheme */
+       .default_port = 873,
+       .allow_userinfo = true,
+       .allow_empty_host = true,
+       .allow_query = false,
+       .allow_fragment = false,
+};
+
+static bool
+is_proto(struct sized_string *scheme, char const *proto)
 {
-       return str_starts_with(url->_str, "rsync://");
+       return strncasecmp(scheme->str, proto, scheme->len) == 0;
 }
 
-bool
-uri_is_https(struct uri const *url)
+static struct schema_metadata const *
+get_metadata(struct sized_string *scheme)
 {
-       return str_starts_with(url->_str, "https://");
+       if (scheme->len != 5)
+               return NULL;
+
+       if (is_proto(scheme, "https"))
+               return &HTTPS;
+       if (is_proto(scheme, "rsync"))
+               return &RSYNC;
+
+       return NULL;
 }
 
-/*
- * @character is an integer because we sometimes receive signed chars, and other
- * times we get unsigned chars.
- * Casting a negative char into a unsigned char is undefined behavior.
- */
-static int
-validate_url_character(int character)
+/* True if @chr lies in the range 'a'..'z'. */
+static bool
+is_lowercase(char chr)
+{
+       if (chr < 'a')
+               return false;
+       return chr <= 'z';
+}
+
+/* True if @chr lies in the range 'A'..'Z'. */
+static bool
+is_uppercase(char chr)
+{
+       if (chr < 'A')
+               return false;
+       return chr <= 'Z';
+}
+
+/* True if @chr lies in the range '0'..'9'. */
+static bool
+is_digit(char chr)
+{
+       if (chr < '0')
+               return false;
+       return chr <= '9';
+}
+
+/* True if @chr is one of the characters of the NUL-terminated set @symbols. */
+static bool
+is_symbol(char chr, char const *symbols)
+{
+       char const *candidate = symbols;
+
+       while (*candidate != '\0') {
+               if (*candidate == chr)
+                       return true;
+               candidate++;
+       }
+
+       /* Also reached when chr is '\0': the terminator is never compared. */
+       return false;
+}
+
+/*
+ * Shifts @uppercase by the distance between 'A' and 'a'.
+ * No range check happens here; the callers in this file test is_uppercase()
+ * before calling.
+ */
+static char
+to_lowercase(char uppercase)
+{
+       return uppercase + ('a' - 'A');
+}
+
+static char
+to_uppercase(char chr)
 {
-       return (0x20 <= character && character <= 0x7E)
-           ? 0
-           : pr_val_err("URL has non-printable character code '%d'.", character);
+       return is_lowercase(chr) ? (chr + ('A' - 'a')) : chr;
 }
 
-/* Not done by libcurl, apparently */
-static int
-validate_url_characters(char const *str)
+static bool
+invalid(char const *errmsg)
 {
-       char const *s;
-       int error;
+       printf("%s\n", errmsg);
+       return false;
+}
 
-       for (s = str; s[0] != '\0'; s++) {
-               error = validate_url_character(s[0]);
-               if (error)
-                       return error;
+static void
+approve_chara(struct uri_buffer *buf, char chr)
+{
+       if (buf->d >= buf->capacity) {
+               /* It seems this is dead code. */
+               buf->capacity += 16;
+               buf->dst = prealloc(buf->dst, buf->capacity);
        }
 
-       return 0;
+       buf->dst[buf->d++] = chr;
+}
+
+/*
+ * Scans the authority component that starts at @auth.
+ *
+ * @end receives the first '/', '?', '#' or NUL.
+ * @at receives the first '@' seen before @end, or NULL.
+ * @colon receives the last ':' seen before @end, or NULL. Colons located
+ * before the first '@' are discarded, since they belong to the userinfo.
+ *
+ * Always returns true.
+ */
+static bool
+collect_authority(char const *auth, char const **at, char const **colon,
+    char const **end)
+{
+       char const *cur;
+       char chr;
+
+       *at = NULL;
+       *colon = NULL;
+
+       for (cur = auth; ; cur++) {
+               chr = *cur;
+
+               if (chr == '/' || chr == '?' || chr == '#' || chr == '\0') {
+                       *end = cur;
+                       return true;
+               }
+
+               if (chr == ':') {
+                       *colon = cur;
+               } else if (chr == '@' && *at == NULL) {
+                       /* Whatever colon came earlier was a password separator */
+                       *colon = NULL;
+                       *at = cur;
+               }
+       }
+}
+
+/* Points @end to the first '?', '#' or NUL found at or after @path. */
+static void
+collect_path(char const *path, char const **end)
+{
+       char const *cur = path;
+
+       while (*cur != '\0' && *cur != '?' && *cur != '#')
+               cur++;
+       *end = cur;
+}
+
+/* Points @end to the first '#' or NUL found at or after @query. */
+static void
+collect_query(char const *query, char const **end)
+{
+       char const *cur = query;
+
+       while (*cur != '\0' && *cur != '#')
+               cur++;
+       *end = cur;
+}
+
+/* Points @end to the NUL terminator found at or after @fragment. */
+static void
+collect_fragment(char const *fragment, char const **end)
+{
+       char const *cur = fragment;
+
+       while (*cur != '\0')
+               cur++;
+       *end = cur;
+}
+
+/*
+ * Checks that @scheme is a letter followed by letters, digits, '+', '.' or
+ * '-', and appends it to @buf in lowercase, followed by "://".
+ *
+ * Returns false (after printing the reason) at the first offending character;
+ * characters approved before that point stay in @buf.
+ */
+static bool
+normalize_scheme(struct uri_buffer *buf, struct sized_string *scheme)
+{
+       array_index pos;
+       char cur;
+
+       if (scheme->len == 0)
+               return invalid("Scheme seems empty.");
+
+       cur = scheme->str[0];
+       if (is_uppercase(cur))
+               cur = to_lowercase(cur);
+       else if (!is_lowercase(cur))
+               return invalid("First character is not a letter.");
+       approve_chara(buf, cur);
+
+       for (pos = 1; pos < scheme->len; pos++) {
+               cur = scheme->str[pos];
+               if (is_uppercase(cur))
+                       cur = to_lowercase(cur);
+               else if (!is_lowercase(cur) && !is_digit(cur)
+                   && !is_symbol(cur, "+.-"))
+                       return invalid("Schema character is not letter, digit, plus, period or hyphen.");
+               approve_chara(buf, cur);
+       }
+
+       approve_chara(buf, ':');
+       approve_chara(buf, '/');
+       approve_chara(buf, '/');
+       return true;
+}
+
+/* True if @chr is a letter, a digit, or one of '-', '.', '_', '~'. */
+static bool
+is_unreserved(char chr)
+{
+       if (is_lowercase(chr) || is_uppercase(chr))
+               return true;
+       if (is_digit(chr))
+               return true;
+       return is_symbol(chr, "-._~");
+}
+
+/* True if @chr is one of the characters !$&'()*+,;= */
+static bool
+is_subdelim(char chr)
+{
+       static char const set[] = "!$&'()*+,;=";
+
+       return is_symbol(chr, set);
+}
+
+/*
+ * Converts the hexadecimal digit @chr ('0'-'9', 'A'-'F' or 'a'-'f') into its
+ * numeric value.
+ *
+ * On success, stores the value (0-15) in @hex and returns true.
+ * Otherwise prints the problem to stdout, leaves @hex untouched and returns
+ * false.
+ */
+static bool
+char2hex(char chr, unsigned int *hex)
+{
+       if (is_digit(chr)) {
+               *hex = chr - '0';
+               return true;
+       }
+       /*
+        * Only the first six letters are hexadecimal digits. Accepting the
+        * whole alphabet would let malformed escapes such as "%ZZ" through,
+        * and would yield values above 15.
+        */
+       if ('A' <= chr && chr <= 'F') {
+               *hex = chr - 'A' + 10;
+               return true;
+       }
+       if ('a' <= chr && chr <= 'f') {
+               *hex = chr - 'a' + 10;
+               return true;
+       }
+
+       printf("Invalid hex digit: %c\n", chr);
+       return invalid("Invalid hexadecimal digit.");
+}
+
+/*
+ * Normalizes the %-escape that starts at @sstr->str[*offset] (which is
+ * expected to be the '%') and appends the result to @buf.
+ *
+ * If the escape decodes to an unreserved character, the character itself is
+ * written. Otherwise the escape is kept, with its two hex digits uppercased.
+ *
+ * On success, *offset is advanced by 2 (so the caller's own increment steps
+ * past the escape) and true is returned. On a truncated or non-hexadecimal
+ * escape, *offset and @buf are left alone and false is returned.
+ */
+static bool
+approve_pct_encoded(struct uri_buffer *buf, struct sized_string *sstr,
+    array_index *offset)
+{
+       char const *escape;
+       unsigned int high;
+       unsigned int low;
+       unsigned int decoded;
+
+       if (sstr->len - *offset < 3)
+               return invalid("Unterminated %-encoding.");
+
+       escape = sstr->str + *offset;
+       if (!char2hex(escape[1], &high) || !char2hex(escape[2], &low))
+               return false;
+
+       decoded = (high << 4) | low;
+       if (is_unreserved(decoded)) {
+               approve_chara(buf, decoded);
+       } else {
+               approve_chara(buf, '%');
+               approve_chara(buf, to_uppercase(escape[1]));
+               approve_chara(buf, to_uppercase(escape[2]));
+       }
+
+       *offset += 2;
+       return true;
+}
+
+/*
+ * Appends the character at @sstr->str[*offset] to @buf if it is allowed as an
+ * RFC 3986 "pchar": unreserved, %-escape, sub-delim, ':' or '@'.
+ *
+ * %-escapes are delegated to approve_pct_encoded(), which advances *offset
+ * past the two hex digits on success.
+ *
+ * Returns true if something was appended. Returns false, leaving @buf and
+ * *offset untouched, if the character is not a pchar or the %-escape is
+ * malformed. The latter used to be reported as success, which silently
+ * dropped the broken escape from the output.
+ */
+static bool
+handle_pchar(struct uri_buffer *buf, struct sized_string *sstr,
+    array_index *offset)
+{
+       char chr = sstr->str[*offset];
+
+       if (chr == '%')
+               return approve_pct_encoded(buf, sstr, offset);
+
+       if (is_unreserved(chr) || is_subdelim(chr)
+           || chr == ':' || chr == '@') {
+               approve_chara(buf, chr);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Appends the normalized @userinfo, followed by '@', to @buf.
+ *
+ * Nothing (not even the '@') is written when @userinfo is empty.
+ * Accepted characters: unreserved, %-escapes, sub-delims and ':'.
+ * Returns false on the first character outside of that set, or on a malformed
+ * %-escape.
+ */
+static bool
+normalize_userinfo(struct uri_buffer *buf, struct sized_string *userinfo)
+{
+       array_index pos;
+       char cur;
+
+       if (userinfo->len == 0)
+               return true;
+
+       for (pos = 0; pos < userinfo->len; pos++) {
+               cur = userinfo->str[pos];
+
+               if (cur == '%') {
+                       if (!approve_pct_encoded(buf, userinfo, &pos))
+                               return false;
+                       continue;
+               }
+
+               if (!is_unreserved(cur) && !is_subdelim(cur) && cur != ':')
+                       return invalid("Illegal character in userinfo section.");
+               approve_chara(buf, cur);
+       }
+
+       approve_chara(buf, '@');
+       return true;
+}
+
+/*
+ * Appends the normalized @host to @buf, lowercasing literal uppercase letters.
+ *
+ * Accepted characters: unreserved, %-escapes and sub-delims.
+ * Returns false on the first character outside of that set, or on a malformed
+ * %-escape.
+ */
+static bool
+normalize_host(struct uri_buffer *buf, struct sized_string *host)
+{
+       array_index pos;
+       char cur;
+
+       for (pos = 0; pos < host->len; pos++) {
+               cur = host->str[pos];
+
+               if (cur == '%') {
+                       if (!approve_pct_encoded(buf, host, &pos))
+                               return false;
+                       continue;
+               }
+
+               if (is_uppercase(cur))
+                       cur = to_lowercase(cur);
+               else if (!is_unreserved(cur) && !is_subdelim(cur))
+                       return invalid("Illegal character in host section.");
+               approve_chara(buf, cur);
+       }
+
+       return true;
+}
+
+/*
+ * Validates @port (decimal digits, value up to 0xFFFF) and appends ":port" to
+ * @buf.
+ *
+ * Nothing is written when @port is empty, or when @schema is not NULL and the
+ * port equals the schema's default. The digits are copied as received.
+ * Returns false on a non-digit character or a value above 0xFFFF.
+ */
+static bool
+normalize_port(struct uri_buffer *buf, struct sized_string *port,
+    struct schema_metadata const *schema)
+{
+       array_index pos;
+       char digit;
+       unsigned int value;
+
+       if (port->len == 0)
+               return true;
+
+       value = 0;
+       for (pos = 0; pos < port->len; pos++) {
+               digit = port->str[pos];
+               if (!is_digit(digit))
+                       return invalid("Illegal non-digit character in port section.");
+
+               /* Checked on every digit, so value cannot overflow */
+               value = value * 10 + (digit - '0');
+               if (value > 0xFFFF)
+                       return invalid("Port value is too large.");
+       }
+
+       if (schema != NULL && schema->default_port == value)
+               return true;
+
+       approve_chara(buf, ':');
+       for (pos = 0; pos < port->len; pos++)
+               approve_chara(buf, port->str[pos]);
+       return true;
+}
+
+/*
+ * Returns a pointer to the first @chr among the first @n characters of @str,
+ * or @str + @n if there is none. NUL characters do not stop the search.
+ */
+static char const *
+strnchr(char const *str, size_t n, char chr)
+{
+       char const *limit = str + n;
+       char const *cur;
+
+       for (cur = str; cur != limit; cur++)
+               if (*cur == chr)
+                       return cur;
+       return limit;
+}
+
+/*
+ * Moves @segment to the next '/'-delimited segment of @path.
+ *
+ * The new segment starts one character after the end of the current one
+ * (skipping the separator) and runs until the next '/' or the end of @path.
+ * Returns false when that start position is beyond the end of @path.
+ */
+static bool
+next_segment(struct sized_string *path, struct sized_string *segment)
+{
+       char const *path_end = path->str + path->len;
+       char const *start = segment->str + segment->len + 1;
+
+       segment->str = start;
+       if (start > path_end)
+               return false;
+
+       segment->len = strnchr(start, path_end - start, '/') - start;
+       return true;
+}
+
+/*
+ * Moves the write index of @buf backwards until it lands on a '/' (which is
+ * thereby discarded as well) or reaches @limit, whichever happens first.
+ */
+static void
+rewind_buffer(struct uri_buffer *buf, size_t limit)
+{
+       while (buf->d > limit) {
+               buf->d--;
+               if (buf->dst[buf->d] == '/')
+                       break;
+       }
+}
+
+/*
+ * Appends the normalized @path to @buf, resolving "." and ".." segments.
+ *
+ * An empty path becomes "/". Each segment is written after a '/'; accepted
+ * characters are unreserved, %-escapes, sub-delims, ':' and '@'.
+ * A segment that ends up as "." is removed together with its slash; ".." also
+ * removes the segment before it. Removal never goes below the position @buf
+ * had on entry. If nothing remains, a single '/' is written.
+ *
+ * Returns false on an illegal character or a malformed %-escape.
+ */
+static bool
+normalize_path(struct uri_buffer *buf, struct sized_string *path)
+{
+       struct sized_string seg;
+       array_index pos;
+       char cur;
+       char *tail;
+       size_t floor;
+
+       if (path->len == 0) {
+               approve_chara(buf, '/');
+               return true;
+       }
+
+       floor = buf->d;
+       seg.str = path->str;
+       seg.len = 0;
+
+       while (next_segment(path, &seg)) {
+               approve_chara(buf, '/');
+
+               for (pos = 0; pos < seg.len; pos++) {
+                       cur = seg.str[pos];
+
+                       if (cur == '%') {
+                               if (!approve_pct_encoded(buf, &seg, &pos))
+                                       return false;
+                               continue;
+                       }
+
+                       if (!is_unreserved(cur) && !is_subdelim(cur)
+                           && !is_symbol(cur, ":@"))
+                               return invalid("Illegal character in path section.");
+                       approve_chara(buf, cur);
+               }
+
+               /* The tail is looked up late; approve_chara() can move dst */
+               tail = buf->dst + buf->d;
+               if (tail[-2] == '/' && tail[-1] == '.')
+                       rewind_buffer(buf, floor);
+
+               /* Recomputed because the rewind above may have moved d */
+               tail = buf->dst + buf->d;
+               if (tail[-3] == '/' && tail[-2] == '.' && tail[-1] == '.') {
+                       rewind_buffer(buf, floor);
+                       rewind_buffer(buf, floor);
+               }
+       }
+
+       if (buf->d == floor)
+               approve_chara(buf, '/');
+       return true;
+}
+
+/*
+ * Appends a normalized query or fragment to @buf.
+ *
+ * @post includes its leading delimiter, which is skipped and replaced by
+ * @prefix ('?' or '#'). Nothing is written when @post is empty.
+ *
+ * RFC 3986 defines both components as *( pchar / "/" / "?" ), so besides
+ * whatever handle_pchar() accepts, '/' and '?' are copied verbatim.
+ *
+ * Returns false (after printing the reason) on any other character, or when
+ * handle_pchar() rejects a %-escape.
+ */
+static bool
+normalize_post_path(struct uri_buffer *buf, struct sized_string *post,
+    char prefix)
+{
+       array_index c;
+       char chr;
+
+       if (post->len == 0)
+               return true;
+
+       approve_chara(buf, prefix);
+       for (c = 1; c < post->len; c++) {
+               if (handle_pchar(buf, post, &c))
+                       continue;
+               chr = post->str[c];
+               /* ':' and '@' are pchars already; these two are the extras */
+               if (chr == '/' || chr == '?')
+                       approve_chara(buf, chr);
+               else
+                       return invalid("Illegal character in query or fragment section.");
+       }
+
+       return true;
 }
 
 /*
  * See RFC 3986. Basically, "rsync://%61.b/./c/.././%64/." -> "rsync://a.b/d"
- *
- * This is not actually a perfect normalization, because it's deferred to curl,
- * whose implementation is somewhat flawed (at least until version 8.12.1):
- * https://github.com/curl/curl/issues/16829
- *
- * That said, since Fort 2 no longer maps URI paths to literal local paths, all
- * normalization does for us is prevent some theoretical redundant downloading,
- * so it's fine.
  */
 static char *
-url_normalize(char const *url)
+url_normalize(char const *url, int flags)
 {
-       CURLU *curlu;
-       char *curl_normal;
-       char *normal;
-       CURLUcode err;
+       struct sized_string scheme;
+       struct sized_string authority;
+       struct sized_string userinfo;
+       struct sized_string host;
+       struct sized_string port;
+       struct sized_string path;
+       struct sized_string query;
+       struct sized_string fragment;
+
+       char const *cursor;
+       char const *at;
+       char const *colon;
+
+       struct schema_metadata const *meta;
+       struct uri_buffer buf;
+
+       pr_clutter("-----------------------");
+       pr_clutter("input: %s", url);
+
+       cursor = strchr(url, ':');
+       if (!cursor) {
+               printf("Schema not terminated\n");
+               return NULL;
+       }
 
-       if (validate_url_characters(url))
+       scheme.str = url;
+       scheme.len = cursor - url;
+       pr_clutter("  scheme: %.*s (len:%zu)", (int)scheme.len, scheme.str, scheme.len);
+       meta = get_metadata(&scheme);
+       if (!(flags & URI_ALLOW_UNKNOWN_SCHEME) && !meta) {
+               printf("Unknown scheme\n");
                return NULL;
+       }
 
-       curlu = curl_url();
-       if (!curlu)
-               enomem_panic();
+       if (cursor[1] != '/' || cursor[2] != '/') {
+               printf("Missing \"://\"\n");
+               return NULL;
+       }
 
-       /* The flag is needed by rsync */
-       err = curl_url_set(curlu, CURLUPART_URL, url, CURLU_NON_SUPPORT_SCHEME);
-       if (err)
-               goto einval;
-       err = curl_url_get(curlu, CURLUPART_URL, &curl_normal, 0);
-       if (err)
-               goto einval;
+       authority.str = cursor + 3;
+       if (!collect_authority(authority.str, &at, &colon, &cursor))
+               return NULL;
+       authority.len = cursor - authority.str;
+       pr_clutter("  authority: %.*s (len:%zu)", (int)authority.len, authority.str, authority.len);
+       if (authority.len == 0)
+               return NULL;
 
-       curl_url_cleanup(curlu);
+       if (at != NULL) {
+               if (meta && !meta->allow_userinfo) {
+                       printf("Protocol disallows userinfo.\n");
+                       return NULL;
+               }
+
+               userinfo.str = authority.str;
+               userinfo.len = at - authority.str;
+               host.str = at + 1;
+       } else {
+               userinfo.str = NULL;
+               userinfo.len = 0;
+               host.str = authority.str;
+       }
 
-       if (strncmp(curl_normal, "rsync://", RPKI_SCHEMA_LEN) &&
-           strncmp(curl_normal, "https://", RPKI_SCHEMA_LEN)) {
-               curl_free(curl_normal);
+       if (colon != NULL) {
+               host.len = colon - host.str;
+               port.str = colon + 1;
+               port.len = cursor - port.str;
+       } else {
+               host.len = cursor - host.str;
+               port.str = NULL;
+               port.len = 0;
+       }
+
+       if (host.len == 0 && meta && !meta->allow_empty_host) {
+               printf("Protocol disallows empty host.\n");
                return NULL;
        }
 
-       normal = pstrdup(curl_normal);
-       curl_free(curl_normal);
-       return normal;
+       pr_clutter("  userinfo: %.*s (len:%zu)", (int)userinfo.len, userinfo.str, userinfo.len);
+       pr_clutter("  host: %.*s (len:%zu)", (int)host.len, host.str, host.len);
+       pr_clutter("  port: %.*s (len:%zu)", (int)port.len, port.str, port.len);
+
+       if (cursor[0] == '\0') {
+               memset(&path, 0, sizeof(path));
+               memset(&query, 0, sizeof(query));
+               memset(&fragment, 0, sizeof(fragment));
+
+       } else { /* '/' */
+               path.str = cursor;
+               collect_path(path.str, &cursor);
+               path.len = cursor - path.str;
+
+               switch (cursor[0]) {
+               case '\0':
+                       memset(&query, 0, sizeof(query));
+                       memset(&fragment, 0, sizeof(fragment));
+                       break;
+
+               case '?':
+                       if (meta && !meta->allow_query) {
+                               printf("Protocol disallows query.\n");
+                               return NULL;
+                       }
+
+                       query.str = cursor;
+                       collect_query(query.str + 1, &cursor);
+                       query.len = cursor - query.str;
+                       switch (cursor[0]) {
+                       case '\0':
+                               memset(&fragment, 0, sizeof(fragment));
+                               break;
+                       case '#':
+                               goto frag;
+                       default:
+                               pr_crit("Unhandled character after query: %c",
+                                   cursor[0]);
+                       }
+                       break;
+
+               case '#':
+                       memset(&query, 0, sizeof(query));
+
+frag:                  if (meta && !meta->allow_fragment) {
+                               printf("Protocol disallows fragment.\n");
+                               return NULL;
+                       }
+                       fragment.str = cursor;
+                       collect_fragment(fragment.str + 1, &cursor);
+                       fragment.len = cursor - fragment.str;
+                       break;
+
+               default:
+                       pr_crit("Unhandled character after path: %c",
+                           cursor[0]);
+               }
+       }
 
-einval:        pr_val_err("Error parsing URL: %s", curl_url_strerror(err));
-       curl_url_cleanup(curlu);
+       pr_clutter("  path: %.*s (len:%zu)", (int)path.len, path.str, path.len);
+       pr_clutter("  query: %.*s (len:%zu)", (int)query.len, query.str, query.len);
+       pr_clutter("  fragment: %.*s (len:%zu)", (int)fragment.len, fragment.str, fragment.len);
+
+       buf.capacity = scheme.len + authority.len + path.len
+           + query.len + fragment.len + 5; /* "://" + maybe '/' + '\0' */
+       buf.dst = pmalloc(buf.capacity);
+       buf.d = 0;
+
+       pr_clutter("-> Normalizing scheme.");
+       if (!normalize_scheme(&buf, &scheme))
+               goto cancel;
+       pr_clutter("-> Normalizing userinfo.");
+       if (!normalize_userinfo(&buf, &userinfo))
+               goto cancel;
+       pr_clutter("-> Normalizing host.");
+       if (!normalize_host(&buf, &host))
+               goto cancel;
+       pr_clutter("-> Normalizing port.");
+       if (!normalize_port(&buf, &port, meta))
+               goto cancel;
+       pr_clutter("-> Normalizing path.");
+       if (!normalize_path(&buf, &path))
+               goto cancel;
+       pr_clutter("-> Normalizing query.");
+       if (!normalize_post_path(&buf, &query, '?'))
+               goto cancel;
+       pr_clutter("-> Normalizing fragment.");
+       if (!normalize_post_path(&buf, &fragment, '#'))
+               goto cancel;
+
+       approve_chara(&buf, '\0');
+       return buf.dst;
+
+cancel:        free(buf.dst);
        return NULL;
 }
 
 int
 uri_init(struct uri *url, char const *str)
 {
-       str = url_normalize(str);
-       if (!str)
+       char *normal;
+
+       normal = url_normalize(str, 0);
+       if (!normal)
                return EINVAL;
 
-       __URI_INIT(url, str);
+       __URI_INIT(url, normal);
+
+       if (!uri_is_https(url) && !uri_is_rsync(url)) {
+               free(normal);
+               return ENOTSUP;
+       }
+
        return 0;
 }
 
@@ -133,6 +705,18 @@ uri_cleanup(struct uri *url)
        url->_str = NULL;
 }
 
+/* Tells whether the string of @url begins with "rsync:". */
+bool
+uri_is_rsync(struct uri const *url)
+{
+       if (str_starts_with(url->_str, "rsync:"))
+               return true;
+       return false;
+}
+
+/* Tells whether the string of @url begins with "https:". */
+bool
+uri_is_https(struct uri const *url)
+{
+       if (str_starts_with(url->_str, "https:"))
+               return true;
+       return false;
+}
+
 bool
 uri_equals(struct uri const *u1, struct uri const *u2)
 {
@@ -162,6 +746,23 @@ uri_parent(struct uri const *child, struct uri *parent)
        return 0;
 }
 
+/*
+ * Builds @child as "<parent>/<name>", using the first @len characters of
+ * @name. The separator is only inserted if @parent does not already end in
+ * '/'.
+ *
+ * The string of @child is allocated with pmalloc(). @parent is read at index
+ * _len - 1, so it is expected to be non-empty.
+ */
+void
+uri_child(struct uri const *parent, char const *name, size_t len,
+    struct uri *child)
+{
+       size_t sep;
+       char *cursor;
+
+       sep = (parent->_str[parent->_len - 1] == '/') ? 0 : 1;
+
+       child->_len = parent->_len + sep + len;
+       child->_str = pmalloc(child->_len + 1);
+
+       cursor = child->_str;
+       strncpy(cursor, parent->_str, parent->_len);
+       cursor += parent->_len;
+       if (sep)
+               *cursor++ = '/';
+       strncpy(cursor, name, len);
+
+       child->_str[child->_len] = '\0';
+}
+
 bool
 uri_same_origin(struct uri const *uri1, struct uri const *uri2)
 {
@@ -192,21 +793,4 @@ uri_same_origin(struct uri const *uri1, struct uri const *uri2)
        return false;
 }
 
-void
-uri_child(struct uri const *parent, char const *name, size_t len,
-    struct uri *child)
-{
-       size_t slash;
-
-       slash = parent->_str[parent->_len - 1] != '/';
-
-       child->_len = parent->_len + slash + len;
-       child->_str = pmalloc(child->_len + 1);
-       strncpy(child->_str, parent->_str, parent->_len);
-       if (slash)
-               child->_str[parent->_len] = '/';
-       strncpy(child->_str + parent->_len + slash, name, len);
-       child->_str[child->_len] = '\0';
-}
-
 DEFINE_ARRAY_LIST_FUNCTIONS(uris, struct uri, )
index bc2b11ead89e04493da060801544b5e6577301e2..601e0490b24ef2f906fe69ae0464093119a64010 100644 (file)
@@ -20,7 +20,7 @@ void uri_copy(struct uri *, struct uri const *);
 void uri_cleanup(struct uri *);
 
 #define uri_str(u) ((char const *)((u)->_str))
-#define uri_len(u) ((u)->_len)
+#define uri_len(u) ((size_t const)((u)->_len))
 
 bool uri_is_rsync(struct uri const *);
 bool uri_is_https(struct uri const *);
index dc056357223e21af9e97779f9ca7f6c90391cc28..1f878875f85f4b3196cec98fe999b65cbea1f042 100644 (file)
@@ -45,7 +45,6 @@ base64_test_LDADD =           ${CHECK_LIBS}
 check_PROGRAMS +=              cache.test
 cache_test_SOURCES =           cache_test.c
 cache_test_LDADD =             ${CHECK_LIBS}
-cache_test_LDADD +=            ${CURL_LIBS}
 cache_test_LDADD +=            ${XML2_LIBS}
 cache_test_LDADD +=            ${JANSSON_LIBS}
 
@@ -80,21 +79,18 @@ pdu_stream_test_LDADD =             ${CHECK_LIBS}
 check_PROGRAMS +=              rrdp.test
 rrdp_test_SOURCES =            rrdp_test.c
 rrdp_test_LDADD =              ${CHECK_LIBS}
-rrdp_test_LDADD +=             ${CURL_LIBS}
 rrdp_test_LDADD +=             ${XML2_LIBS}
 rrdp_test_LDADD +=             ${JANSSON_LIBS}
 
 check_PROGRAMS +=              rrdp_update.test
 rrdp_update_test_SOURCES =     rrdp_update_test.c
 rrdp_update_test_LDADD =       ${CHECK_LIBS}
-rrdp_update_test_LDADD +=      ${CURL_LIBS}
 rrdp_update_test_LDADD +=      ${XML2_LIBS}
 rrdp_update_test_LDADD +=      ${JANSSON_LIBS}
 
 check_PROGRAMS +=              rsync.test
 rsync_test_SOURCES =           rsync_test.c
 rsync_test_LDADD =             ${CHECK_LIBS}
-rsync_test_LDADD +=            ${CURL_LIBS}
 
 check_PROGRAMS +=              serial.test
 serial_test_SOURCES =          types/serial_test.c
@@ -107,7 +103,6 @@ serial_test_LDADD =         ${CHECK_LIBS}
 check_PROGRAMS +=              tal.test
 tal_test_SOURCES =             object/tal_test.c
 tal_test_LDADD =               ${CHECK_LIBS}
-tal_test_LDADD +=              ${CURL_LIBS}
 
 check_PROGRAMS +=              task.test
 task_test_SOURCES =            task_test.c
@@ -120,7 +115,6 @@ thread_pool_test_LDADD =    ${CHECK_LIBS}
 check_PROGRAMS +=              uri.test
 uri_test_SOURCES =             types/uri_test.c
 uri_test_LDADD =               ${CHECK_LIBS}
-uri_test_LDADD +=              ${CURL_LIBS}
 
 check_PROGRAMS +=              uthash.test
 uthash_test_SOURCES =          types/uthash_test.c
index bad9e15ce5aec9745b6e6a63aa1ea484dc136ff0..e19b0482bb3d4594c89e82851dd2d3840e214270 100644 (file)
@@ -7,61 +7,55 @@
 #include "types/path.c"
 #include "types/uri.c"
 
-#define TEST_NORMALIZE(dirty, clean)                                   \
-       normal = url_normalize(dirty);                                  \
-       ck_assert_str_eq(clean, normal);                                \
-       free(normal)
+#define TEST_REWIND(expected, test, limit)                             \
+       parser.dst = test;                                              \
+       parser.d = strlen(test);                                        \
+       rewind_buffer(&parser, limit);                                  \
+       ck_assert_uint_eq(strlen(expected), parser.d)
 
-START_TEST(test_normalize)
+START_TEST(test_rewind)
 {
-       char *normal;
+       struct uri_buffer parser;
 
-       TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/");
-       TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/");
-       TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d");
-       TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////");
-       TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/");
-       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/");
-       TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e/");
-       TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/");
-       TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", "rsync://a.b.c/x/y/z");
-       TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/");
-       TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o");
+       TEST_REWIND("/a/b", "/a/b/c", 0);
+       TEST_REWIND("/a/b", "/a/b/cdefg", 0);
 
-       ck_assert_ptr_eq(NULL, url_normalize(""));
-       ck_assert_ptr_eq(NULL, url_normalize("h"));
-       ck_assert_ptr_eq(NULL, url_normalize("http"));
-       ck_assert_ptr_eq(NULL, url_normalize("https"));
-       ck_assert_ptr_eq(NULL, url_normalize("https:"));
-       ck_assert_ptr_eq(NULL, url_normalize("https:/"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.β.c/"));
+       TEST_REWIND("/a/b", "/a/b/c", 2);
+       TEST_REWIND("/a/b", "/a/b/cdefg", 2);
 
-       TEST_NORMALIZE("rsync://.", "rsync://./");
-       TEST_NORMALIZE("https://./.", "https://./");
-       TEST_NORMALIZE("https://./d", "https://./d");
-       TEST_NORMALIZE("rsync://..", "rsync://../");
-       TEST_NORMALIZE("rsync://../..", "rsync://../");
-       TEST_NORMALIZE("rsync://../d", "rsync://../d");
-       TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/");
-       TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/");
-       TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x");
-       TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z");
-       TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/");
-       ck_assert_ptr_eq(NULL, url_normalize("http://a.b.c/d"));
-       ck_assert_ptr_eq(NULL, url_normalize("abcde://a.b.c/d"));
-       TEST_NORMALIZE("HTTPS://a.b.c/d", "https://a.b.c/d");
-       TEST_NORMALIZE("rSyNc://a.b.c/d", "rsync://a.b.c/d");
+       TEST_REWIND("/a/b", "/a/b/c", 4);
+       TEST_REWIND("/a/b", "/a/b/cdefg", 4);
 
-       TEST_NORMALIZE("https://a.b.c:80/d/e", "https://a.b.c:80/d/e");
-       /* TEST_NORMALIZE("https://a.b.c:443/d/e", "https://a.b.c/d/e"); */
-       TEST_NORMALIZE("https://a.b.c:/d/e", "https://a.b.c/d/e");
+       TEST_REWIND("/a/b", "/a/b", 4);
+}
+END_TEST
+
+#define TEST_NORMALIZE(dirty, clean)                                   \
+       normal = url_normalize(dirty, 0);                               \
+       ck_assert_str_eq(clean, normal);                                \
+       free(normal)
+
+#define TEST_NORMALIZE_AUS(dirty, clean)                               \
+       normal = url_normalize(dirty, URI_ALLOW_UNKNOWN_SCHEME);        \
+       ck_assert_str_eq(clean, normal);                                \
+       free(normal)
+
+#define TEST_NORMALIZE_FAIL(dirty)                                     \
+       ck_assert_ptr_eq(NULL, url_normalize(dirty, 0));
+
+START_TEST(awkward_dot_dotting)
+{
+       char *normal;
 
        /*
-        * XXX make sure libcurl 8.12.2 implements lowercasing domains,
-        * defaulting 443, and maybe reject UTF-8.
+        * Additional, tricky: RFC 3986 never states that `//` should be
+        * normalized as `/`, which is seemingly implying that `/d//..` equals
+        * `/d/`, not `/` (as Unix would lead one to believe).
         */
+       printf("Extra\n");
+
+       TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////");
+       TEST_NORMALIZE_AUS("http://a.b.c/d//..", "http://a.b.c/d");
 }
 END_TEST
 
@@ -96,17 +90,385 @@ START_TEST(test_same_origin)
 }
 END_TEST
 
+/*
+ * Unknown schemes are expected to be rejected when no flags are given, and to
+ * pass through unchanged when URI_ALLOW_UNKNOWN_SCHEME is set.
+ */
+START_TEST(test_unknown_protocols)
+{
+       char *normal;
+
+       printf("Unknown protocols\n");
+
+       TEST_NORMALIZE_FAIL("httpz://a.b.c/d");
+       TEST_NORMALIZE_FAIL("abcde://a.b.c/d");
+       TEST_NORMALIZE_AUS("httpz://a.b.c/d", "httpz://a.b.c/d");
+       TEST_NORMALIZE_AUS("abcde://a.b.c/d", "abcde://a.b.c/d");
+}
+END_TEST
+
+/*
+ * Percent-encoded reserved characters (RESERVED_PCT) and literal sub-delims
+ * (SUBDELIMS) are expected to come out of normalization unchanged.
+ * The https cases place them in host, path, query and fragment position; the
+ * rsync cases place them in userinfo, host and path position.
+ */
+START_TEST(reserved_unchanged)
+{
+       char *normal;
+
+       printf("3986#2.2: \"characters in the reserved set are protected from normalization\"\n");
+       printf("3986#6.2.2.1: Percent-encoding should always be uppercase\n");
+
+#define RESERVED_PCT "%3A%2F%3F%23%5B%5D%40%21%24%26%27%28%29%2A%2B%2C%3B%3D"
+#define SUBDELIMS "!$&'()*+,;="
+
+       TEST_NORMALIZE("https://" RESERVED_PCT ":1234/" RESERVED_PCT "?" RESERVED_PCT "#" RESERVED_PCT,
+                       "https://" RESERVED_PCT ":1234/" RESERVED_PCT "?" RESERVED_PCT "#" RESERVED_PCT);
+       TEST_NORMALIZE("https://" SUBDELIMS ":1234/" SUBDELIMS "?" SUBDELIMS "#" SUBDELIMS,
+                       "https://" SUBDELIMS ":1234/" SUBDELIMS "?" SUBDELIMS "#" SUBDELIMS);
+
+       TEST_NORMALIZE("rsync://" RESERVED_PCT "@" RESERVED_PCT ":1234/" RESERVED_PCT,
+                       "rsync://" RESERVED_PCT "@" RESERVED_PCT ":1234/" RESERVED_PCT);
+       TEST_NORMALIZE("rsync://" SUBDELIMS "@" SUBDELIMS ":1234/" SUBDELIMS,
+                       "rsync://" SUBDELIMS "@" SUBDELIMS ":1234/" SUBDELIMS);
+}
+END_TEST
+
+/*
+ * Scheme and host are expected to be lowercased, while path, query and
+ * fragment keep their original case.
+ * URI_ALLOW_UNKNOWN_SCHEME is used so `http` and `abcde` are accepted.
+ */
+START_TEST(lowercase_scheme_and_host)
+{
+       char *normal;
+
+       printf("3986#6.2.2.1, 9110#4.2.3c: Lowercase scheme and host\n");
+
+       TEST_NORMALIZE_AUS("http://a.b.c/d", "http://a.b.c/d");
+       TEST_NORMALIZE_AUS("abcde://a.b.c/d", "abcde://a.b.c/d");
+       TEST_NORMALIZE_AUS("HTTPS://a.b.c/d", "https://a.b.c/d");
+       TEST_NORMALIZE_AUS("rSyNc://a.b.c/d", "rsync://a.b.c/d");
+       TEST_NORMALIZE_AUS("HTTPS://A.B.C/d", "https://a.b.c/d");
+       TEST_NORMALIZE_AUS("HTTP://WWW.EXAMPLE.COM/aBc/dEf", "http://www.example.com/aBc/dEf");
+       TEST_NORMALIZE_AUS("HTTP://WWW.EXAMPLE.COM/aBc/dEf?gHi#jKl", "http://www.example.com/aBc/dEf?gHi#jKl");
+}
+END_TEST
+
+/*
+ * Percent-encoded unreserved characters (letters, digits, `-`, `.`, `_`, `~`)
+ * are expected to be decoded in both host and path.
+ * Note the expected host keeps `AZ` uppercase after decoding `%41%5A`.
+ */
+START_TEST(decode_unreserved_characters)
+{
+       char *normal;
+
+       printf("3986#6.2.2.2, 9110#4.2.3d: Decode unreserved characters\n");
+
+       TEST_NORMALIZE_AUS("http://%61%7A.%41%5A.%30%39/%61%7A%41%5A%30%39", "http://az.AZ.09/azAZ09");
+       TEST_NORMALIZE_AUS("http://%2D%2E%5F%7E/%2D%2E%5F%7E", "http://-._~/-._~");
+}
+END_TEST
+
+/*
+ * Dot-segment handling in the path: an empty path is expected to become `/`,
+ * `.` segments to disappear, `..` to drop the preceding segment without ever
+ * climbing past the root, and empty segments (`//`) to be preserved.
+ * A `.` or `..` in host position is expected to be left alone.
+ *
+ * NOTE(review): the `rsync://a.b.c/d/e/.` case is listed twice in a row;
+ * possibly one of them was meant to be a different input - confirm.
+ */
+START_TEST(path_segment_normalization)
+{
+       char *normal;
+
+       printf("3986#6.2.2.3: Path segment normalization\n");
+
+       TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d");
+       TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////");
+       TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e");
+       TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e");
+       TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", "rsync://a.b.c/x/y/z");
+       TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/");
+       TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o");
+       TEST_NORMALIZE("rsync://.", "rsync://./");
+       TEST_NORMALIZE("https://./.", "https://./");
+       TEST_NORMALIZE("https://./d", "https://./d");
+       TEST_NORMALIZE("rsync://..", "rsync://../");
+       TEST_NORMALIZE("rsync://../..", "rsync://../");
+       TEST_NORMALIZE("rsync://../d", "rsync://../d");
+       TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x");
+       TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/");
+}
+END_TEST
+
+/*
+ * Combined case: scheme lowercasing, dot-segment removal, decoding of the
+ * unreserved `%63`, and uppercasing of the hex digits in escapes that stay
+ * encoded (`%5b` -> `%5B`), all in a single URI.
+ */
+START_TEST(all_the_above_combined)
+{
+       char *normal;
+
+       printf("3986#6.2.2: All the above, combined\n");
+
+       TEST_NORMALIZE_AUS("example://a/b/c/%5Bfoo%5D", "example://a/b/c/%5Bfoo%5D");
+       TEST_NORMALIZE_AUS("eXAMPLE://a/./b/../b/%63/%5bfoo%5d", "example://a/b/c/%5Bfoo%5D");
+}
+END_TEST
+
+/*
+ * An empty query (`?`) and an empty fragment (`#`) are expected to be
+ * preserved rather than stripped.
+ */
+START_TEST(scheme_based_normalization)
+{
+       char *normal;
+
+       printf("3986#6.2.3: Scheme-based normalization\n");
+
+       TEST_NORMALIZE_AUS("http://example.com/?", "http://example.com/?");
+       TEST_NORMALIZE_AUS("http://example.com/#", "http://example.com/#");
+}
+END_TEST
+
+/*
+ * Malformed https URIs: truncated prefixes, an empty authority and non-ASCII
+ * bytes in host or path are all expected to be rejected.
+ * Only TEST_NORMALIZE_FAIL is used, so no `normal` variable is declared.
+ */
+START_TEST(https_grammar)
+{
+       printf("9110#4.2.2: https-URI     = \"https\" \"://\" authority path-abempty [ \"?\" query ]\n");
+       printf("            authority     = host [ \":\" port ]\n");
+       printf("            path-abempty  = *( \"/\" segment )\n");
+       printf("            segment       = *pchar\n");
+
+       TEST_NORMALIZE_FAIL("");
+       TEST_NORMALIZE_FAIL("h");
+       TEST_NORMALIZE_FAIL("http");
+       TEST_NORMALIZE_FAIL("https");
+       TEST_NORMALIZE_FAIL("https:");
+       TEST_NORMALIZE_FAIL("https:/");
+       TEST_NORMALIZE_FAIL("https://");
+       TEST_NORMALIZE_FAIL("https://a.β.c/");
+       TEST_NORMALIZE_FAIL("https://a.b.c/β");
+
+       /* I think everything else is already tested below. */
+}
+END_TEST
+
+/*
+ * For https, an empty port and an explicit `:443` are expected to be dropped,
+ * while any other port (873 here) is kept. Each case is checked both with and
+ * without a trailing `/` in the input.
+ */
+START_TEST(https_default_port)
+{
+       char *normal;
+
+       printf("9110#4.2.2: Default https port is 443\n");
+       printf("(Also 9110#4.2.3: Omit default port)\n");
+
+       TEST_NORMALIZE("https://a.b.c/", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:/", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:443/", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:873/", "https://a.b.c:873/");
+
+       TEST_NORMALIZE("https://a.b.c", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:443", "https://a.b.c/");
+       TEST_NORMALIZE("https://a.b.c:873", "https://a.b.c:873/");
+}
+END_TEST
+
+/*
+ * Each accepted URI with host `a` is paired with the same URI minus the host,
+ * which is expected to be rejected. Accepted URIs with an empty path are also
+ * expected to gain a `/`.
+ *
+ * NOTE(review): the test name says "http" but every case is https.
+ */
+START_TEST(disallow_http_empty_host)
+{
+       char *normal;
+
+       printf("9110#4.2.2: Disallow https empty host\n");
+       printf("(Also 9110#4.2.3: Empty path normalizes to '/')\n");
+
+       TEST_NORMALIZE("https://a", "https://a/");
+       TEST_NORMALIZE_FAIL("https://");
+       TEST_NORMALIZE("https://a/f/g", "https://a/f/g");
+       TEST_NORMALIZE_FAIL("https:///f/g");
+       TEST_NORMALIZE("https://a:1234/f/g", "https://a:1234/f/g");
+       TEST_NORMALIZE_FAIL("https://:1234/f/g");
+       TEST_NORMALIZE("https://a?123", "https://a/?123");
+       TEST_NORMALIZE_FAIL("https://?123");
+       TEST_NORMALIZE("https://a#123", "https://a/#123");
+       TEST_NORMALIZE_FAIL("https://#123");
+}
+END_TEST
+
+/* An https URI with an empty path is expected to normalize to the same URI with path `/`. */
+START_TEST(provide_default_path)
+{
+       char *normal;
+
+       printf("9110#4.2.3: Empty path normalizes to '/'\n");
+
+       TEST_NORMALIZE("https://example.com/", "https://example.com/");
+       TEST_NORMALIZE("https://example.com", "https://example.com/");
+}
+END_TEST
+
+/*
+ * https flavor of the lowercasing check: any mix of case in scheme and host
+ * is expected to come out lowercase, with port, path, query and fragment
+ * left untouched.
+ */
+START_TEST(scheme_and_host_lowercase)
+{
+       char *normal;
+
+       printf("9110#4.2.3: Scheme and host normalize to lowercase\n");
+
+       TEST_NORMALIZE("https://c.d.e:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU");
+       TEST_NORMALIZE("HTTPS://C.D.E:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU");
+       TEST_NORMALIZE("hTtPs://C.d.E:123/FgHi/jKlM?NoPQ#rStU", "https://c.d.e:123/FgHi/jKlM?NoPQ#rStU");
+}
+END_TEST
+
+START_TEST(not_reserved_not_pct_encoded)
+{
+       char *normal;
+
+       /*
+        * Note: It seems "not in the reserved set" apparently means "unreserved
+        * characters," not "any character, except those in the reserved set."
+        *
+        * Otherwise there are too many exceptions: Non-printables, whitespace,
+        * quotes, percent, less/greater than, backslash, caret, backtick,
+        * curlies and pipe.
+        *
+        * That being said, we're going to cover all characters in the same
+        * test.
+        */
+       printf("9110#4.2.3: \"Characters other than those in the 'reserved' set\" normalize to not percent-encoded\n");
+
+/* "All Characters, Encoded Uppercase" */
+#define ACEU "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"        \
+       "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"      \
+       "%20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F"      \
+       "%30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F"      \
+       "%40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F"      \
+       "%50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F"      \
+       "%60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F"      \
+       "%70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F"
+/* "All Characters, Encoded Lowercase" */
+#define ACEL "%00%01%02%03%04%05%06%07%08%09%0a%0b%0c%0d%0e%0f"        \
+       "%10%11%12%13%14%15%16%17%18%19%1a%1b%1c%1d%1e%1f"      \
+       "%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f"      \
+       "%30%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f"      \
+       "%40%41%42%43%44%45%46%47%48%49%4a%4b%4c%4d%4e%4f"      \
+       "%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e%5f"      \
+       "%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f"      \
+       "%70%71%72%73%74%75%76%77%78%79%7a%7b%7c%7d%7e%7f"
+/* "All Characters, Decoded" */
+#define ACD "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" \
+       "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"      \
+       "%20%21%22%23%24%25%26%27%28%29%2A%2B%2C-.%2F"          \
+       "0123456789%3A%3B%3C%3D%3E%3F"                          \
+       "%40ABCDEFGHIJKLMNO"                                    \
+       "PQRSTUVWXYZ%5B%5C%5D%5E_"                              \
+       "%60abcdefghijklmno"                                    \
+       "pqrstuvwxyz%7B%7C%7D~%7F"
+
+       /*
+        * Upper- and lowercase hex digits are both expected to normalize to
+        * ACD, in host, path, query and fragment position.
+        */
+       TEST_NORMALIZE("https://" ACEU "/" ACEU "?" ACEU "#" ACEU,
+                       "https://" ACD "/" ACD "?" ACD "#" ACD);
+       TEST_NORMALIZE("https://" ACEL "/" ACEL "?" ACEL "#" ACEL,
+                       "https://" ACD "/" ACD "?" ACD "#" ACD);
+}
+END_TEST
+
+/*
+ * Three spellings of the same https URI (explicit default port, uppercase
+ * host with `%7E`, empty port with `%7e`) are all expected to normalize to
+ * `https://example.com/~smith/home.html`.
+ */
+START_TEST(aggregated_423)
+{
+       char *normal;
+
+       printf("9110#4.2.3: Aggregated example\n");
+
+       TEST_NORMALIZE("https://example.com:443/~smith/home.html", "https://example.com/~smith/home.html");
+       TEST_NORMALIZE("https://EXAMPLE.com/%7Esmith/home.html", "https://example.com/~smith/home.html");
+       TEST_NORMALIZE("https://EXAMPLE.com:/%7esmith/home.html", "https://example.com/~smith/home.html");
+}
+END_TEST
+
+/*
+ * An https URI carrying userinfo (`a@` or `a:b@`) is expected to be rejected;
+ * the same URI without userinfo is accepted.
+ */
+START_TEST(disallow_https_userinfo)
+{
+       char *normal;
+
+       printf("9110#4.2.4: Disallow https userinfo\n");
+
+       TEST_NORMALIZE("https://c.d.e/f/g", "https://c.d.e/f/g");
+       TEST_NORMALIZE_FAIL("https://a@c.d.e/f/g");
+       TEST_NORMALIZE_FAIL("https://a:b@c.d.e/f/g");
+}
+END_TEST
+
+/*
+ * rsync URI grammar: truncated URIs, an empty authority, non-ASCII bytes,
+ * queries and fragments are expected to be rejected, as is every hier-part
+ * form without an authority (path-absolute, path-rootless, path-empty).
+ * Userinfo and port are accepted, and an empty path becomes `/`.
+ */
+START_TEST(rsync_grammar)
+{
+       char *normal;
+
+       printf("5781#2: rsync://[user@]host[:PORT]/Source\n");
+       printf("rsyncuri        = \"rsync:\" hier-part\n");
+
+       TEST_NORMALIZE_FAIL("");
+       TEST_NORMALIZE_FAIL("r");
+       TEST_NORMALIZE_FAIL("rsyn");
+       TEST_NORMALIZE_FAIL("rsync");
+       TEST_NORMALIZE_FAIL("rsync:");
+       TEST_NORMALIZE_FAIL("rsync:/");
+       TEST_NORMALIZE_FAIL("rsync://");
+       TEST_NORMALIZE_FAIL("rsync://a.β.c/");
+       TEST_NORMALIZE_FAIL("rsync://a.b.c/β");
+
+       TEST_NORMALIZE("rsync://a.b.c/m", "rsync://a.b.c/m");
+       TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r");
+       TEST_NORMALIZE_FAIL("rsync://a.b.c/m/r?query");
+       TEST_NORMALIZE_FAIL("rsync://a.b.c/m/r#fragment");
+
+       /* hier-part     = "//" authority path-abempty */
+       TEST_NORMALIZE("rsync://user@a.b.c:1234/m/r", "rsync://user@a.b.c:1234/m/r");
+       TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r");
+       TEST_NORMALIZE("rsync://user@a.b.c:1234", "rsync://user@a.b.c:1234/");
+       TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/");
+
+       /* hier-part     = path-absolute */
+       /* ie. "rsync:/" [ pchar+ ( "/" pchar* )* ] */
+       /* (These refer to local files. The RFC allows them, but Fort shouldn't.) */
+       TEST_NORMALIZE_FAIL("rsync:/");
+       TEST_NORMALIZE_FAIL("rsync:/a");
+       TEST_NORMALIZE_FAIL("rsync:/a/");
+       TEST_NORMALIZE_FAIL("rsync:/a/a");
+       TEST_NORMALIZE_FAIL("rsync:/a/a/a");
+       TEST_NORMALIZE_FAIL("rsync:/abc/def/xyz");
+       TEST_NORMALIZE_FAIL("rsync:/abc////def//xyz");
+
+       /* hier-part     = path-rootless */
+       /* ie. "rsync:" pchar+ ( "/" pchar* )* */
+       /* (Also local paths. Disallowed by Fort needs.) */
+       TEST_NORMALIZE_FAIL("rsync:a");
+       TEST_NORMALIZE_FAIL("rsync:aa");
+       TEST_NORMALIZE_FAIL("rsync:aa/");
+       TEST_NORMALIZE_FAIL("rsync:aa/a");
+       TEST_NORMALIZE_FAIL("rsync:aa/aa");
+       TEST_NORMALIZE_FAIL("rsync:aa///aa");
+
+       /* hier-part     = path-empty */
+       TEST_NORMALIZE_FAIL("rsync:");
+}
+END_TEST
+
+/*
+ * For rsync, an empty port and an explicit `:873` are expected to be dropped,
+ * while a different port (443 here) is kept.
+ */
+START_TEST(rsync_default_port)
+{
+       char *normal;
+
+       printf("5781#2: Default rsync port is 873\n");
+       TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c:/", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c:873/", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c:443/", "rsync://a.b.c:443/");
+}
+END_TEST
+
+/*
+ * Builds the "url" suite out of four test cases (Miscellaneous, RFC 3986,
+ * RFC 9110 and RFC 5781) and returns it.
+ */
 static Suite *create_suite(void)
 {
        Suite *suite;
-       TCase *misc;
+       TCase *misc, *generic, *https, *rsync;
 
-       misc = tcase_create("misc");
-       tcase_add_test(misc, test_normalize);
+       misc = tcase_create("Miscellaneous");
+       tcase_add_test(misc, test_rewind);
+       tcase_add_test(misc, test_unknown_protocols);
+       tcase_add_test(misc, awkward_dot_dotting);
        tcase_add_test(misc, test_same_origin);
 
+       generic = tcase_create("RFC 3986 (generic URI)");
+       tcase_add_test(generic, reserved_unchanged);
+       tcase_add_test(generic, lowercase_scheme_and_host);
+       tcase_add_test(generic, decode_unreserved_characters);
+       tcase_add_test(generic, path_segment_normalization);
+       tcase_add_test(generic, all_the_above_combined);
+       tcase_add_test(generic, scheme_based_normalization);
+
+       https = tcase_create("RFC 9110 (https)");
+       tcase_add_test(https, https_grammar);
+       tcase_add_test(https, https_default_port);
+       tcase_add_test(https, disallow_http_empty_host);
+       tcase_add_test(https, provide_default_path);
+       tcase_add_test(https, scheme_and_host_lowercase);
+       tcase_add_test(https, not_reserved_not_pct_encoded);
+       tcase_add_test(https, aggregated_423);
+       tcase_add_test(https, disallow_https_userinfo);
+
+       rsync = tcase_create("RFC 5781 (rsync)");
+       tcase_add_test(rsync, rsync_grammar);
+       tcase_add_test(rsync, rsync_default_port);
+
        suite = suite_create("url");
        suite_add_tcase(suite, misc);
+       suite_add_tcase(suite, generic);
+       suite_add_tcase(suite, https);
+       suite_add_tcase(suite, rsync);
 
        return suite;
 }