git.ipfire.org Git - thirdparty/FORT-validator.git/commitdiff
Figure out URI normalization
authorAlberto Leiva Popper <ydahhrk@gmail.com>
Thu, 3 Apr 2025 23:45:23 +0000 (17:45 -0600)
committerAlberto Leiva Popper <ydahhrk@gmail.com>
Thu, 3 Apr 2025 23:48:09 +0000 (17:48 -0600)
I haven't actually found much incentive to justify the normalization,
but libcurl provides a (still flawed as of 8.12.1, but workable) API to
do it effortlessly.

This is better than the previous implementation, and future-proof
enough.

src/types/url.c
test/Makefile.am
test/types/url_test.c

index bfc5a7207c457859012f44fbe4a7978e8f5054d5..4bf47d969e1b499ff47676cc32935309d67d2cf9 100644 (file)
@@ -1,5 +1,7 @@
 #include "types/url.h"
 
+#include <curl/curl.h>
+
 #include "alloc.h"
 #include "common.h"
 #include "types/path.h"
@@ -17,119 +19,82 @@ url_is_https(char const *url)
 }
 
 /*
- * XXX use this:
- *
- *     for (s = str; s[0] != '\0'; s++) {
-               error = validate_url_character(s[0]);
-               if (error)
-                       return error;
-       }
- *
  * @character is an integer because we sometimes receive signed chars, and other
  * times we get unsigned chars.
  * Casting a negative char into a unsigned char is undefined behavior.
  */
-//static int
-//validate_url_character(int character)
-//{
-//     /*
-//      * RFCs 1738 and 3986 define a very specific range of allowed
-//      * characters, but I don't think we're that concerned about URL
-//      * correctness. Validating the URL properly is more involved than simply
-//      * checking legal characters, anyway.
-//      *
-//      * What I really need this validation for is ensure that we won't get
-//      * any trouble later, when we attempt to map the URL to a path.
-//      *
-//      * Sample trouble: Getting UTF-8 characters. Why are they trouble?
-//      * Because we don't have any guarantees that the system's file name
-//      * encoding is UTF-8. URIs are not supposed to contain UTF-8 in the
-//      * first place, so we have no reason to deal with encoding conversion.
-//      *
-//      * To be perfectly fair, we have no guarantees that the system's file
-//      * name encoding is ASCII-compatible either, but I need to hang onto
-//      * SOMETHING.
-//      *
-//      * (Asking users to use UTF-8 is fine, but asking users to use something
-//      * ASCII-compatible is a little better.)
-//      *
-//      * So just make sure that the character is printable ASCII.
-//      *
-//      * TODO (next iteration) Consider exhaustive URL validation.
-//      */
-//     return (0x20 <= character && character <= 0x7E)
-//         ? 0
-//         : pr_val_err("URL has non-printable character code '%d'.", character);
-//}
-
-static char *
-path_rewind(char const *root, char *cursor)
+static int
+validate_url_character(int character)
 {
-       for (cursor -= 2; root <= cursor; cursor--)
-               if (*cursor == '/')
-                       return cursor + 1;
-       return NULL;
+       return (0x20 <= character && character <= 0x7E)
+           ? 0
+           : pr_val_err("URL has non-printable character code '%d'.", character);
 }
 
-static bool
-has_bad_prefix(char const *url)
+/* Not done by libcurl, apparently */
+static int
+validate_url_characters(char const *str)
 {
-       // XXX what happens if code expects one but url is the other
-       if (strncmp(url, "rsync://", RPKI_SCHEMA_LEN) &&
-           strncmp(url, "https://", RPKI_SCHEMA_LEN))
-               return true;
-
-       /* Disallow the root domain */
-       url += RPKI_SCHEMA_LEN;
-       if (url[0] == '/')
-               return true;
-       if (url[0] == '.' && url[1] == '/')
-               return true;
-
-       // XXX read the standard and reject more bad URLs
-       return false;
+       char const *s;
+       int error;
+
+       for (s = str; s[0] != '\0'; s++) {
+               error = validate_url_character(s[0]);
+               if (error)
+                       return error;
+       }
+
+       return 0;
 }
 
 /*
- * Collapses '//' (except from the schema), '.' and '..'.
+ * See RFC 3986. Basically, "rsync://%61.b/./c/.././%64/." -> "rsync://a.b/d"
+ *
+ * This is not actually a perfect normalization, because it's deferred to curl,
+ * whose implementation is somewhat flawed (at least until version 8.12.1):
+ * https://github.com/curl/curl/issues/16829
  *
- * "rsync://a.b/./c//.././/d/." -> "rsync://a.b/d"
+ * On the other hand, since Fort 2 no longer maps URI paths to literal local
+ * paths, all normalization does for us is prevent some theoretical redundant
+ * downloading, so it might not even be that necessary.
  */
 char *
 url_normalize(char const *url)
 {
-       char *normal, *dst;
-       struct tokenizer tkn;
+       CURLU *curlu;
+       char *curl_normal;
+       char *normal;
+       CURLUcode err;
 
-       if (has_bad_prefix(url))
+       if (validate_url_characters(url))
                return NULL;
 
-       normal = pstrdup(url);
-       dst = normal + RPKI_SCHEMA_LEN;
-       token_init(&tkn, url + RPKI_SCHEMA_LEN);
-
-       while (token_next(&tkn)) {
-               if (tkn.len == 1 && tkn.str[0] == '.')
-                       continue;
-               if (tkn.len == 2 && tkn.str[0] == '.' && tkn.str[1] == '.') {
-                       dst = path_rewind(normal + RPKI_SCHEMA_LEN, dst);
-                       if (!dst)
-                               goto fail;
-                       continue;
-               }
-               strncpy(dst, tkn.str, tkn.len);
-               dst[tkn.len] = '/';
-               dst += tkn.len + 1;
-       }
+       curlu = curl_url();
+       if (!curlu)
+               enomem_panic();
+
+       /* The flag is needed by rsync */
+       err = curl_url_set(curlu, CURLUPART_URL, url, CURLU_NON_SUPPORT_SCHEME);
+       if (err)
+               goto einval;
+       err = curl_url_get(curlu, CURLUPART_URL, &curl_normal, 0);
+       if (err)
+               goto einval;
+
+       curl_url_cleanup(curlu);
 
-       /* Reject URL if there's nothing after the schema. Maybe unnecessary. */
-       if (dst == normal + RPKI_SCHEMA_LEN)
-               goto fail;
+       if (strncmp(curl_normal, "rsync://", RPKI_SCHEMA_LEN) &&
+           strncmp(curl_normal, "https://", RPKI_SCHEMA_LEN)) {
+               curl_free(curl_normal);
+               return NULL;
+       }
 
-       dst[-1] = '\0';
+       normal = pstrdup(curl_normal);
+       curl_free(curl_normal);
        return normal;
 
-fail:  free(normal);
+einval:        pr_val_err("Error parsing URL: %s", curl_url_strerror(err));
+       curl_url_cleanup(curlu);
        return NULL;
 }
 
index b220fecff19272c1ca45f2cd2903f90cfa4830cc..84d4f18bd08b448cdd64e4d012189388990da8ff 100644 (file)
@@ -106,7 +106,7 @@ thread_pool_test_LDADD =    ${CHECK_LIBS}
 
 check_PROGRAMS +=              url.test
 url_test_SOURCES =             types/url_test.c
-url_test_LDADD =               ${CHECK_LIBS}
+url_test_LDADD =               ${CHECK_LIBS} ${CURL_LIBS}
 
 check_PROGRAMS +=              uthash.test
 uthash_test_SOURCES =          types/uthash_test.c
index f62bfec576d6b5025d62c86cf6241dd9fb63f537..57bfeee5d4a668e9b6cfa7c4268b861143305d36 100644 (file)
@@ -16,18 +16,18 @@ START_TEST(test_normalize)
 {
        char *normal;
 
-       TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c");
-       TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c");
+       TEST_NORMALIZE("rsync://a.b.c", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/", "rsync://a.b.c/");
        TEST_NORMALIZE("rsync://a.b.c/d", "rsync://a.b.c/d");
-       TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c");
+       TEST_NORMALIZE("rsync://a.b.c//////", "rsync://a.b.c//////");
        TEST_NORMALIZE("rsync://a.b.c/d/e", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/.", "rsync://a.b.c/d/e/");
+       TEST_NORMALIZE("rsync://a.b.c/././d/././e/./.", "rsync://a.b.c/d/e/");
+       TEST_NORMALIZE("rsync://a.b.c/d/..", "rsync://a.b.c/");
        TEST_NORMALIZE("rsync://a.b.c/x/../x/y/z", "rsync://a.b.c/x/y/z");
-       TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e");
-       TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x/m/n/o");
+       TEST_NORMALIZE("rsync://a.b.c/d/../d/../d/e/", "rsync://a.b.c/d/e/");
+       TEST_NORMALIZE("rsync://x//y/z/../../m/./n/o", "rsync://x//m/n/o");
 
        ck_assert_ptr_eq(NULL, url_normalize(""));
        ck_assert_ptr_eq(NULL, url_normalize("h"));
@@ -36,19 +36,32 @@ START_TEST(test_normalize)
        ck_assert_ptr_eq(NULL, url_normalize("https:"));
        ck_assert_ptr_eq(NULL, url_normalize("https:/"));
        ck_assert_ptr_eq(NULL, url_normalize("rsync://"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://."));
-       ck_assert_ptr_eq(NULL, url_normalize("https://./."));
-       ck_assert_ptr_eq(NULL, url_normalize("https://./d"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://.."));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://../.."));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://../d"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/.."));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../.."));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../x"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/../x/y/z"));
-       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.b.c/d/e/../../.."));
+       ck_assert_ptr_eq(NULL, url_normalize("rsync://a.β.c/"));
+
+       TEST_NORMALIZE("rsync://.", "rsync://./");
+       TEST_NORMALIZE("https://./.", "https://./");
+       TEST_NORMALIZE("https://./d", "https://./d");
+       TEST_NORMALIZE("rsync://..", "rsync://../");
+       TEST_NORMALIZE("rsync://../..", "rsync://../");
+       TEST_NORMALIZE("rsync://../d", "rsync://../d");
+       TEST_NORMALIZE("rsync://a.b.c/..", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/../..", "rsync://a.b.c/");
+       TEST_NORMALIZE("rsync://a.b.c/../x", "rsync://a.b.c/x");
+       TEST_NORMALIZE("rsync://a.b.c/../x/y/z", "rsync://a.b.c/x/y/z");
+       TEST_NORMALIZE("rsync://a.b.c/d/e/../../..", "rsync://a.b.c/");
        ck_assert_ptr_eq(NULL, url_normalize("http://a.b.c/d"));
        ck_assert_ptr_eq(NULL, url_normalize("abcde://a.b.c/d"));
+       TEST_NORMALIZE("HTTPS://a.b.c/d", "https://a.b.c/d");
+       TEST_NORMALIZE("rSyNc://a.b.c/d", "rsync://a.b.c/d");
+
+       TEST_NORMALIZE("https://a.b.c:80/d/e", "https://a.b.c:80/d/e");
+       /* TEST_NORMALIZE("https://a.b.c:443/d/e", "https://a.b.c/d/e"); */
+       TEST_NORMALIZE("https://a.b.c:/d/e", "https://a.b.c/d/e");
+
+       /*
+        * XXX make sure libcurl 8.12.2 implements lowercasing domains,
+        * defaulting 443, and maybe reject UTF-8.
+        */
 }
 END_TEST