/*
* XXX IPv6 addresses
- * XXX UTF-8
*/
-#define URI_ALLOW_UNKNOWN_SCHEME (1 << 1)
+#define URI_ALLOW_UNKNOWN_SCHEME (1 << 0)
static error_msg EM_SCHEME_EMPTY = "Scheme seems empty";
static error_msg EM_SCHEME_1ST = "First scheme character is not a letter";
static error_msg EM_SCHEME_NOTREMOTE = "Missing \"://\"";
static error_msg EM_PCT_NOTHEX = "Invalid hexadecimal digit in percent encoding";
static error_msg EM_PCT_NOT3 = "Unterminated percent-encoding";
+static error_msg EM_UTF8 = "Invalid UTF-8";
static error_msg EM_USERINFO_BADCHR = "Illegal character in userinfo component";
static error_msg EM_USERINFO_DISALLOWED = "Protocol disallows userinfo";
static error_msg EM_HOST_BADCHR = "Illegal character in host component";
static error_msg EM_QF_BADCHR = "Illegal character in query or fragment";
static error_msg EM_FRAGMENT_DISALLOWED = "Protocol disallows fragment";
-struct sized_string {
- char const *str;
+struct sized_ustring {
+ unsigned char const *str;
size_t len;
};
};
static bool
-is_proto(struct sized_string *scheme, char const *proto)
+is_proto(struct sized_ustring *scheme, char const *proto)
{
- return strncasecmp(scheme->str, proto, scheme->len) == 0;
+ return strncasecmp((char const *)scheme->str, proto, scheme->len) == 0;
}
static struct schema_metadata const *
-get_metadata(struct sized_string *scheme)
+get_metadata(struct sized_ustring *scheme)
{
if (scheme->len != 5)
return NULL;
}
static bool
-is_lowercase(char chr)
+is_lowercase(unsigned char chr)
{
return 'a' <= chr && chr <= 'z';
}
static bool
-is_uppercase(char chr)
+is_uppercase(unsigned char chr)
{
return 'A' <= chr && chr <= 'Z';
}
static bool
-is_lowercase_hex(char chr)
+is_lowercase_hex(unsigned char chr)
{
return 'a' <= chr && chr <= 'f';
}
static bool
-is_uppercase_hex(char chr)
+is_uppercase_hex(unsigned char chr)
{
return 'A' <= chr && chr <= 'F';
}
static bool
-is_digit(char chr)
+is_digit(unsigned char chr)
{
return '0' <= chr && chr <= '9';
}
static bool
-is_symbol(char chr, char const *symbols)
+is_symbol(unsigned char chr, char const *symbols)
{
for (; symbols[0] != '\0'; symbols++)
if (chr == symbols[0])
}
static char
-to_lowercase(char uppercase)
+to_lowercase(unsigned char uppercase)
{
return uppercase - ('A' - 'a');
}
static char
-to_uppercase(char chr)
+to_uppercase(unsigned char chr)
{
return is_lowercase(chr) ? (chr + ('A' - 'a')) : chr;
}
approve_chara(struct uri_buffer *buf, char chr)
{
if (buf->d >= buf->capacity) {
- /* It seems this is dead code. */
- buf->capacity += 16;
+ /* Needed when we convert UTF-8 to percent-encoding */
+ buf->capacity += 32;
buf->dst = prealloc(buf->dst, buf->capacity);
}
}
static void
-collect_authority(char const *auth, char const **at, char const **colon,
- char const **end)
+collect_authority(unsigned char const *auth, unsigned char const **at,
+ unsigned char const **colon, unsigned char const **end)
{
*at = NULL;
*colon = NULL;
}
static void
-collect_path(char const *path, char const **end)
+collect_path(unsigned char const *path, unsigned char const **end)
{
for (; true; path++)
if (path[0] == '\0' || path[0] == '?' || path[0] == '#') {
}
static void
-collect_query(char const *query, char const **end)
+collect_query(unsigned char const *query, unsigned char const **end)
{
for (; true; query++)
if (query[0] == '\0' || query[0] == '#') {
}
static void
-collect_fragment(char const *fragment, char const **end)
+collect_fragment(unsigned char const *fragment, unsigned char const **end)
{
for (; true; fragment++)
if (fragment[0] == '\0') {
}
static error_msg
-normalize_scheme(struct uri_buffer *buf, struct sized_string *scheme)
+normalize_scheme(struct uri_buffer *buf, struct sized_ustring *scheme)
{
- char chr;
+ unsigned char chr;
array_index c;
chr = scheme->str[0];
}
static bool
-is_unreserved(char chr)
+is_unreserved(unsigned char chr)
{
return is_lowercase(chr)
|| is_uppercase(chr)
}
static bool
-is_subdelim(char chr)
+is_subdelim(unsigned char chr)
{
return is_symbol(chr, "!$&'()*+,;=");
}
static error_msg
-char2hex(char chr, unsigned int *hex)
+uchar2hex(unsigned char chr, unsigned int *hex)
{
if (is_digit(chr)) {
*hex = chr - '0';
}
static error_msg
-approve_pct_encoded(struct uri_buffer *buf, struct sized_string *sstr,
+approve_pct_encoded(struct uri_buffer *buf, struct sized_ustring *sstr,
array_index *offset)
{
array_index off;
if (sstr->len - off < 3)
return EM_PCT_NOT3;
- error = char2hex(sstr->str[off + 1], &hex1);
+ error = uchar2hex(sstr->str[off + 1], &hex1);
if (error)
return error;
- error = char2hex(sstr->str[off + 2], &hex2);
+ error = uchar2hex(sstr->str[off + 2], &hex2);
if (error)
return error;
return NULL;
}
+/*
+ * Tells whether @chr has its most significant bit set, i.e. whether it is
+ * something other than a 7-bit ASCII octet.
+ *
+ * Despite the name, this does not validate anything: lead and continuation
+ * octets both return true. Sequence validation is done by approve_utf8().
+ */
+static bool
+is_utf8(unsigned char chr)
+{
+	return (chr & 0x80) != 0;
+}
+
+/*
+ * Converts a nibble into its uppercase hexadecimal digit:
+ * 0-9 become '0'-'9', and 10-15 become 'A'-'F'.
+ *
+ * @bin is not range-checked; callers in this file only pass values
+ * already reduced to four bits.
+ */
+static char
+bin2hex(unsigned char bin)
+{
+	if (bin < 10)
+		return '0' + bin;
+	return 'A' + (bin - 10);
+}
+
+/*
+ * Hands the percent-encoded form of @chr to approve_chara(), one character
+ * at a time: a '%', then the high nibble, then the low nibble, both as
+ * uppercase hexadecimal digits.
+ */
+static void
+approve_bin(struct uri_buffer *buf, unsigned char chr)
+{
+	unsigned char const high = chr >> 4;
+	unsigned char const low = chr & 0xF;
+
+	approve_chara(buf, '%');
+	approve_chara(buf, bin2hex(high));
+	approve_chara(buf, bin2hex(low));
+}
+
+/*
+ * Percent-encodes the multi-octet UTF-8 sequence that starts at
+ * @sstr->str[*offset], passing every octet of it to approve_bin().
+ *
+ * The lead octet decides the expected width (110xxxxx: 2, 1110xxxx: 3,
+ * 11110xxx: 4); every following octet must look like 10xxxxxx, and the
+ * whole sequence must fit in what remains of @sstr.
+ *
+ * On success, returns NULL and advances *offset to the LAST octet of the
+ * sequence, so the caller's own loop increment steps past it.
+ * On failure, returns EM_UTF8 and leaves @buf and *offset untouched.
+ *
+ * NOTE(review): only the bit patterns are checked. Overlong forms (such as
+ * 0xC0 0x80), UTF-16 surrogates and lead octets 0xF5-0xF7 are all accepted.
+ * Confirm whether that leniency is intended.
+ */
static error_msg
-normalize_userinfo(struct uri_buffer *buf, struct sized_string *userinfo)
+approve_utf8(struct uri_buffer *buf, struct sized_ustring *sstr,
+    array_index *offset)
+{
+	array_index off;
+	size_t remaining;
+	size_t width;
+	size_t i;
+	unsigned char lead;
+
+	off = *offset;
+	remaining = sstr->len - off;
+
+	/* No multi-octet sequence is shorter than two octets. */
+	if (remaining < 2)
+		return EM_UTF8;
+
+	lead = sstr->str[off];
+	if ((lead & 0xE0) == 0xC0)
+		width = 2;
+	else if ((lead & 0xF0) == 0xE0)
+		width = 3;
+	else if ((lead & 0xF8) == 0xF0)
+		width = 4;
+	else
+		return EM_UTF8; /* Stray continuation octet, or 11111xxx */
+
+	if (remaining < width)
+		return EM_UTF8;
+	for (i = 1; i < width; i++)
+		if ((sstr->str[off + i] & 0xC0) != 0x80)
+			return EM_UTF8;
+
+	/* Validated; nothing was emitted before this point. */
+	for (i = 0; i < width; i++)
+		approve_bin(buf, sstr->str[off + i]);
+
+	*offset += width - 1;
+	return NULL;
+}
+
+static error_msg
+normalize_userinfo(struct uri_buffer *buf, struct sized_ustring *userinfo)
{
array_index c;
- char chr;
+ unsigned char chr;
error_msg error;
if (userinfo->len == 0)
approve_chara(buf, chr);
else if (chr == ':')
approve_chara(buf, chr);
- else
+ else if (is_utf8(chr)) {
+ error = approve_utf8(buf, userinfo, &c);
+ if (error)
+ return error;
+ } else
return EM_USERINFO_BADCHR;
}
}
static error_msg
-normalize_host(struct uri_buffer *buf, struct sized_string *host)
+normalize_host(struct uri_buffer *buf, struct sized_ustring *host)
{
array_index c;
- char chr;
+ unsigned char chr;
error_msg error;
for (c = 0; c < host->len; c++) {
return error;
} else if (is_subdelim(chr))
approve_chara(buf, chr);
- else
+ else if (is_utf8(chr)) {
+ error = approve_utf8(buf, host, &c);
+ if (error)
+ return error;
+ } else
return EM_HOST_BADCHR;
}
}
static error_msg
-normalize_port(struct uri_buffer *buf, struct sized_string *port,
+normalize_port(struct uri_buffer *buf, struct sized_ustring *port,
struct schema_metadata const *schema)
{
array_index c;
- char chr;
+ unsigned char chr;
unsigned int portnum;
if (port->len == 0)
return NULL;
}
-static char const *
-strnchr(char const *str, size_t n, char chr)
+static unsigned char const *
+strnchr(unsigned char const *str, size_t n, unsigned char chr)
{
array_index s;
for (s = 0; s < n; s++)
}
static bool
-next_segment(struct sized_string *path, struct sized_string *segment)
+next_segment(struct sized_ustring *path, struct sized_ustring *segment)
{
segment->str += segment->len + 1;
if (segment->str > (path->str + path->len))
}
static error_msg
-normalize_path(struct uri_buffer *buf, struct sized_string *path)
+normalize_path(struct uri_buffer *buf, struct sized_ustring *path)
{
- struct sized_string segment;
+ struct sized_ustring segment;
array_index i;
- char chr;
+ unsigned char chr;
size_t limit;
error_msg error;
return error;
} else if (is_subdelim(chr) || is_symbol(chr, ":@"))
approve_chara(buf, chr);
- else
+ else if (is_utf8(chr)) {
+ error = approve_utf8(buf, &segment, &i);
+ if (error)
+ return error;
+ } else
return EM_PATH_BADCHR;
}
}
static error_msg
-normalize_post_path(struct uri_buffer *buf, struct sized_string *post,
+normalize_post_path(struct uri_buffer *buf, struct sized_ustring *post,
char prefix)
{
array_index c;
- char chr;
+ unsigned char chr;
error_msg error;
if (post->len == 0)
approve_chara(buf, chr);
else if (is_symbol(chr, ":@/?"))
approve_chara(buf, chr);
- else
+ else if (is_utf8(chr)) {
+ error = approve_utf8(buf, post, &c);
+ if (error)
+ return error;
+ } else
return EM_QF_BADCHR;
}
}
static void
-print_component(char const *name, struct sized_string *component)
+print_component(char const *name, struct sized_ustring *component)
{
pr_clutter(" %s: %.*s (len:%zu)", name, (int)component->len,
component->str, component->len);
* and needs to be released.
*/
static error_msg
-url_normalize(char const *url, int flags, char **result)
+url_normalize(unsigned char const *url, int flags, char **result)
{
- struct sized_string scheme;
- struct sized_string authority;
- struct sized_string userinfo;
- struct sized_string host;
- struct sized_string port;
- struct sized_string path;
- struct sized_string query;
- struct sized_string fragment;
+ struct sized_ustring scheme;
+ struct sized_ustring authority;
+ struct sized_ustring userinfo;
+ struct sized_ustring host;
+ struct sized_ustring port;
+ struct sized_ustring path;
+ struct sized_ustring query;
+ struct sized_ustring fragment;
- char const *cursor;
- char const *at;
- char const *colon;
+ unsigned char const *cursor;
+ unsigned char const *at;
+ unsigned char const *colon;
struct schema_metadata const *meta;
struct uri_buffer buf;
pr_clutter("-----------------------");
pr_clutter("input: %s", url);
- cursor = strchr(url, ':');
+ cursor = (unsigned char const *)strchr((char const *)url, ':');
if (!cursor)
return EM_SCHEME_NOCOLON;
if (cursor == url)
char *normal;
error_msg error;
- error = url_normalize(str, 0, &normal);
+ error = url_normalize((unsigned char const *)str, 0, &normal);
if (error)
return error;
END_TEST
#define TEST_NORMALIZE(dirty, clean) \
- ck_assert_pstr_eq(NULL, url_normalize(dirty, 0, &normal)); \
+ ck_assert_pstr_eq(NULL, url_normalize( \
+ (unsigned char *)dirty, 0, &normal \
+ )); \
ck_assert_str_eq(clean, normal); \
free(normal)
#define TEST_NORMALIZE_AUS(dirty, clean) \
ck_assert_ptr_eq(NULL, url_normalize( \
- dirty, URI_ALLOW_UNKNOWN_SCHEME, &normal \
+ (unsigned char *)dirty, URI_ALLOW_UNKNOWN_SCHEME, &normal \
)); \
ck_assert_str_eq(clean, normal); \
free(normal)
#define TEST_NORMALIZE_FAIL(dirty, error) \
- ck_assert_str_eq(error, url_normalize(dirty, 0, &normal));
+ ck_assert_str_eq(error, url_normalize( \
+ (unsigned char *)dirty, 0, &normal \
+ ));
#define TEST_NORMALIZE_FAIL_AUS(dirty, error) \
ck_assert_str_eq(error, url_normalize( \
- dirty, URI_ALLOW_UNKNOWN_SCHEME, &normal \
+ (unsigned char *)dirty, URI_ALLOW_UNKNOWN_SCHEME, &normal \
));
START_TEST(awkward_dot_dotting)
TEST_NORMALIZE("https://%7C/", "https://%7C/");
TEST_NORMALIZE("https://%7c/", "https://%7C/");
+ TEST_NORMALIZE("https://a%6fa/", "https://aoa/");
+ TEST_NORMALIZE("https://a%7ca/", "https://a%7Ca/");
+
TEST_NORMALIZE_FAIL("https://%6G", EM_PCT_NOTHEX);
TEST_NORMALIZE_FAIL("https://%G6", EM_PCT_NOTHEX);
}
END_TEST
+/*
+ * One representative octet for each UTF-8 octet class, used to build every
+ * 4-octet arrangement in the test_utf8 test.
+ *
+ * These are enumeration constants rather than `static const` objects
+ * because CHRS has static storage duration, so ISO C requires its
+ * initializers to be constant expressions. A const-qualified object is not
+ * one; initializing CHRS from such objects only builds on compilers that
+ * accept it as an extension.
+ */
+enum {
+	ASCI = 'a',	/* 0_______ */
+	CONT = 0x80,	/* 10______ */
+	DUO = 0xC0,	/* 110_____ */
+	TRIO = 0xE0,	/* 1110____ */
+	QUAD = 0xF0	/* 11110___ */
+};
+
+/* Zero-terminated, so it can be walked until the sentinel is reached. */
+static unsigned char const CHRS[] = { ASCI, CONT, DUO, TRIO, QUAD, 0 };
+
+/*
+ * Places the four given octets in the host of "https://----/" and asserts
+ * that normalization rejects the result with EM_UTF8.
+ *
+ * Arrangements the normalizer is expected to accept are skipped: they
+ * would not produce EM_UTF8.
+ */
+static void
+test_utf8_fail(unsigned char chr1, unsigned char chr2,
+    unsigned char chr3, unsigned char chr4)
+{
+	/* Automatic storage: the initializers need not be constant. */
+	unsigned char const accepted[][4] = {
+		{ ASCI, ASCI, ASCI, ASCI },
+		{ ASCI, ASCI, DUO,  CONT },
+		{ ASCI, DUO,  CONT, ASCI },
+		{ DUO,  CONT, ASCI, ASCI },
+		{ DUO,  CONT, DUO,  CONT },
+		{ ASCI, TRIO, CONT, CONT },
+		{ TRIO, CONT, CONT, ASCI },
+		{ QUAD, CONT, CONT, CONT },
+	};
+	size_t const total = sizeof(accepted) / sizeof(accepted[0]);
+	char messy[32];
+	char *normal;
+	size_t a;
+
+	for (a = 0; a < total; a++) {
+		if (chr1 == accepted[a][0] && chr2 == accepted[a][1] &&
+		    chr3 == accepted[a][2] && chr4 == accepted[a][3])
+			return;
+	}
+
+	/* Overwrite the four dashes with the octets under test. */
+	strcpy(messy, "https://----/");
+	messy[8] = chr1;
+	messy[9] = chr2;
+	messy[10] = chr3;
+	messy[11] = chr4;
+	TEST_NORMALIZE_FAIL(messy, EM_UTF8);
+}
+
+/*
+ * Checks that non-ASCII octets are percent-encoded in every component that
+ * tolerates them, are rejected in the scheme and the port, and that
+ * malformed sequences yield EM_UTF8.
+ */
+START_TEST(test_utf8)
+{
+	char *normal;
+	array_index c1, c2, c3, c4;
+
+	/* Host: one sample each of 2, 3 and 4 octets */
+	TEST_NORMALIZE("https://a.β.c/", "https://a.%CE%B2.c/");
+	TEST_NORMALIZE("https://a.砦.c/", "https://a.%E7%A0%A6.c/");
+	TEST_NORMALIZE("https://a.𝆑.c/", "https://a.%F0%9D%86%91.c/");
+
+	/* Remaining components, in URL order */
+	TEST_NORMALIZE_FAIL_AUS("βsync://a.b.c/", EM_SCHEME_1ST);
+	TEST_NORMALIZE_FAIL_AUS("rsβnc://a.b.c/", EM_SCHEME_NTH);
+	TEST_NORMALIZE("rsync://β@a.b.c/", "rsync://%CE%B2@a.b.c/");
+	TEST_NORMALIZE_FAIL("rsync://a.b.c:β/", EM_PORT_BADCHR);
+	TEST_NORMALIZE("https://a.b.c/β", "https://a.b.c/%CE%B2");
+	TEST_NORMALIZE("https://a.b.c/?β", "https://a.b.c/?%CE%B2");
+	TEST_NORMALIZE("https://a.b.c/#β", "https://a.b.c/#%CE%B2");
+
+	/* Every 4-octet arrangement of the classes listed in CHRS */
+	for (c1 = 0; CHRS[c1]; c1++) {
+		for (c2 = 0; CHRS[c2]; c2++) {
+			for (c3 = 0; CHRS[c3]; c3++) {
+				for (c4 = 0; CHRS[c4]; c4++) {
+					test_utf8_fail(CHRS[c1], CHRS[c2],
+					    CHRS[c3], CHRS[c4]);
+				}
+			}
+		}
+	}
+}
+END_TEST
+
START_TEST(test_unknown_protocols)
{
char *normal;
TEST_NORMALIZE_FAIL("https:", EM_SCHEME_NOTREMOTE);
TEST_NORMALIZE_FAIL("https:/", EM_SCHEME_NOTREMOTE);
TEST_NORMALIZE_FAIL("https://", EM_HOST_EMPTY);
- TEST_NORMALIZE_FAIL("https://a.β.c/", EM_HOST_BADCHR);
- TEST_NORMALIZE_FAIL("https://a.b.c/β", EM_PATH_BADCHR);
- /* I think everything else is already tested below. */
+ /* I think everything else is already tested elsewhere. */
}
END_TEST
TEST_NORMALIZE_FAIL("rsync:", EM_SCHEME_NOTREMOTE);
TEST_NORMALIZE_FAIL("rsync:/", EM_SCHEME_NOTREMOTE);
TEST_NORMALIZE_FAIL("rsync://", EM_HOST_EMPTY);
- TEST_NORMALIZE_FAIL("rsync://a.β.c/", EM_HOST_BADCHR);
- TEST_NORMALIZE_FAIL("rsync://a.b.c/β", EM_PATH_BADCHR);
TEST_NORMALIZE("rsync://a.b.c/m", "rsync://a.b.c/m");
TEST_NORMALIZE("rsync://a.b.c/m/r", "rsync://a.b.c/m/r");
tcase_add_test(misc, test_unknown_protocols);
tcase_add_test(misc, awkward_dot_dotting);
tcase_add_test(misc, test_same_origin);
+ tcase_add_test(misc, test_utf8);
generic = tcase_create("RFC 3986 (generic URI)");
tcase_add_test(generic, pct_encoding);