From: Timo Sirainen <timo.sirainen@open-xchange.com>
Date: Wed, 13 Jan 2021 17:50:50 +0000 (+0200)
Subject: lib-imap: Add imap_utf7_to_utf8_escaped() and imap_escaped_utf8_to_utf7()
X-Git-Tag: 2.3.14.rc1~57
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bffa6a9c0cbe5fd906969325e268406017c09ce7;p=thirdparty%2Fdovecot%2Fcore.git

lib-imap: Add imap_utf7_to_utf8_escaped() and imap_escaped_utf8_to_utf7()

These can be used to convert invalid mUTF-7 to escaped UTF-8 and back to the
original invalid input.
---

diff --git a/src/lib-imap/imap-utf7.c b/src/lib-imap/imap-utf7.c
index ba97d90551..7ea53f535a 100644
--- a/src/lib-imap/imap-utf7.c
+++ b/src/lib-imap/imap-utf7.c
@@ -55,25 +55,50 @@ mbase64_encode(string_t *dest, const unsigned char *in, size_t len)
 	str_append_c(dest, '-');
 }
 
-static const char *imap_utf8_first_encode_char(const char *str)
+static const char *
+imap_utf8_first_encode_char(const char *str, char escape_char)
 {
 	const char *p;
 
 	for (p = str; *p != '\0'; p++) {
-		if (*p == '&' || *p < 0x20 || *p >= 0x7f)
+		if (*p == '&' || *p < 0x20 || *p >= 0x7f || *p == escape_char)
 			return p;
 	}
 	return NULL;
 }
 
-int imap_utf8_to_utf7(const char *src, string_t *dest)
+int imap_escaped_utf8_hex_to_char(const char *str, unsigned char *chr_r)
+{
+	unsigned char value = 0, nibble;
+	unsigned int pos;
+
+	/* Parse exactly two hex digits, high nibble first. Uppercase A-F is
+	   rejected on purpose: every byte then has a single escaped form,
+	   which keeps the escaping reversible. *chr_r is written only on
+	   success. */
+	for (pos = 0; pos < 2; pos++) {
+		if (str[pos] >= '0' && str[pos] <= '9')
+			nibble = str[pos] - '0';
+		else if (str[pos] >= 'a' && str[pos] <= 'f')
+			nibble = str[pos] - 'a' + 10;
+		else
+			return -1;
+		value = value * 0x10 + nibble;
+	}
+	*chr_r = value;
+	return 0;
+}
+
+static int
+imap_utf8_to_utf7_int(const char *src, char escape_char, string_t *dest)
 {
 	const char *p;
 	unichar_t chr;
 	uint8_t *utf16, *u;
 	uint16_t u16;
+	unsigned char c;
 
-	p = imap_utf8_first_encode_char(src);
+	p = imap_utf8_first_encode_char(src, escape_char);
 	if (p == NULL) {
 		/* no characters that need to be encoded */
 		str_append(dest, src);
@@ -84,6 +109,12 @@ int imap_utf8_to_utf7(const char *src, string_t *dest)
 	str_append_data(dest, src, p-src);
 	utf16 = t_malloc0(MALLOC_MULTIPLY(strlen(p), 2));
 	while (*p != '\0') {
+		if (*p == escape_char &&
+		    imap_escaped_utf8_hex_to_char(p+1, &c) == 0) {
+			str_append_c(dest, c);
+			p += 3;
+			continue;
+		}
 		if (*p == '&') {
 			str_append(dest, "&-");
 			p++;
@@ -118,12 +149,24 @@ int imap_utf8_to_utf7(const char *src, string_t *dest)
 	return 0;
 }
 
+int imap_utf8_to_utf7(const char *src, string_t *dest)
+{
+	return imap_utf8_to_utf7_int(src, '\0', dest);
+}
+
+int imap_escaped_utf8_to_utf7(const char *src, char escape_char, string_t *dest)
+{
+	i_assert(escape_char != '&');
+
+	return imap_utf8_to_utf7_int(src, escape_char, dest);
+}
+
 int t_imap_utf8_to_utf7(const char *src, const char **dest_r)
 {
 	string_t *str;
 	int ret;
 
-	if (imap_utf8_first_encode_char(src) == NULL) {
+	if (imap_utf8_first_encode_char(src, '\0') == NULL) {
 		*dest_r = src;
 		return 0;
 	}
@@ -253,14 +296,18 @@ static int mbase64_decode_to_utf8(string_t *dest, const char **_src)
 	return 0;
 }
 
-int imap_utf7_to_utf8(const char *src, string_t *dest)
+static int
+imap_utf7_to_utf8_int(const char *src, const char *escape_chars, string_t *dest)
 {
 	const char *p;
 
 	for (p = src; *p != '\0'; p++) {
-		if (*p < 0x20 || *p >= 0x7f)
-			return -1;
-		if (*p == '&')
+		if (*p < 0x20 || *p >= 0x7f) {
+			if (escape_chars[0] == '\0')
+				return -1;
+			break;
+		}
+		if (*p == '&' || strchr(escape_chars, *p) != NULL)
 			break;
 	}
 	if (*p == '\0') {
@@ -272,13 +319,29 @@ int imap_utf7_to_utf8(const char *src, string_t *dest)
 	/* at least one encoded character */
 	str_append_data(dest, src, p-src);
 	while (*p != '\0') {
-		if (*p == '&') {
+		if (strchr(escape_chars, *p) != NULL ||
+		    *p < 0x20 || *p >= 0x7f) {
+			/* Without an escape character this byte can't be
+			   represented: report invalid input instead of
+			   writing a NUL byte followed by hex into dest. */
+			if (escape_chars[0] == '\0')
+				return -1;
+			str_printfa(dest, "%c%02x", escape_chars[0],
+				    (unsigned char)*p);
+			p++;
+		} else if (*p == '&') {
 			if (*++p == '-') {
 				str_append_c(dest, '&');
 				p++;
 			} else {
-				if (mbase64_decode_to_utf8(dest, &p) < 0)
-					return -1;
+				size_t orig_size = str_len(dest);
+				if (mbase64_decode_to_utf8(dest, &p) < 0) {
+					if (escape_chars[0] == '\0')
+						return -1;
+					str_truncate(dest, orig_size);
+					/* escaped '&' (0x26) */
+					str_printfa(dest, "%c26", escape_chars[0]);
+				}
 			}
 		} else {
 			str_append_c(dest, *p++);
@@ -287,6 +350,20 @@ int imap_utf7_to_utf8(const char *src, string_t *dest)
 	return 0;
 }
 
+int imap_utf7_to_utf8(const char *src, string_t *dest)
+{
+	return imap_utf7_to_utf8_int(src, "", dest);
+}
+
+void imap_utf7_to_utf8_escaped(const char *src, const char *escape_chars,
+			       string_t *dest)
+{
+	i_assert(escape_chars[0] != '&');
+
+	if (imap_utf7_to_utf8_int(src, escape_chars, dest) < 0)
+		i_unreached();
+}
+
 bool imap_utf7_is_valid(const char *src)
 {
 	const char *p;
diff --git a/src/lib-imap/imap-utf7.h b/src/lib-imap/imap-utf7.h
index 5d7875f198..d7ae3066e6 100644
--- a/src/lib-imap/imap-utf7.h
+++ b/src/lib-imap/imap-utf7.h
@@ -5,9 +5,23 @@
    valid UTF-8. */
 int imap_utf8_to_utf7(const char *src, string_t *dest);
 int t_imap_utf8_to_utf7(const char *src, const char **dest_r);
+/* Like imap_utf8_to_utf7(), but decode all <escape_char><hex> instances.
+   Returns -1 if src isn't valid UTF-8. Note that invalid <escape_char> content
+   isn't treated as an error - it's simply passed through. */
+int imap_escaped_utf8_to_utf7(const char *src, char escape_char, string_t *dest);
+/* For manually parsing the <hex> (exactly two lowercase hex digits) after
+   <escape_char>. Returns 0 and sets *chr_r on success, -1 on invalid <hex>. */
+int imap_escaped_utf8_hex_to_char(const char *str, unsigned char *chr_r);
+
 /* Convert IMAP-UTF-7 string to UTF-8. Returns 0 if ok, -1 if src isn't
    valid IMAP-UTF-7. */
 int imap_utf7_to_utf8(const char *src, string_t *dest);
+/* Like imap_utf7_to_utf8(), but write invalid input as <escape_chars[0]><hex>.
+   All the characters in escape_chars[] are escaped in the same way. This
+   allows converting the escaped output back to the original (broken)
+   IMAP-UTF-7 input. */
+void imap_utf7_to_utf8_escaped(const char *src, const char *escape_chars,
+			       string_t *dest);
 /* Returns TRUE if the string is valid IMAP-UTF-7 string. */
 bool imap_utf7_is_valid(const char *src);
 
diff --git a/src/lib-imap/test-imap-utf7.c b/src/lib-imap/test-imap-utf7.c
index b487e04e6d..216eebf4a7 100644
--- a/src/lib-imap/test-imap-utf7.c
+++ b/src/lib-imap/test-imap-utf7.c
@@ -22,19 +22,26 @@ static void test_imap_utf7_by_example(void)
 		{ NULL, "&Jjo!" },
 		{ NULL, "&U,BTFw-&ZeVnLIqe-" } /* unnecessary shift */
 	};
-	string_t *dest;
+	string_t *dest, *dest2;
 	unsigned int i;
 
 	dest = t_str_new(256);
+	dest2 = t_str_new(256);
 
 	test_begin("imap mutf7 examples");
 	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		str_truncate(dest, 0);
 		if (tests[i].utf8 != NULL) {
-			str_truncate(dest, 0);
 			if (imap_utf8_to_utf7(tests[i].utf8, dest) < 0)
 				test_assert_idx(tests[i].mutf7 == NULL, i);
 			else
 				test_assert_idx(null_strcmp(tests[i].mutf7, str_c(dest)) == 0, i);
+		} else {
+			/* invalid mUTF-7 - test that escaping works */
+			str_truncate(dest2, 0);
+			imap_utf7_to_utf8_escaped(tests[i].mutf7, "%", dest);
+			imap_escaped_utf8_to_utf7(str_c(dest), '%', dest2);
+			test_assert_idx(strcmp(tests[i].mutf7, str_c(dest2)) == 0, i);
 		}
 		if (tests[i].mutf7 != NULL) {
 			str_truncate(dest, 0);
@@ -45,6 +52,15 @@ static void test_imap_utf7_by_example(void)
 			test_assert_idx(imap_utf7_is_valid(tests[i].mutf7) != (tests[i].utf8 == NULL), i);
 		}
 	}
+
+	str_truncate(dest, 0);
+	imap_utf7_to_utf8_escaped(".foo%", "%.", dest);
+	test_assert_strcmp(str_c(dest), "%2efoo%25");
+
+	str_truncate(dest, 0);
+	test_assert(imap_escaped_utf8_to_utf7("%foo%2ebar", '%', dest) == 0);
+	test_assert_strcmp(str_c(dest), "%foo.bar");
+
 	test_end();
 }
 
@@ -85,9 +101,12 @@ static void test_imap_utf7_ucs4_cases(void)
 static const char mb64[64]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
 static void test_imap_utf7_non_utf16(void)
 {
+	string_t *dest, *dest2;
 	unsigned int i;
 
 	test_begin("imap mutf7 non-utf16");
+	dest = t_str_new(32);
+	dest2 = t_str_new(32);
 	for (i = 0; i <= 255; ++i) {
 		/* Invalid, code a single 8-bit octet */
 		const char csrc[] = {
@@ -98,6 +117,13 @@ static void test_imap_utf7_non_utf16(void)
 			'\0'
 		};
 		test_assert_idx(!imap_utf7_is_valid(csrc), i);
+
+		/* escaping can reverse the original string */
+		str_truncate(dest, 0);
+		str_truncate(dest2, 0);
+		imap_utf7_to_utf8_escaped(csrc, "%", dest);
+		imap_escaped_utf8_to_utf7(str_c(dest), '%', dest2);
+		test_assert_idx(strcmp(csrc, str_c(dest2)) == 0, i);
 	}
 	for (i = 0; i <= 255; ++i) {
 		/* Invalid, U+00E4 followed by a single octet */
@@ -111,6 +137,13 @@ static void test_imap_utf7_non_utf16(void)
 			'\0'
 		};
 		test_assert_idx(!imap_utf7_is_valid(csrc), i);
+
+		/* escaping can reverse the original string */
+		str_truncate(dest, 0);
+		str_truncate(dest2, 0);
+		imap_utf7_to_utf8_escaped(csrc, "%", dest);
+		imap_escaped_utf8_to_utf7(str_c(dest), '%', dest2);
+		test_assert_idx(strcmp(csrc, str_c(dest2)) == 0, i);
 	}
 	test_end();
 }