From: Aki Tuomi <aki.tuomi@open-xchange.com>
Date: Wed, 26 Jun 2024 19:44:32 +0000 (+0300)
Subject: lib: Add punycode decoder
X-Git-Tag: 2.4.0~1556
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eabb4115a76f4ba3beb193f4fdb2f484bdf4da48;p=thirdparty%2Fdovecot%2Fcore.git

lib: Add punycode decoder
---

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 26158edb15..a87d7b7415 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -161,6 +161,7 @@ liblib_la_SOURCES = \
 	process-stat.c \
 	process-title.c \
 	priorityq.c \
+	punycode.c \
 	randgen.c \
 	rand.c \
 	read-full.c \
@@ -321,6 +322,7 @@ headers = \
 	process-stat.h \
 	process-title.h \
 	priorityq.h \
+	punycode.h \
 	randgen.h \
 	read-full.h \
 	restrict-access.h \
@@ -445,6 +447,7 @@ test_lib_SOURCES = \
 	test-primes.c \
 	test-printf-format-fix.c \
 	test-priorityq.c \
+	test-punycode.c \
 	test-random.c \
 	test-seq-range-array.c \
 	test-seq-set-builder.c \
diff --git a/src/lib/punycode.c b/src/lib/punycode.c
new file mode 100644
index 0000000000..b9a802d4d9
--- /dev/null
+++ b/src/lib/punycode.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2024 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "str.h"
+#include "unichar.h"
+#include "punycode.h"
+
+/* Bootstring parameters for Punycode (RFC 3492 section 5) */
+
+static const unsigned int base = 36; /* number of digit values (a-z, 0-9) */
+static const unsigned int tmin = 1; /* lowest digit threshold */
+static const unsigned int tmax = 26; /* highest digit threshold */
+static const unsigned int skew = 38;
+static const unsigned int damp = 700; /* divides the first delta in adapt() */
+static const unsigned int initialBias = 72;
+static const unsigned int initialN = 0x80; /* first non-ASCII code point */
+static const unsigned int delimiter = u'-';
+
+/*
+      code points    digit-values
+      ------------   ----------------------
+      41..5A (A-Z) =  0 to 25, respectively
+      61..7A (a-z) =  0 to 25, respectively
+      30..39 (0-9) = 26 to 35, respectively
+*/
+static inline unsigned int decode_digit(unsigned char cp)
+{
+	/* Letters of either case map to 0..25, decimal digits to 26..35. */
+	if (cp >= 'a' && cp <= 'z')
+		return cp - u'a';
+	if (cp >= 'A' && cp <= 'Z')
+		return cp - u'A';
+	if (cp >= '0' && cp <= '9')
+		return cp - u'0' + 26;
+	return base; /* anything else is not a digit */
+}
+
+/* Bias adaptation function (RFC 3492 section 6.1) */
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, bool firsttime)
+{
+	unsigned int k = 0;
+
+	/* Scale delta down: by damp on the first call, halved afterwards. */
+	delta = firsttime ? delta / damp : delta / 2;
+	delta += delta / numpoints;
+	while (delta > ((base - tmin) * tmax) / 2) {
+		delta /= base - tmin;
+		k += base;
+	}
+
+	return k + (base - tmin + 1) * delta / (delta + skew);
+}
+
+/* Decodes a punycoded string into output, or returns -1 on error. Only the
+   first len bytes of input are read, so input needs no NUL terminator.
+   Output is appended to only when decoding succeeds. */
+int punycode_decode(const char *input, size_t len, string_t *output)
+{
+	ARRAY(unichar_t) label;
+	size_t i = 0;
+	size_t out = 0;
+	unsigned int n = initialN, bias = initialBias;
+	const char *end = CONST_PTR_OFFSET(input, len);
+	const char *delim = input;
+	const char *ptr;
+
+	if (len == 0)
+		return 0; /* an empty string decodes to an empty string */
+	t_array_init(&label, len);
+
+	/* Find the rightmost delimiter within the first len bytes. Without
+	   one there are no basic code points and delim stays at input. */
+	for (ptr = end; ptr > input; ptr--) {
+		if ((unsigned char)ptr[-1] == delimiter) {
+			delim = ptr - 1;
+			break;
+		}
+	}
+
+	for (ptr = input; ptr < delim; ptr++) {
+		if ((unsigned char)*ptr >= 0x80)
+			/* Has non-ascii input, this cannot be punycoded. */
+			return -1;
+		/* Add basic code points to label */
+		unichar_t ch = (unsigned char)*ptr;
+		array_push_back(&label, &ch);
+	}
+
+	out = array_count(&label);
+
+	/* Main decoding loop: start from after delimiter. Nothing may follow
+	   it ("abc-"): then the label has basic code points only. */
+	ptr = delim != input ? delim + 1 : input;
+	while (ptr < end) {
+		unsigned int oldi, w, k, digit, t;
+		/* Decode a generalized variable-length integer into delta,
+		   which gets added to i.  The overflow checking is easier if
+		   we increase i as we go, then subtract off its starting
+		   value at the end to obtain delta.  */
+
+		oldi = i;
+		w = 1;
+		k = base;
+
+		for (;;) {
+			/* The integer must terminate before the input does. */
+			if (ptr >= end)
+				return -1;
+			/* ptr points to next digit to decode */
+			digit = decode_digit(*ptr++);
+			if (digit >= base)
+				return -1;
+			if (digit > (UINT_MAX - i) / w)
+				return -1;
+			i += digit * w;
+			t = k <= bias ? tmin :
+				k >= bias + tmax ? tmax : k - bias;
+			if (digit < t)
+				break;
+			if (w > UINT_MAX / (base - t))
+				return -1;
+			w *= (base - t);
+			k += base;
+		}
+
+		bias = adapt(i - oldi, out + 1, oldi == 0);
+
+		/* i was supposed to wrap around from out+1 to 0, incrementing
+		   n each time, so we'll fix that now: */
+		if (i / (out + 1) > UINT_MAX - n)
+			return -1;
+
+		n += i / (out + 1);
+		i %= (out + 1);
+
+		/* Only Unicode scalar values can be written out as UTF-8. */
+		if (n < initialN || n > 0x10FFFF ||
+		    (n >= 0xD800 && n <= 0xDFFF))
+			return -1;
+
+		/* Insert n at position i of the output (i <= out here): */
+		out++;
+		array_insert(&label, i, &n, 1);
+		i++;
+	}
+
+	uni_ucs4_to_utf8(array_front(&label), out, output);
+	return 0;
+}
diff --git a/src/lib/punycode.h b/src/lib/punycode.h
new file mode 100644
index 0000000000..40adf3b78f
--- /dev/null
+++ b/src/lib/punycode.h
@@ -0,0 +1,8 @@
+#ifndef PUNYCODE_H
+#define PUNYCODE_H
+
+/* Decode the punycode-encoded input and append the result to output as
+   UTF-8. Returns 0 on success and -1 on failure. */
+int punycode_decode(const char *input, size_t len, string_t *output);
+
+#endif
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc
index 59e7042850..34ec63225b 100644
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -88,6 +88,7 @@ TEST(test_primes)
 TEST(test_printf_format_fix)
 FATAL(fatal_printf_format_fix)
 TEST(test_priorityq)
+TEST(test_punycode)
 TEST(test_random)
 FATAL(fatal_random)
 TEST(test_seq_range_array)
diff --git a/src/lib/test-punycode.c b/src/lib/test-punycode.c
new file mode 100644
index 0000000000..485a5cfa42
--- /dev/null
+++ b/src/lib/test-punycode.c
@@ -0,0 +1,46 @@
+/* Copyright (c) 2024 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "punycode.h"
+#include "str.h"
+
+static void test_punycode_decode(void)
+{
+	const struct test_case {
+		const char *in; /* punycode input */
+		const char *out; /* expected UTF-8 output */
+		int ret; /* expected return value */
+	} cases[] = {
+		/* has ASCII, appends */
+		{ .in = "gr-zia", .out = "\x67\x72\xc3\xa5", .ret = 0 },
+		/* has ASCII, inserts */
+		{ .in = "bl-yia", .out = "\x62\xc3\xa5\x6c", .ret = 0 },
+		/* has ASCII, inserts AND appends */
+		{ .in = "stlbl-nrad",
+		  .out = "\x73\x74\xc3\xa5\x6c\x62\x6c\xc3\xa5", .ret = 0 },
+		/* only ASCII is a hyphen, appends */
+		{ .in = "--7sbabjsrp6aymef",
+		  .out = "\xd0\xb0\xd0\xba\xd1\x82\xd1\x80\xd0\xb8\xd1\x81\xd0"
+			 "\xb0\x2d\xd0\xb2\xd0\xb5\xd1\x81\xd0\xbd\xd0\xb0",
+		  .ret = 0 },
+		/* broken: must fail without producing any output */
+		{ .in = "zz-zzzz", .out = "", .ret = -1 },
+	};
+
+	unsigned int i;
+	string_t *r = t_str_new(42);
+
+	test_begin("punycode decoding");
+	for (i = 0; i < N_ELEMENTS(cases); i ++) {
+		str_truncate(r, 0); /* start every case with an empty buffer */
+		int ret = punycode_decode(cases[i].in, strlen(cases[i].in), r);
+		test_assert_idx(ret == cases[i].ret, i);
+		test_assert_strcmp_idx(str_c(r), cases[i].out, i);
+	}
+	test_end();
+}
+
+void test_punycode(void)
+{
+	test_punycode_decode(); /* the decoder test vectors */
+}