lib: Add punycode decoder

author Aki Tuomi <aki.tuomi@open-xchange.com>

Wed, 26 Jun 2024 19:44:32 +0000 (22:44 +0300)

committer aki.tuomi <aki.tuomi@open-xchange.com>

Fri, 28 Jun 2024 09:48:14 +0000 (09:48 +0000)
author Aki Tuomi <aki.tuomi@open-xchange.com>
Wed, 26 Jun 2024 19:44:32 +0000 (22:44 +0300)
committer aki.tuomi <aki.tuomi@open-xchange.com>
Fri, 28 Jun 2024 09:48:14 +0000 (09:48 +0000)
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am

index 26158edb1550f59e2767026a8426bbbd3a449041..a87d7b7415a4a3a6af1c0a06cefc5043588dd738 100644 (file)
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -161,6 +161,7 @@ liblib_la_SOURCES = \
         process-stat.c \
         process-title.c \
         priorityq.c \
+       punycode.c \
         randgen.c \
         rand.c \
         read-full.c \
@@ -321,6 +322,7 @@ headers = \
         process-stat.h \
         process-title.h \
         priorityq.h \
+       punycode.h \
         randgen.h \
         read-full.h \
         restrict-access.h \
@@ -445,6 +447,7 @@ test_lib_SOURCES = \
         test-primes.c \
         test-printf-format-fix.c \
         test-priorityq.c \
+       test-punycode.c \
         test-random.c \
         test-seq-range-array.c \
         test-seq-set-builder.c \
diff --git a/src/lib/punycode.c b/src/lib/punycode.c

new file mode 100644 (file)

index 0000000..b9a802d
--- /dev/null
+++ b/src/lib/punycode.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2024 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "str.h"
+#include "unichar.h"
+#include "punycode.h"
+
+/* Bootstring parameters for Punycode. These are shared by decode_digit(),
+   adapt() and punycode_decode() below. */
+
+static const unsigned int base = 36; /* number of digit values (a-z, 0-9); also returned by decode_digit() for a non-digit */
+static const unsigned int tmin = 1; /* lowest value the per-digit threshold t can take */
+static const unsigned int tmax = 26; /* highest value the per-digit threshold t can take */
+static const unsigned int skew = 38; /* term in the denominator of the bias formula in adapt() */
+static const unsigned int damp = 700; /* divisor applied to the very first delta in adapt() */
+static const unsigned int initialBias = 72; /* bias used before the first adapt() call */
+static const unsigned int initialN = 0x80; /* starting value of the decoded code point counter n */
+static const unsigned int delimiter = u'-'; /* the last one in the input ends the literal ASCII part */
+
+/*
+   Map one input byte to its digit value:
+
+      code points    digit-values
+      ------------   ----------------------
+      41..5A (A-Z) =  0 to 25, respectively
+      61..7A (a-z) =  0 to 25, respectively
+      30..39 (0-9) = 26 to 35, respectively
+
+   Letters are case-insensitive. Any other byte is not a digit and yields
+   `base`, which is one more than the largest valid digit value, so callers
+   can detect it with `digit >= base`.
+*/
+static inline unsigned int decode_digit(unsigned char cp)
+{
+       if (cp >= 'a' && cp <= 'z')
+               return cp - 'a';
+       if (cp >= 'A' && cp <= 'Z')
+               return cp - 'A';
+       if (cp >= '0' && cp <= '9')
+               return 26 + (cp - '0');
+       return base;
+}
+
+/* Bias adaptation function: computes the bias to use for the next
+   variable-length integer from the delta that was just decoded.
+
+   delta     - the value just decoded (i - oldi in the caller)
+   numpoints - number of code points in the output including the one being
+               inserted; the caller always passes at least 1
+   firsttime - TRUE for the first delta, which is scaled down by `damp`
+               instead of being halved */
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, bool firsttime)
+{
+       const unsigned int threshold = ((base - tmin) * tmax) / 2;
+       unsigned int k = 0;
+
+       if (firsttime)
+               delta /= damp;
+       else
+               delta /= 2;
+       delta += delta / numpoints;
+
+       /* Shrink delta until it is within the threshold, counting in k how
+          many digits' worth (in steps of base) were divided away. */
+       while (delta > threshold) {
+               delta /= base - tmin;
+               k += base;
+       }
+
+       return k + (base - tmin + 1) * delta / (delta + skew);
+}
+
+/* Decodes the Punycode in input[0..len-1] and appends the result to output
+   as UTF-8.
+
+   Everything before the last '-' is the literal part and is copied as is;
+   everything after it is a sequence of variable-length integers, each of
+   which says which code point to insert and at which position. If there is
+   no '-', the whole input is parsed as encoded integers. A '-' that occurs
+   only as the very first byte does not act as a delimiter; it is parsed as
+   a digit and therefore rejected.
+
+   Only the first len bytes are examined, so input does not need to be
+   NUL-terminated. An empty input decodes to nothing, and a trailing
+   delimiter ("abc-") decodes to just the literal part.
+
+   Returns 0 on success. Returns -1, without touching output, when the input
+   is not valid: a NUL or non-ASCII byte in the literal part, a byte that is
+   not a digit in the encoded part, an integer cut short by the end of the
+   input, arithmetic overflow, or a decoded value that is not a Unicode
+   scalar value. */
+int punycode_decode(const char *input, size_t len, string_t *output)
+{
+       ARRAY(unichar_t) label;
+       size_t i = 0;
+       size_t out = 0;
+       unsigned int n = initialN, bias = initialBias;
+       const char *end = CONST_PTR_OFFSET(input, len);
+       const char *delim = input;
+       const char *ptr;
+
+       /* Nothing to decode. Handled up front so that the array is never
+          empty when array_front() is used at the end. */
+       if (len == 0)
+               return 0;
+       t_array_init(&label, len);
+
+       /* Find the rightmost delimiter within the first len bytes. delim
+          stays at the start of the input if there is none. */
+       for (ptr = end; ptr > input; ptr--) {
+               if ((unsigned char)ptr[-1] == delimiter) {
+                       delim = ptr - 1;
+                       break;
+               }
+       }
+
+       /* Copy the literal part verbatim. */
+       for (ptr = input; ptr < delim; ptr++) {
+               unichar_t ch = (unsigned char)*ptr;
+
+               /* Non-ASCII input cannot be punycoded. NUL is refused as
+                  well: it cannot occur inside a C string and would end up
+                  embedded in the output. */
+               if (ch == 0 || ch >= 0x80)
+                       return -1;
+               array_push_back(&label, &ch);
+       }
+       out = array_count(&label);
+
+       /* Main decoding loop: start from after the delimiter, or from the
+          beginning if there was no literal part. */
+       ptr = delim != input ? delim + 1 : input;
+
+       while (ptr < end) {
+               unsigned int oldi, w, k, digit, t;
+               unichar_t ch;
+
+               /* Decode a generalized variable-length integer into delta,
+                  which gets added to i.  The overflow checking is easier if
+                  we increase i as we go, then subtract off its starting
+                  value at the end to obtain delta.  */
+               oldi = i;
+               w = 1;
+
+               for (k = base;; k += base) {
+                       /* The previous digit said more digits follow, but
+                          the input ended: truncated integer. */
+                       if (ptr >= end)
+                               return -1;
+                       digit = decode_digit(*ptr++);
+                       if (digit >= base)
+                               return -1;
+                       if (digit > (UINT_MAX - i) / w)
+                               return -1;
+                       i += digit * w;
+                       t = k <= bias ? tmin :
+                               k >= bias + tmax ? tmax : k - bias;
+                       /* A digit below the threshold is the last one of
+                          this integer. */
+                       if (digit < t)
+                               break;
+                       if (w > UINT_MAX / (base - t))
+                               return -1;
+                       w *= (base - t);
+               }
+
+               bias = adapt(i - oldi, out + 1, oldi == 0);
+
+               /* i was supposed to wrap around from out+1 to 0, incrementing
+                  n each time, so we'll fix that now: */
+               if (i / (out + 1) > UINT_MAX - n)
+                       return -1;
+               n += i / (out + 1);
+               i %= (out + 1);
+
+               /* Values above U+10FFFF and UTF-16 surrogates are not
+                  Unicode scalar values and have no well-formed UTF-8
+                  encoding. n never drops below initialN because it only
+                  grows and the addition above is overflow-checked. */
+               if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDFFF))
+                       return -1;
+
+               /* Insert n at position i of the output. After the modulo
+                  above i is always within 0..out. */
+               ch = n;
+               array_insert(&label, i, &ch, 1);
+               out++;
+               i++;
+       }
+
+       uni_ucs4_to_utf8(array_front(&label), out, output);
+       return 0;
+}
diff --git a/src/lib/punycode.h b/src/lib/punycode.h

new file mode 100644 (file)

index 0000000..40adf3b
--- /dev/null
+++ b/src/lib/punycode.h
@@ -0,0 +1,8 @@
+#ifndef PUNYCODE_H
+#define PUNYCODE_H
+
+/* Parse input as a punycode-encoded string and append the decoded text,
+   converted to UTF-8, to output. len is the length of input in bytes.
+   The input is expected to be the bare encoded string; the unit tests pass
+   e.g. "gr-zia", without any "xn--" prefix.
+   Returns 0 on success and -1 on failure. When -1 is returned nothing has
+   been appended to output. */
+int punycode_decode(const char *input, size_t len, string_t *output);
+
+#endif
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc

index 59e70428505ff5eddb9f788e012e82a6259e4d1c..34ec63225bdfe126fd718f75fd136775831a6ae8 100644 (file)
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -88,6 +88,7 @@ TEST(test_primes)
  TEST(test_printf_format_fix)
  FATAL(fatal_printf_format_fix)
  TEST(test_priorityq)
+TEST(test_punycode)
  TEST(test_random)
  FATAL(fatal_random)
  TEST(test_seq_range_array)
diff --git a/src/lib/test-punycode.c b/src/lib/test-punycode.c

new file mode 100644 (file)

index 0000000..485a5cf
--- /dev/null
+++ b/src/lib/test-punycode.c
@@ -0,0 +1,46 @@
+/* Copyright (c) 2024 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "punycode.h"
+#include "str.h"
+
+/* Table-driven test for punycode_decode(). Each case gives the encoded
+   input, the UTF-8 bytes expected in the output string and the expected
+   return value. Failing cases expect an empty output, which checks that
+   nothing is appended when decoding fails. */
+static void test_punycode_decode(void)
+{
+       const struct test_case {
+               const char *in;
+               const char *out;
+               int ret;
+       } cases[] = {
+               /* has ASCII, appends */
+               { .in = "gr-zia", .out = "\x67\x72\xc3\xa5", .ret = 0 },
+               /* has ASCII, inserts */
+               { .in = "bl-yia", .out = "\x62\xc3\xa5\x6c", .ret = 0 },
+               /* has ASCII, inserts AND appends */
+               { .in = "stlbl-nrad",
+                 .out = "\x73\x74\xc3\xa5\x6c\x62\x6c\xc3\xa5", .ret = 0 },
+               /* has no ASCII, appends */
+               { .in = "--7sbabjsrp6aymef",
+                 .out = "\xd0\xb0\xd0\xba\xd1\x82\xd1\x80\xd0\xb8\xd1\x81\xd0"
+                        "\xb0\x2d\xd0\xb2\xd0\xb5\xd1\x81\xd0\xbd\xd0\xb0",
+                 .ret = 0 },
+               /* broken */
+               { .in = "zz-zzzz", .out = "", .ret = -1 },
+               /* broken: non-ASCII bytes before the delimiter */
+               { .in = "gr\xc3\xa5-zia", .out = "", .ret = -1 },
+               /* broken: input ends in the middle of an encoded integer */
+               { .in = "gr-zi", .out = "", .ret = -1 },
+       };
+
+       unsigned int i;
+       string_t *r = t_str_new(42);
+
+       test_begin("punycode decoding");
+       for (i = 0; i < N_ELEMENTS(cases); i++) {
+               /* start every case from an empty output string */
+               str_truncate(r, 0);
+               int ret = punycode_decode(cases[i].in, strlen(cases[i].in), r);
+               test_assert_idx(ret == cases[i].ret, i);
+               test_assert_strcmp_idx(str_c(r), cases[i].out, i);
+       }
+       test_end();
+}
+
+void test_punycode(void) /* listed as TEST(test_punycode) in test-lib.inc */
+{
+       test_punycode_decode(); /* the only punycode test in this file */
+}
author	Aki Tuomi <aki.tuomi@open-xchange.com>
	Wed, 26 Jun 2024 19:44:32 +0000 (22:44 +0300)
committer	aki.tuomi <aki.tuomi@open-xchange.com>
	Fri, 28 Jun 2024 09:48:14 +0000 (09:48 +0000)
src/lib/Makefile.am		patch \| blob \| blame \| history
src/lib/punycode.c	[new file with mode: 0644]	patch \| blob
src/lib/punycode.h	[new file with mode: 0644]	patch \| blob
src/lib/test-lib.inc		patch \| blob \| blame \| history
src/lib/test-punycode.c	[new file with mode: 0644]	patch \| blob