From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Fri, 21 Mar 2025 03:26:50 +0000 (+0100)
Subject: lib: Start new Unicode Character Database implementation
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e19cfbf8f8c346262bb39fe253d63c4763c350f6;p=thirdparty%2Fdovecot%2Fcore.git

lib: Start new Unicode Character Database implementation

It uses a pre-compiled trie structure and will in later commits feature full
support for the data necessary for Unicode normalization. Stuff needed for
lib-language can be migrated here as well.
---

diff --git a/.gitignore b/.gitignore
index c1158e9ebf..84e994bc96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,6 +101,10 @@ src/lib/event-filter-lexer.c
 src/lib/event-filter-parser.c
 src/lib/event-filter-parser.h
 src/lib/unicodemap.c
+src/lib/unicode-data-tables.c
+src/lib/unicode-data-tables.h
+src/lib/unicode-data-types.c
+src/lib/unicode-data-types.h
 src/lib-compression/bench-compression
 src/lib-language/PropList.txt
 src/lib-language/WordBreakProperty.txt
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index d0e2715984..1d98248ecf 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -4,6 +4,8 @@ AM_CPPFLAGS = \
 noinst_LTLIBRARIES = liblib.la
 
 BUILT_SOURCES = $(srcdir)/unicodemap.c \
+		$(srcdir)/unicode-data-tables.c \
+		$(srcdir)/unicode-data-tables.h \
 		event-filter-lexer.c \
 		event-filter-parser.c \
 		event-filter-parser.h
@@ -11,11 +13,17 @@ BUILT_SOURCES = $(srcdir)/unicodemap.c \
 UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
+	$(UCD_DIR)/DerivedCoreProperties.txt \
 	$(UCD_DIR)/UnicodeData.txt
 
-EXTRA_DIST = unicodemap.c unicodemap.pl $(UCD_FILES)
-EXTRA_CLEAN = unicodemap.c
-
+EXTRA_DIST = \
+	unicodemap.c \
+	unicode-data-tables.c \
+	unicode-data-tables.h \
+	unicodemap.pl \
+	unicode-ucd-compile.py \
+	$(UCD_FILES)
+EXTRA_CLEAN = unicodemap.c unicode-data-tables.c unicode-data-tables.h
 
 # Squelch autoconf error about using .[ly] sources but not defining $(LEX)
 # and $(YACC).  Using false here avoids accidental use.
@@ -37,11 +45,16 @@ YACC=/bin/false
 # dependency, anything including the header will race the bison process.
 event-filter-parser.h: event-filter-parser.c
 
+$(UCD_DIR)/DerivedCoreProperties.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
 $(UCD_DIR)/UnicodeData.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt
 
 $(srcdir)/unicodemap.c: $(srcdir)/unicodemap.pl $(UCD_DIR)/UnicodeData.txt
 	$(AM_V_GEN)$(PERL) $(srcdir)/unicodemap.pl < $(UCD_DIR)/UnicodeData.txt > $@
+$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h &: \
+	$(srcdir)/unicode-ucd-compile.py $(UCD_FILES)
+	$(AM_V_GEN)$(PYTHON) $(srcdir)/unicode-ucd-compile.py $(UCD_DIR) $(srcdir)
 
 liblib_la_LIBADD = $(LIBUNWIND_LIBS)
 liblib_la_SOURCES = \
@@ -200,6 +213,8 @@ liblib_la_SOURCES = \
 	unlink-directory.c \
 	unlink-old-files.c \
 	unichar.c \
+	unicode-data-tables.c \
+	unicode-data.c \
 	uri-util.c \
 	utc-offset.c \
 	utc-mktime.c \
@@ -360,6 +375,9 @@ headers = \
 	unlink-directory.h \
 	unlink-old-files.h \
 	unichar.h \
+	unicode-data-static.h \
+	unicode-data-tables.h \
+	unicode-data.h \
 	uri-util.h \
 	utc-offset.h \
 	utc-mktime.h \
@@ -370,7 +388,8 @@ test_programs = test-lib
 noinst_PROGRAMS = $(test_programs)
 
 test_lib_CPPFLAGS = \
-	-I$(top_srcdir)/src/lib-test
+	-I$(top_srcdir)/src/lib-test \
+	-DUCD_DIR=\"$(UCD_DIR)\"
 
 test_libs = \
 	../lib-test/libtest.la \
@@ -467,6 +486,7 @@ test_lib_SOURCES = \
 	test-str-table.c \
 	test-time-util.c \
 	test-unichar.c \
+	test-unicode-data.c \
 	test-utc-mktime.c \
 	test-uri.c \
 	test-wildcard-match.c
@@ -478,7 +498,7 @@ test_headers = \
 test_lib_LDADD = $(test_libs) -lm
 test_lib_DEPENDENCIES = $(test_libs)
 
-check-local:
+check-local: $(UCD_FILES)
 	for bin in $(test_programs); do \
 	  if ! $(RUN_TEST) ./$$bin; then exit 1; fi; \
 	done
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc
index 2fceca2f9b..d3a1f33669 100644
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -107,6 +107,7 @@ TEST(test_str_sanitize)
 TEST(test_str_table)
 TEST(test_time_util)
 TEST(test_unichar)
+TEST(test_unicode_data)
 TEST(test_uri)
 TEST(test_utc_mktime)
 TEST(test_wildcard_match)
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c
new file mode 100644
index 0000000000..b01b57b658
--- /dev/null
+++ b/src/lib/test-unicode-data.c
@@ -0,0 +1,169 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+#include "istream.h"
+#include "unicode-data.h"
+
+#include <fcntl.h>
+
+#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
+
+static void test_unicode_data_line(const char *line, unsigned int line_num)
+{
+	static uint32_t cp_first = 0;
+	/* cp_first keeps the ", First>" code point until the ", Last>" line. */
+	const char *const *columns = t_strsplit(line, ";");
+	if (str_array_length(columns) < 15) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u",
+			UCD_UNICODE_DATA_TXT, line_num));
+		return;
+	}
+
+	const char *cp_hex = columns[0];
+	uint32_t cp;
+
+	if (str_to_uint32_hex(cp_hex, &cp) < 0) {
+		test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad code point",
+				UCD_UNICODE_DATA_TXT, line_num));
+		return;
+	}
+
+	/* Parse Name */
+
+	const char *cp_name = columns[1];
+	size_t cp_name_len = strlen(cp_name);
+	const char *p;
+
+	if (cp_name[0] == '<' && cp_name[cp_name_len - 1] == '>') {
+		p = strchr(cp_name + 1, ',');
+		if (p != NULL) {
+			if (strcmp(p, ", First>") == 0) {
+				cp_first = cp;
+				return;
+			} else if (strcmp(p, ", Last>") != 0) {
+				test_failed(t_strdup_printf(
+					"Invalid data at %s:%u: "
+					"Bad code point range: %s",
+					UCD_UNICODE_DATA_TXT, line_num, cp_name));
+				return;
+			}
+		}
+	}
+
+	/* Parse General_Category */
+
+	uint8_t general_category =
+		(uint8_t)unicode_general_category_from_string(columns[2]);
+	if (general_category == UNICODE_GENERAL_CATEGORY_INVALID) {
+		test_failed(t_strdup_printf(
+			    "Invalid data at %s:%u: "
+			    "Bad General_Category for code point %"PRIu32": %s",
+			    UCD_UNICODE_DATA_TXT, line_num, cp, columns[2]));
+		return;
+	}
+	test_assert(!unicode_general_category_is_group(general_category));
+
+	/* Parse Simple_*case_Mapping */
+
+	uint32_t simple_uppercase_mapping = 0;
+	uint32_t simple_lowercase_mapping = 0;
+	uint32_t simple_titlecase_mapping = 0;
+
+	if (*columns[12] != '\0' &&
+	    str_to_uint32_hex(columns[12], &simple_uppercase_mapping) < 0) {
+		test_failed(t_strdup_printf(
+			    "Invalid data at %s:%u: "
+			    "Bad Simple_Uppercase_Mapping for code point %"PRIu32": %s",
+			    UCD_UNICODE_DATA_TXT, line_num, cp, columns[12]));
+		return;
+	}
+	if (*columns[13] != '\0' &&
+	    str_to_uint32_hex(columns[13], &simple_lowercase_mapping) < 0) {
+		test_failed(t_strdup_printf(
+			    "Invalid data at %s:%u: "
+			    "Bad Simple_Lowercase_Mapping for code point %"PRIu32": %s",
+			    UCD_UNICODE_DATA_TXT, line_num, cp, columns[13]));
+		return;
+	}
+	if (*columns[14] != '\0' &&
+	    str_to_uint32_hex(columns[14], &simple_titlecase_mapping) < 0) {
+		test_failed(t_strdup_printf(
+			    "Invalid data at %s:%u: "
+			    "Bad Simple_Titlecase_Mapping for code point %"PRIu32": %s",
+			    UCD_UNICODE_DATA_TXT, line_num, cp, columns[14]));
+		return;
+	}
+
+	/* Check data */
+	/* NOTE(review): upper/lowercase mappings are parsed but not compared. */
+	uint32_t cp_last = cp;
+
+	if (cp_first == 0)
+		cp_first = cp;
+	for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+
+		test_assert_idx(
+			cp_data->general_category == general_category, cp);
+
+		test_assert_idx(
+			cp_data->simple_titlecase_mapping == simple_titlecase_mapping,
+			cp);
+	}
+	/* Reset so that the next line is treated as a single code point. */
+	cp_first = 0;
+}
+
+static void
+test_ucd_file(const char *filename,
+	      void (*test_line)(const char *line, unsigned int line_num))
+{
+	const char *file_path = t_strconcat(UCD_DIR, "/", filename, NULL);
+	struct istream *input;
+	int fd;
+	/* Feed each data line of UCD_DIR/filename to test_line() as one test. */
+	fd = open(file_path, O_RDONLY);
+	if (fd < 0)
+		i_fatal("Failed to open '%s': %m", file_path);
+
+	test_begin(t_strdup_printf("unicode_data - %s", filename));
+
+	input = i_stream_create_fd_autoclose(&fd, 1024);
+
+	unsigned int line_num = 0;
+	/* Stop reading as soon as a test assertion has failed. */
+	while (!test_has_failed()) {
+		char *line = i_stream_read_next_line(input);
+
+		if (line == NULL)
+			break;
+		line_num++;
+		/* Cut off a trailing "# ..." comment; skip lines left empty. */
+		char *comment = strchr(line, '#');
+
+		if (comment != NULL)
+			*comment = '\0';
+		if (*line == '\0')
+			continue;
+
+		T_BEGIN {
+			test_line(line, line_num);
+		} T_END;
+	}
+
+	i_stream_destroy(&input);
+	test_end();
+}
+
+void test_unicode_data(void)
+{
+	/* Check that UCD data files match with what is compiled. */
+	test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
+}
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h
new file mode 100644
index 0000000000..0258548985
--- /dev/null
+++ b/src/lib/unicode-data-static.h
@@ -0,0 +1,102 @@
+#ifndef UNICODE_DATA_STATIC_H
+#define UNICODE_DATA_STATIC_H
+
+/* UAX #44, Section 5.7.1: General Category Values
+ */
+enum unicode_general_category {
+	UNICODE_GENERAL_CATEGORY_INVALID = 0,
+
+	/* LC - Cased_Letter: Lu | Ll | Lt */
+	UNICODE_GENERAL_CATEGORY_LC = (1 << 4),
+	/* L - Letter: Lu | Ll | Lt | Lm | Lo */
+	UNICODE_GENERAL_CATEGORY_L = (1 << 5) | UNICODE_GENERAL_CATEGORY_LC,
+	/* M - Mark: Mn | Mc | Me */
+	UNICODE_GENERAL_CATEGORY_M = (2 << 5),
+	/* N - Number: Nd | Nl | No */
+	UNICODE_GENERAL_CATEGORY_N = (3 << 5),
+	/* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+	UNICODE_GENERAL_CATEGORY_P = (4 << 5),
+	/* S - Symbol: Sm | Sc | Sk | So */
+	UNICODE_GENERAL_CATEGORY_S = (5 << 5),
+	/* Z - Separator: Zs | Zl | Zp */
+	UNICODE_GENERAL_CATEGORY_Z = (6 << 5),
+	/* C - Other: Cc | Cf | Cs | Co | Cn */
+	UNICODE_GENERAL_CATEGORY_C = (7 << 5),
+
+	UNICODE_GENERAL_CATEGORY_GROUP_MASK = (0xf0),
+
+	/* Lu - Uppercase_Letter */
+	UNICODE_GENERAL_CATEGORY_LU = UNICODE_GENERAL_CATEGORY_LC | 1,
+	/* Ll - Lowercase_Letter */
+	UNICODE_GENERAL_CATEGORY_LL = UNICODE_GENERAL_CATEGORY_LC | 2,
+	/* Lt - Titlecase_Letter */
+	UNICODE_GENERAL_CATEGORY_LT = UNICODE_GENERAL_CATEGORY_LC | 3,
+	/* Lm - Modifier_Letter */
+	UNICODE_GENERAL_CATEGORY_LM = UNICODE_GENERAL_CATEGORY_L | 4,
+	/* Lo - Other_Letter */
+	UNICODE_GENERAL_CATEGORY_LO = UNICODE_GENERAL_CATEGORY_L | 5,
+
+	/* Mn - Nonspacing_Mark */
+	UNICODE_GENERAL_CATEGORY_MN = UNICODE_GENERAL_CATEGORY_M | 1,
+	/* Mc - Spacing_Mark */
+	UNICODE_GENERAL_CATEGORY_MC = UNICODE_GENERAL_CATEGORY_M | 2,
+	/* Me - Enclosing_Mark */
+	UNICODE_GENERAL_CATEGORY_ME = UNICODE_GENERAL_CATEGORY_M | 3,
+
+	/* Nd - Decimal_Number */
+	UNICODE_GENERAL_CATEGORY_ND = UNICODE_GENERAL_CATEGORY_N | 1,
+	/* Nl - Letter_Number */
+	UNICODE_GENERAL_CATEGORY_NL = UNICODE_GENERAL_CATEGORY_N | 2,
+	/* No - Other_Number */
+	UNICODE_GENERAL_CATEGORY_NO = UNICODE_GENERAL_CATEGORY_N | 3,
+
+	/* Pc -	Connector_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PC = UNICODE_GENERAL_CATEGORY_P | 1,
+	/* Pd - Dash_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PD = UNICODE_GENERAL_CATEGORY_P | 2,
+	/* Ps - Open_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PS = UNICODE_GENERAL_CATEGORY_P | 3,
+	/* Pe - Close_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PE = UNICODE_GENERAL_CATEGORY_P | 4,
+	/* Pi - Initial_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PI = UNICODE_GENERAL_CATEGORY_P | 5,
+	/* Pf - Final_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PF = UNICODE_GENERAL_CATEGORY_P | 6,
+	/* Po - Other_Punctuation */
+	UNICODE_GENERAL_CATEGORY_PO = UNICODE_GENERAL_CATEGORY_P | 7,
+
+	/* Sm - Math_Symbol */
+	UNICODE_GENERAL_CATEGORY_SM = UNICODE_GENERAL_CATEGORY_S | 1,
+	/* Sc - Currency_Symbol */
+	UNICODE_GENERAL_CATEGORY_SC = UNICODE_GENERAL_CATEGORY_S | 2,
+	/* Sk - Modifier_Symbol */
+	UNICODE_GENERAL_CATEGORY_SK = UNICODE_GENERAL_CATEGORY_S | 3,
+	/* So -	Other_Symbol */
+	UNICODE_GENERAL_CATEGORY_SO = UNICODE_GENERAL_CATEGORY_S | 4,
+
+	/* Zs - Space_Separator */
+	UNICODE_GENERAL_CATEGORY_ZS = UNICODE_GENERAL_CATEGORY_Z | 1,
+	/* Zl - Line_Separator */
+	UNICODE_GENERAL_CATEGORY_ZL = UNICODE_GENERAL_CATEGORY_Z | 2,
+	/* Zp - Paragraph_Separator */
+	UNICODE_GENERAL_CATEGORY_ZP = UNICODE_GENERAL_CATEGORY_Z | 3,
+
+	/* Cc - Control */
+	UNICODE_GENERAL_CATEGORY_CC = UNICODE_GENERAL_CATEGORY_C | 1,
+	/* Cf - Format */
+	UNICODE_GENERAL_CATEGORY_CF = UNICODE_GENERAL_CATEGORY_C | 2,
+	/* Cs - Surrogate */
+	UNICODE_GENERAL_CATEGORY_CS = UNICODE_GENERAL_CATEGORY_C | 3,
+	/* Co - Private_Use */
+	UNICODE_GENERAL_CATEGORY_CO = UNICODE_GENERAL_CATEGORY_C | 4,
+	/* Cn - Unassigned */
+	UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
+};
+
+struct unicode_code_point_data {
+	uint8_t general_category; // Not yet used
+
+	uint32_t simple_titlecase_mapping;
+};
+
+#endif
diff --git a/src/lib/unicode-data.c b/src/lib/unicode-data.c
new file mode 100644
index 0000000000..31e49903d5
--- /dev/null
+++ b/src/lib/unicode-data.c
@@ -0,0 +1,172 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+
+/* Parse a General_Category alias: "Lu", or a group such as "L" or "LC". NULL,
+   empty, long or unknown aliases yield UNICODE_GENERAL_CATEGORY_INVALID. */
+uint8_t unicode_general_category_from_string(const char *str)
+{
+	if (str == NULL || strlen(str) > 2)
+		return UNICODE_GENERAL_CATEGORY_INVALID;
+
+	switch (str[0]) {
+	case 'L':
+		switch (str[1]) {
+		/* Lu - Uppercase_Letter */
+		case 'u':
+			return UNICODE_GENERAL_CATEGORY_LU;
+		/* Ll - Lowercase_Letter */
+		case 'l':
+			return UNICODE_GENERAL_CATEGORY_LL;
+		/* Lt - Titlecase_Letter */
+		case 't':
+			return UNICODE_GENERAL_CATEGORY_LT;
+		/* LC - Cased_Letter: Lu | Ll | Lt */
+		case 'C':
+			return UNICODE_GENERAL_CATEGORY_LC;
+		/* Lm - Modifier_Letter */
+		case 'm':
+			return UNICODE_GENERAL_CATEGORY_LM;
+		/* Lo - Other_Letter */
+		case 'o':
+			return UNICODE_GENERAL_CATEGORY_LO;
+		/* L - Letter: Lu | Ll | Lt | Lm | Lo */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_L;
+		default:
+			break;
+		}
+		break;
+	case 'M':
+		switch (str[1]) {
+		/* Mn - Nonspacing_Mark */
+		case 'n':
+			return UNICODE_GENERAL_CATEGORY_MN;
+		/* Mc - Spacing_Mark */
+		case 'c':
+			return UNICODE_GENERAL_CATEGORY_MC;
+		/* Me - Enclosing_Mark */
+		case 'e':
+			return UNICODE_GENERAL_CATEGORY_ME;
+		/* M - Mark: Mn | Mc | Me */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_M;
+		default:
+			break;
+		}
+		break;
+	case 'N':
+		switch (str[1]) {
+		/* Nd - Decimal_Number */
+		case 'd':
+			return UNICODE_GENERAL_CATEGORY_ND;
+		/* Nl - Letter_Number */
+		case 'l':
+			return UNICODE_GENERAL_CATEGORY_NL;
+		/* No - Other_Number */
+		case 'o':
+			return UNICODE_GENERAL_CATEGORY_NO;
+		/* N - Number: Nd | Nl | No */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_N;
+		default:
+			break;
+		}
+		break;
+	case 'P':
+		switch (str[1]) {
+		/* Pc - Connector_Punctuation */
+		case 'c':
+			return UNICODE_GENERAL_CATEGORY_PC;
+		/* Pd - Dash_Punctuation */
+		case 'd':
+			return UNICODE_GENERAL_CATEGORY_PD;
+		/* Ps - Open_Punctuation */
+		case 's':
+			return UNICODE_GENERAL_CATEGORY_PS;
+		/* Pe - Close_Punctuation */
+		case 'e':
+			return UNICODE_GENERAL_CATEGORY_PE;
+		/* Pi - Initial_Punctuation */
+		case 'i':
+			return UNICODE_GENERAL_CATEGORY_PI;
+		/* Pf - Final_Punctuation */
+		case 'f':
+			return UNICODE_GENERAL_CATEGORY_PF;
+		/* Po - Other_Punctuation */
+		case 'o':
+			return UNICODE_GENERAL_CATEGORY_PO;
+		/* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_P;
+		default:
+			break;
+		}
+		break;
+	case 'S':
+		switch (str[1]) {
+		/* Sm - Math_Symbol */
+		case 'm':
+			return UNICODE_GENERAL_CATEGORY_SM;
+		/* Sc - Currency_Symbol */
+		case 'c':
+			return UNICODE_GENERAL_CATEGORY_SC;
+		/* Sk - Modifier_Symbol */
+		case 'k':
+			return UNICODE_GENERAL_CATEGORY_SK;
+		/* So - Other_Symbol */
+		case 'o':
+			return UNICODE_GENERAL_CATEGORY_SO;
+		/* S - Symbol: Sm | Sc | Sk | So */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_S;
+		default:
+			break;
+		}
+		break;
+	case 'Z':
+		switch (str[1]) {
+		/* Zs - Space_Separator */
+		case 's':
+			return UNICODE_GENERAL_CATEGORY_ZS;
+		/* Zl - Line_Separator */
+		case 'l':
+			return UNICODE_GENERAL_CATEGORY_ZL;
+		/* Zp - Paragraph_Separator */
+		case 'p':
+			return UNICODE_GENERAL_CATEGORY_ZP;
+		/* Z - Separator: Zs | Zl | Zp */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_Z;
+		default:
+			break;
+		}
+		break;
+	case 'C':
+		switch (str[1]) {
+		/* Cc - Control */
+		case 'c':
+			return UNICODE_GENERAL_CATEGORY_CC;
+		/* Cf - Format */
+		case 'f':
+			return UNICODE_GENERAL_CATEGORY_CF;
+		/* Cs - Surrogate */
+		case 's':
+			return UNICODE_GENERAL_CATEGORY_CS;
+		/* Co - Private_Use */
+		case 'o':
+			return UNICODE_GENERAL_CATEGORY_CO;
+		/* Cn - Unassigned */
+		case 'n':
+			return UNICODE_GENERAL_CATEGORY_CN;
+		/* C - Other: Cc | Cf | Cs | Co | Cn */
+		case '\0':
+			return UNICODE_GENERAL_CATEGORY_C;
+		default:
+			break;
+		}
+		break;
+	}
+	return UNICODE_GENERAL_CATEGORY_INVALID;
+}
diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h
new file mode 100644
index 0000000000..6b156f1b30
--- /dev/null
+++ b/src/lib/unicode-data.h
@@ -0,0 +1,29 @@
+#ifndef UNICODE_DATA_H
+#define UNICODE_DATA_H
+
+#include "unicode-data-tables.h"
+
+static inline bool
+unicode_general_category_is_group(enum unicode_general_category gencat)
+{
+	return ((gencat & 0x0f) == 0x00); /* zero low nibble: group, or INVALID */
+}
+
+static inline const struct unicode_code_point_data *
+unicode_code_point_get_data(uint32_t cp)
+{
+	unsigned int idx8 = cp >> 24;
+	unsigned int blk16 = unicode_code_points_index8[idx8];
+	unsigned int idx16 = (blk16 << 8) + ((cp >> 16) & 0xFF);
+	unsigned int blk24 = unicode_code_points_index16[idx16];
+	unsigned int idx24 = (blk24 << 8) + ((cp >> 8) & 0xFF);
+	unsigned int blk32 = unicode_code_points_index24[idx24];
+	unsigned int idx32 = (blk32 << 8) + (cp & 0xFF);
+	unsigned int idxcp = unicode_code_points_index32[idx32];
+	/* Four table lookups, one per byte of cp, most significant first. */
+	return &unicode_code_points[idxcp];
+}
+
+uint8_t unicode_general_category_from_string(const char *str);
+
+#endif
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py
new file mode 100755
index 0000000000..acd3653983
--- /dev/null
+++ b/src/lib/unicode-ucd-compile.py
@@ -0,0 +1,805 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Dovecot authors, see the included COPYING file
+
+import argparse
+import bisect
+import copy
+import re
+import sys
+from pathlib import Path
+
+
+source_files = []
+
+ud_codepoints = []
+ud_codepoints_first = []
+ud_codepoints_last = []
+ud_codepoints_index = {}
+
+ud_codepoints_index8 = {}
+ud_codepoints_index16 = {}
+ud_codepoints_index16_reused = {}
+ud_codepoints_index16_offsets = {}
+ud_codepoints_index24 = {}
+ud_codepoints_index24_reused = {}
+ud_codepoints_index24_offsets = {}
+ud_codepoints_index32 = {}
+ud_codepoints_index32_reused = {}
+ud_codepoints_index32_offsets = {}
+ud_codepoints_index16_blocks = 1
+ud_codepoints_index24_blocks = 2
+ud_codepoints_index32_blocks = 2
+
+
+class UCDFileOpen:
+    """Context manager that opens a UCD file and records it as a source."""
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def __enter__(self):
+        path = ucd_dir + "/" + self.filename
+        self.fd = open(path, mode="r", encoding="utf-8")
+        source_files.append(self.filename)
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        self.fd.close()
+
+    def __str__(self):
+        return self.filename
+
+
+class CodePointData:
+    """Attribute bag holding the properties parsed for a code point range."""
+    def mergeFrom(self, data, default=False):
+        """Copy data's attributes here; default=True keeps existing ones."""
+        for name in dir(data):
+            value = getattr(data, name)
+            if callable(value) or name.startswith("__"):
+                continue
+            if not (default and hasattr(self, name)):
+                setattr(self, name, value)
+
+
+class CodePointRange:
+    """Code point range [cp_first, cp_last] that shares one CodePointData.
+
+    Creating a range registers it in the sorted global ud_codepoints list,
+    splitting and merging with existing ranges that overlap it.
+    """
+
+    def insert(self, n):
+        """Insert this range at position n of the range and bound lists."""
+        ud_codepoints.insert(n, self)
+        ud_codepoints_first.insert(n, self.cp_first)
+        ud_codepoints_last.insert(n, self.cp_last)
+
+    def modify(self, n):
+        """Refresh the bound lists at position n after changing bounds."""
+        ud_codepoints_first[n] = self.cp_first
+        ud_codepoints_last[n] = self.cp_last
+
+    def __new__(cls, cp_first, cp_last, data, default=False):
+        """Register [cp_first, cp_last] carrying data and return a range
+        object covering part of it. With default=True, data only supplies
+        attributes that overlapping existing ranges do not have yet."""
+        cprn_first = None
+
+        if len(ud_codepoints) == 0:
+            cprn = super().__new__(cls)
+            cprn.cp_first = cp_first
+            cprn.cp_last = cp_last
+            cprn.data = data
+            cprn.insert(0)
+            return cprn  # like every other path, hand back the range
+
+        idx_first = bisect.bisect_left(ud_codepoints_first, cp_first)
+        idx_last = bisect.bisect_right(ud_codepoints_last, cp_last)
+        rng_first = max(idx_first - 1, 0)  # -1 would index the last range
+        rng_last = idx_last + 1
+        if rng_last >= len(ud_codepoints):
+            rng_last = len(ud_codepoints) - 1
+
+        # Check existing ranges
+        nn = None
+        n = rng_first
+        while n <= rng_last:
+            cpr = ud_codepoints[n]
+            pos = n
+            n += 1
+
+            # No overlap with this range
+            if cp_last < cpr.cp_first or cp_first > cpr.cp_last:
+                continue
+            # Exact match
+            if cp_first == cpr.cp_first and cp_last == cpr.cp_last:
+                cpr.data.mergeFrom(data, default)
+                return cpr
+            # New range fully envelops existing
+            if cp_first <= cpr.cp_first and cp_last >= cpr.cp_last:
+                # Split off range before
+                if cp_first < cpr.cp_first:
+                    cprn = super().__new__(cls)
+                    cprn.cp_first = cp_first
+                    cprn.cp_last = cpr.cp_first - 1
+                    cprn.data = copy.deepcopy(data)
+                    cprn.insert(pos)
+                    pos += 1  # cpr moved up one slot by the insert above
+                    rng_last += 1
+                    if cprn_first is None:
+                        cprn_first = cprn
+                # Merge with existing
+                cpr.data.mergeFrom(data, default)
+                # Split off range after
+                if cp_last > cpr.cp_last:
+                    cp_first = cpr.cp_last + 1
+                    nn = pos + 1
+                    continue
+                return cprn_first  # fully handled; do not insert it again
+            # New range fully enveloped by existing
+            if cp_first > cpr.cp_first and cp_last < cpr.cp_last:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_last + 1
+                cprn.cp_last = cpr.cp_last
+                cprn.data = copy.deepcopy(cpr.data)  # do not alias head's data
+                cprn.insert(pos + 1)
+                rng_last += 1
+                cpr.cp_last = cp_first - 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                return cprn
+            # New range aligns with beginning of existing
+            if cp_first == cpr.cp_first and cp_last < cpr.cp_last:
+                cpr.cp_first = cp_last + 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos)
+                rng_last += 1
+                return cprn
+            # New range aligns with end of existing
+            if cp_first > cpr.cp_first and cp_last == cpr.cp_last:
+                cpr.cp_last = cp_first - 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                return cprn
+            # New range crosses the beginning of existing
+            if cp_first < cpr.cp_first and cp_last >= cpr.cp_first:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cpr.cp_first - 1
+                cprn.data = data
+                cprn.insert(pos)
+                rng_last += 1
+                cprn = super().__new__(cls)
+                cprn.cp_first = cpr.cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                cpr.cp_first = cp_last + 1
+                cpr.modify(pos + 2)
+                return cprn
+            # New range crosses the end of existing
+            if cp_first <= cpr.cp_last and cp_last > cpr.cp_last:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cpr.cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)  # overlap goes after the shortened cpr
+                rng_last += 1
+                if cprn_first is None:
+                    cprn_first = cprn
+                tmp = cp_first
+                cp_first = cpr.cp_last + 1
+                cpr.cp_last = tmp - 1
+                cpr.modify(pos)
+                nn = pos + 2  # remainder belongs after the overlap piece
+                continue
+
+        cprn = super().__new__(cls)
+        cprn.cp_first = cp_first
+        cprn.cp_last = cp_last
+        cprn.data = data
+        if nn is None:
+            cprn.insert(idx_first)
+        else:
+            cprn.insert(nn)
+        if cprn_first is None:
+            cprn_first = cprn
+
+        return cprn_first
+
+
+def die(message):
+    """Report message on stderr, prefixed by this script's name; exit 1."""
+    print("%s: %s" % (Path(__file__).name, message), file=sys.stderr)
+    sys.exit(1)
+
+
+def parse_cp_range(column):
+    """Parse a hex "XXXX" or "XXXX..YYYY" column into (first, last).
+
+    A single code point gives first == last; a blank column gives None.
+    """
+    text = column.strip()
+    if not text:
+        return None
+
+    bounds = text.split("..")
+    first = int(bounds[0].strip(), 16)
+    # Without a ".." part the range consists of just the one code point.
+    last = int(bounds[1].strip(), 16) if len(bounds) > 1 else first
+
+    return (first, last)
+
+
+def read_ucd_files():
+    global ud_decomposition_type_names  # NOTE(review): not used in this body
+    global ud_composition_exclusions  # NOTE(review): not used in this body
+    # Register one CodePointRange per UnicodeData.txt entry or First/Last pair.
+    # UnicodeData.txt
+    with UCDFileOpen("UnicodeData.txt") as ucd:
+        cp_range_first = None
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 15:
+                die(f"{ucd}:{line_num}: Missing columns")
+
+            # (0) Code point in hex
+
+            cp_first = cp_last = int(cols[0].strip(), 16)
+
+            # (1) Name
+
+            cp_name = cols[1].strip()
+
+            x = re.search("<([^>]*), (First|Last)>", cp_name)
+            if x:
+                if x.group(2) == "First":
+                    cp_range_first = cp_first
+                    continue
+                if x.group(2) == "Last" and cp_range_first is not None:
+                    cp_first = cp_range_first
+                    cp_name = "<%s>" % x.group(1)
+                    cp_range_first = None
+
+            cpd = CodePointData()
+            cpd.name = cp_name
+
+            # (2) General_Category
+
+            cpd.general_category = cols[2].strip()
+
+            # (3) Canonical_Combining_Class
+
+            ccc = cols[3].strip()
+            if ccc != "":
+                cpd.canonical_combining_class = int(ccc)
+
+            # (5) Decomposition_Type, Decomposition_Mapping
+
+            x = re.search("(<([^>]*)> )?(.+)", cols[5].strip())
+            if x:
+                if x.group(2) is not None:
+                    cpd.decomposition_type = x.group(2)
+                dcs_txt = x.group(3).split(" ")
+                dcs = []
+                for dc_txt in dcs_txt:
+                    dcs.append(int(dc_txt.strip(), 16))
+                cpd.decomposition_first = dcs
+
+            # (12) Simple_Uppercase_Mapping
+
+            code = cols[12].strip()
+            if code != "":
+                cpd.simple_uppercase_mapping = int(code, 16)
+
+            # (13) Simple_Lowercase_Mapping
+
+            code = cols[13].strip()
+            if code != "":
+                cpd.simple_lowercase_mapping = int(code, 16)
+
+            # (14) Simple_Titlecase_Mapping
+
+            code = cols[14].strip()
+            if code != "":
+                cpd.simple_titlecase_mapping = int(code, 16)
+
+            # Add range
+            CodePointRange(cp_first, cp_last, cpd)
+
+
+def create_cp_range_index():
+    """Index every range in ud_codepoints by its first code point."""
+    # Only mutates the module-level dict, so no global statement is needed.
+    ud_codepoints_index.update(
+        (rng.cp_first, rng) for rng in ud_codepoints
+    )
+
+
+def update_cp_index_tables(cp_first, cp_last, cp_pos):
+    global ud_codepoints_index8
+    global ud_codepoints_index16
+    global ud_codepoints_index16_reused
+    global ud_codepoints_index16_offsets
+    global ud_codepoints_index16_blocks
+    global ud_codepoints_index24
+    global ud_codepoints_index24_reused
+    global ud_codepoints_index24_offsets
+    global ud_codepoints_index24_blocks
+    global ud_codepoints_index32
+    global ud_codepoints_index32_reused
+    global ud_codepoints_index32_offsets
+    global ud_codepoints_index32_blocks
+    # Enter cp_pos into the four-level index for each code point of the range.
+    cp_range = range(cp_first, cp_last + 1)
+    # Block in use at each level; firstNN stays True until one is allocated.
+    id16_block = None
+    id24_block = None
+    id32_block = None
+    first16 = True
+    first24 = True
+    first32 = True
+    # NOTE(review): last_rcp is never updated in the loop; it stays cp_last.
+    last_rcp = cp_last
+    for rcp in cp_range:
+        # Index for first 8 bits of code point
+        id8_idx = rcp >> 24
+        if id8_idx in ud_codepoints_index8:
+            id16_block = ud_codepoints_index8[id8_idx]
+        elif (
+            id16_block is not None
+            and not first16
+            and ((last_rcp & 0xFFFFFF) == 0xFFFFFF or (rcp >> 24) != (last_rcp >> 24))
+        ):
+            ud_codepoints_index8[id8_idx] = id16_block
+            if id16_block not in ud_codepoints_index16_reused:
+                ud_codepoints_index16_reused[id16_block] = 1
+            ud_codepoints_index16_reused[id16_block] += 1
+        else:
+            first16 = False
+            id16_block = ud_codepoints_index16_blocks
+            ud_codepoints_index8[id8_idx] = id16_block
+            ud_codepoints_index16_offsets[id16_block] = rcp & (((1 << 8) - 1) << 24)
+            ud_codepoints_index16_blocks += 1
+
+        # Index for first 16 bits of code point
+        id16_idx = (id16_block << 8) + ((rcp >> 16) & 0xFF)
+        if id16_idx in ud_codepoints_index16:
+            id24_block = ud_codepoints_index16[id16_idx]
+        elif (
+            id24_block is not None
+            and not first24
+            and ((last_rcp & 0xFFFF) == 0xFFFF or (rcp >> 16) != (last_rcp >> 16))
+        ):
+            ud_codepoints_index16[id16_idx] = id24_block
+            if id24_block not in ud_codepoints_index24_reused:
+                ud_codepoints_index24_reused[id24_block] = 1
+            ud_codepoints_index24_reused[id24_block] += 1
+        else:
+            first24 = False
+            id24_block = ud_codepoints_index24_blocks
+            ud_codepoints_index16[id16_idx] = id24_block
+            ud_codepoints_index24_offsets[id24_block] = rcp & (((1 << 16) - 1) << 16)
+            ud_codepoints_index24_blocks += 1
+
+        # Index for first 24 bits of code point
+        id24_idx = (id24_block << 8) + ((rcp >> 8) & 0xFF)
+        if id24_idx in ud_codepoints_index24:
+            id32_block = ud_codepoints_index24[id24_idx]
+        elif (
+            id32_block is not None
+            and not first32
+            and ((last_rcp & 0xFF) == 0xFF or (rcp >> 8) != (last_rcp >> 8))
+        ):
+            ud_codepoints_index24[id24_idx] = id32_block
+            if id32_block not in ud_codepoints_index32_reused:
+                ud_codepoints_index32_reused[id32_block] = 1
+            ud_codepoints_index32_reused[id32_block] += 1
+        else:
+            first32 = False
+            id32_block = ud_codepoints_index32_blocks
+            ud_codepoints_index24[id24_idx] = id32_block
+            ud_codepoints_index32_offsets[id32_block] = rcp & (((1 << 24) - 1) << 8)
+            ud_codepoints_index32_blocks += 1
+
+        # Index for first 32 bits of code point
+        id32_idx = (id32_block << 8) + (rcp & 0xFF)
+        ud_codepoints_index32[id32_idx] = cp_pos
+
+
+def create_cp_index_tables():
+    global ud_codepoints
+
+    # Create code point index
+    for n in range(0, len(ud_codepoints)):
+        cpr = ud_codepoints[n]
+        cp_first = cpr.cp_first
+        cp_last = cpr.cp_last
+
+        update_cp_index_tables(cp_first, cp_last, n)
+
+
+def get_general_category_def(gc):
+    return "UNICODE_GENERAL_CATEGORY_%s" % gc.upper()
+
+
+def decomposition_type_def(dt):
+    return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper()
+
+
+def print_list(code_list):
+    last = len(code_list) - 1
+    n = 0
+    print("\t", end="")
+    for code in code_list:
+        print("0x%05x" % code, end="")
+        if n == last:
+            break
+        print(",", end="")
+
+        n += 1
+        if (n % 8) == 0:
+            print("")
+            print("\t", end="")
+            if (n % 10) == 0:
+                print("// INDEX %u" % n)
+                print("\t", end="")
+        else:
+            print(" ", end="")
+
+
+def print_top_message():
+    global ucd_dir
+    global source_files
+
+    print("/* This file is automatically generated by unicode-ucd-compile.py from:")
+    for sf in source_files:
+        print("     %s/%s" % (ucd_dir, sf))
+    print(" */")
+    print("")
+
+
+def write_tables_h():
+    global output_dir
+    global ud_decomposition_max_length
+    global ud_compositions_max_per_starter
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-tables.h", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+
+        print("#ifndef UNICODE_DATA_TABLES_H")
+        print("#define UNICODE_DATA_TABLES_H")
+        print("")
+        print_top_message()
+        print('#include "unicode-data-static.h"')
+        print("")
+        print("extern const struct unicode_code_point_data unicode_code_points[];")
+        print("")
+        print("extern const uint8_t unicode_code_points_index8[];")
+        print("extern const uint8_t unicode_code_points_index16[];")
+        print("extern const uint16_t unicode_code_points_index24[];")
+        print("extern const uint16_t unicode_code_points_index32[];")
+        print("")
+        print("#endif")
+
+    sys.stdout = orig_stdout
+
+
+def write_tables_c():
+    global output_dir
+    global ud_codepoints
+    global ud_decompositions
+    global ud_compositions
+    global ud_composition_primaries
+    global ud_case_mappings
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-tables.c", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+        print_top_message()
+
+        print('#include "lib.h"')
+        print('#include "unicode-data-tables.h"')
+        print("")
+        print("const struct unicode_code_point_data unicode_code_points[] = {")
+        print("\t{ // [0000] <invalid>")
+        print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_INVALID,")
+        print("\t},")
+        print("\t{ // [0001] <unassigned>")
+        print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_CN,")
+        print("\t},")
+        n = 2
+        for cpr in ud_codepoints:
+            cpd = cpr.data
+
+            if cpr.cp_last > cpr.cp_first:
+                range_str = "U+%04X..U+%04X" % (cpr.cp_first, cpr.cp_last)
+            else:
+                range_str = "U+%04X" % (cpr.cp_first)
+            print("\t{ // [%04X] %s: %s" % (n, range_str, cpd.name))
+            n = n + 1
+
+            print(
+                "\t\t.general_category = %s,"
+                % get_general_category_def(cpd.general_category)
+            )
+            if hasattr(cpd, "simple_titlecase_mapping"):
+                print(
+                    "\t\t.simple_titlecase_mapping = 0x%04X,"
+                    % cpd.simple_titlecase_mapping
+                )
+            print("\t},")
+        print("};")
+        print("")
+        # Code points index8
+        print("const uint8_t unicode_code_points_index8[] = {")
+        print("\t", end="")
+        for n in range(0, 256):
+            if n in ud_codepoints_index8:
+                print("0x%02x" % ud_codepoints_index8[n], end="")
+            else:
+                print("0x00", end="")
+            if n == 255:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("\n\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("};")
+        print("")
+        # Code points index16
+        print("const uint8_t unicode_code_points_index16[] = {")
+        print("\t// Block 0x00: <invalid>")
+        print("\t", end="")
+        last = (1 << 8) - 1
+        for n in range(0 << 8, last + 1):
+            print("0x00", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("\n\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("\t", end="")
+        last = (ud_codepoints_index16_blocks << 8) - 1
+        for n in range((1 << 8), last + 1):
+            if (n & ((1 << 8) - 1)) == 0:
+                blk_id = n >> 8
+                blk_offset = ud_codepoints_index16_offsets[blk_id]
+                blk_end = blk_offset + (1 << 24) - 1
+                print(
+                    "// Block 0x%02X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end),
+                    end="",
+                )
+                if blk_id in ud_codepoints_index16_reused:
+                    print(
+                        " (used %u times)" % ud_codepoints_index16_reused[blk_id], end=""
+                    )
+                print("")
+                print("\t", end="")
+            if n in ud_codepoints_index16:
+                print("0x%02x" % ud_codepoints_index16[n], end="")
+            elif ud_codepoints_index16_offsets[n >> 8] + ((n & 0xFF) << 16) > 0x10FFFF:
+                print("0x00", end="")
+            else:
+                print("0x01", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print("")
+        print("};")
+        print("")
+        # Code points index24
+        print("const uint16_t unicode_code_points_index24[] = {")
+        print("\t// Block 0x00: <invalid>")
+        print("\t", end="")
+        last = (1 << 8) - 1
+        for n in range((0 << 8), last + 1):
+            print("0x000", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("\t// Block 0x01: <unassigned>")
+        print("\t", end="")
+        last = (2 << 8) - 1
+        for n in range((1 << 8), last + 1):
+            print("0x001", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("\t", end="")
+        last = (ud_codepoints_index24_blocks << 8) - 1
+        for n in range((2 << 8), last + 1):
+            if (n & ((1 << 8) - 1)) == 0:
+                blk_id = n >> 8
+                blk_offset = ud_codepoints_index24_offsets[blk_id]
+                blk_end = blk_offset + (1 << 16) - 1
+                print(
+                    "// Block 0x%04X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end),
+                    end="",
+                )
+                if blk_id in ud_codepoints_index24_reused:
+                    print(
+                        " (used %u times)" % ud_codepoints_index24_reused[blk_id], end=""
+                    )
+                print("")
+                print("\t", end="")
+            if n in ud_codepoints_index24:
+                print("0x%03x" % ud_codepoints_index24[n], end="")
+            else:
+                print("0x001", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("};")
+        print("")
+        # Code points index32
+        print("const uint16_t unicode_code_points_index32[] = {")
+        print("\t// Block 0x000: <invalid>")
+        print("\t", end="")
+        last = (1 << 8) - 1
+        for n in range(0 << 8, last + 1):
+            print("0x0000", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("\t// Block 0x001: <unassigned>")
+        print("\t", end="")
+        last = (2 << 8) - 1
+        for n in range(1 << 8, last + 1):
+            print("0x0001", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("\t", end="")
+        last = (ud_codepoints_index32_blocks << 8) - 1
+        for n in range(2 << 8, last + 1):
+            if (n & ((1 << 8) - 1)) == 0:
+                blk_id = n >> 8
+                blk_offset = ud_codepoints_index32_offsets[blk_id]
+                blk_end = blk_offset + (1 << 8) - 1
+                print(
+                    "// Block 0x%04X: U+%06X - U+%06X" % (blk_id, blk_offset, blk_end),
+                    end="",
+                )
+                if blk_id in ud_codepoints_index32_reused:
+                    print(
+                        " (used %u times)" % ud_codepoints_index32_reused[blk_id], end=""
+                    )
+                print("")
+                print("\t", end="")
+            if n in ud_codepoints_index32:
+                print("0x%04x" % (ud_codepoints_index32[n] + 2), end="")
+            else:
+                print("0x0001", end="")
+            if n == last:
+                break
+            print(",", end="")
+
+            if ((n + 1) % 8) == 0:
+                print("")
+                print("\t", end="")
+            else:
+                print(" ", end="")
+        print(",")
+        print("};")
+
+    sys.stdout = orig_stdout
+
+
+def main():
+    global ucd_dir
+    global output_dir
+    global source_files
+
+    """Entry point."""
+    parser = argparse.ArgumentParser(
+        prog="unicode-ucd-compile.py",
+        description="Compile the Unicode Character Database files into C code",
+    )
+    parser.add_argument(
+        "ucd-dir",
+        type=str,
+        help="Directory containing the UCD files",
+    )
+    parser.add_argument(
+        "output-dir",
+        type=str,
+        help="Output directory where the C header and source files are written",
+    )
+    args = parser.parse_args()
+
+    ucd_dir = getattr(args, "ucd-dir")
+    output_dir = getattr(args, "output-dir")
+
+    read_ucd_files()
+    source_files.sort()
+
+    create_cp_range_index()
+
+    create_cp_index_tables()
+
+    write_tables_h()
+    write_tables_c()
+
+
+# Run the compiler only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()