]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: Start new Unicode Character Database implementation
author Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 21 Mar 2025 03:26:50 +0000 (04:26 +0100)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
It uses a pre-compiled trie structure and will in later commits feature full
support for the data necessary for Unicode normalization. Stuff needed for
lib-language can be migrated here as well.

.gitignore
src/lib/Makefile.am
src/lib/test-lib.inc
src/lib/test-unicode-data.c [new file with mode: 0644]
src/lib/unicode-data-static.h [new file with mode: 0644]
src/lib/unicode-data.c [new file with mode: 0644]
src/lib/unicode-data.h [new file with mode: 0644]
src/lib/unicode-ucd-compile.py [new file with mode: 0755]

index c1158e9ebf443b47e2618d4f00f92d1be8e9e69b..84e994bc969c1853e81bf822f8c4b1d4bd137b7b 100644 (file)
@@ -101,6 +101,10 @@ src/lib/event-filter-lexer.c
 src/lib/event-filter-parser.c
 src/lib/event-filter-parser.h
 src/lib/unicodemap.c
+src/lib/unicode-data-tables.c
+src/lib/unicode-data-tables.h
+src/lib/unicode-data-types.c
+src/lib/unicode-data-types.h
 src/lib-compression/bench-compression
 src/lib-language/PropList.txt
 src/lib-language/WordBreakProperty.txt
index d0e2715984245778e2dc75dcc693f5a7dd0ef9a3..1d98248ecf9eb6b881b6a8dcb5b10f087eace84f 100644 (file)
@@ -4,6 +4,8 @@ AM_CPPFLAGS = \
 noinst_LTLIBRARIES = liblib.la
 
 BUILT_SOURCES = $(srcdir)/unicodemap.c \
+               $(srcdir)/unicode-data-tables.c \
+               $(srcdir)/unicode-data-tables.h \
                event-filter-lexer.c \
                event-filter-parser.c \
                event-filter-parser.h
@@ -11,11 +13,17 @@ BUILT_SOURCES = $(srcdir)/unicodemap.c \
 UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
+       $(UCD_DIR)/DerivedCoreProperties.txt \
        $(UCD_DIR)/UnicodeData.txt
 
-EXTRA_DIST = unicodemap.c unicodemap.pl $(UCD_FILES)
-EXTRA_CLEAN = unicodemap.c
-
+EXTRA_DIST = \
+       unicodemap.c \
+       unicode-data-tables.c \
+       unicode-data-tables.h \
+       unicodemap.pl \
+       unicode-ucd-compile.py \
+       $(UCD_FILES)
+EXTRA_CLEAN = unicodemap.c unicode-data-tables.c
 
 # Squelch autoconf error about using .[ly] sources but not defining $(LEX)
 # and $(YACC).  Using false here avoids accidental use.
@@ -37,11 +45,16 @@ YACC=/bin/false
 # dependency, anything including the header will race the bison process.
 event-filter-parser.h: event-filter-parser.c
 
+$(UCD_DIR)/DerivedCoreProperties.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
 $(UCD_DIR)/UnicodeData.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt
 
 $(srcdir)/unicodemap.c: $(srcdir)/unicodemap.pl $(UCD_DIR)/UnicodeData.txt
        $(AM_V_GEN)$(PERL) $(srcdir)/unicodemap.pl < $(UCD_DIR)/UnicodeData.txt > $@
+$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h &: \
+       $(srcdir)/unicode-ucd-compile.py $(UCD_FILES)
+       $(AM_V_GEN)$(PYTHON) $(srcdir)/unicode-ucd-compile.py $(UCD_DIR) $(srcdir)
 
 liblib_la_LIBADD = $(LIBUNWIND_LIBS)
 liblib_la_SOURCES = \
@@ -200,6 +213,8 @@ liblib_la_SOURCES = \
        unlink-directory.c \
        unlink-old-files.c \
        unichar.c \
+       unicode-data-tables.c \
+       unicode-data.c \
        uri-util.c \
        utc-offset.c \
        utc-mktime.c \
@@ -360,6 +375,9 @@ headers = \
        unlink-directory.h \
        unlink-old-files.h \
        unichar.h \
+       unicode-data-static.h \
+       unicode-data-tables.h \
+       unicode-data.h \
        uri-util.h \
        utc-offset.h \
        utc-mktime.h \
@@ -370,7 +388,8 @@ test_programs = test-lib
 noinst_PROGRAMS = $(test_programs)
 
 test_lib_CPPFLAGS = \
-       -I$(top_srcdir)/src/lib-test
+       -I$(top_srcdir)/src/lib-test \
+       -DUCD_DIR=\"$(UCD_DIR)\"
 
 test_libs = \
        ../lib-test/libtest.la \
@@ -467,6 +486,7 @@ test_lib_SOURCES = \
        test-str-table.c \
        test-time-util.c \
        test-unichar.c \
+       test-unicode-data.c \
        test-utc-mktime.c \
        test-uri.c \
        test-wildcard-match.c
@@ -478,7 +498,7 @@ test_headers = \
 test_lib_LDADD = $(test_libs) -lm
 test_lib_DEPENDENCIES = $(test_libs)
 
-check-local:
+check-local: $(UCD_FILES)
        for bin in $(test_programs); do \
          if ! $(RUN_TEST) ./$$bin; then exit 1; fi; \
        done
index 2fceca2f9bccaf4a63c1e2dd90c024af59fe2295..d3a1f336697479f813c91383efac8c0d40501be3 100644 (file)
@@ -107,6 +107,7 @@ TEST(test_str_sanitize)
 TEST(test_str_table)
 TEST(test_time_util)
 TEST(test_unichar)
+TEST(test_unicode_data)
 TEST(test_uri)
 TEST(test_utc_mktime)
 TEST(test_wildcard_match)
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c
new file mode 100644 (file)
index 0000000..b01b57b
--- /dev/null
@@ -0,0 +1,169 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+#include "istream.h"
+#include "unicode-data.h"
+
+#include <fcntl.h>
+
+#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
+
+/* Verify one (comment-stripped, non-empty) line of UnicodeData.txt against
+   the compiled code point tables.
+
+   The line is split on ';' and must have at least 15 columns. Column 0 is the
+   code point in hex, column 1 the name, column 2 the General_Category and
+   columns 12..14 the simple upper/lower/title case mappings.
+
+   Ranges are given in the file as a "<..., First>" line followed by a
+   "<..., Last>" line. The start of such a range is remembered in a static
+   variable between calls, so lines must be fed in file order. */
+static void test_unicode_data_line(const char *line, unsigned int line_num)
+{
+       /* Start of a pending "<..., First>" range; 0 means none is pending */
+       static uint32_t cp_first = 0;
+
+       const char *const *columns = t_strsplit(line, ";");
+       if (str_array_length(columns) < 15) {
+               test_failed(t_strdup_printf(
+                       "Invalid data at %s:%u",
+                       UCD_UNICODE_DATA_TXT, line_num));
+               return;
+       }
+
+       const char *cp_hex = columns[0];
+       uint32_t cp;
+
+       if (str_to_uint32_hex(cp_hex, &cp) < 0) {
+               test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad code point",
+                               UCD_UNICODE_DATA_TXT, line_num));
+               return;
+       }
+
+       /* Parse Name */
+
+       const char *cp_name = columns[1];
+       size_t cp_name_len = strlen(cp_name);
+       const char *p;
+
+       /* The first comparison fails for an empty name, so the
+          cp_name_len - 1 index is only evaluated for non-empty names. */
+       if (cp_name[0] == '<' && cp_name[cp_name_len - 1] == '>') {
+               p = strchr(cp_name + 1, ',');
+               if (p != NULL) {
+                       if (strcmp(p, ", First>") == 0) {
+                               /* Only remember the range start; the data is
+                                  checked when the matching Last line is
+                                  processed. */
+                               cp_first = cp;
+                               return;
+                       } else if (strcmp(p, ", Last>") != 0) {
+                               test_failed(t_strdup_printf(
+                                       "Invalid data at %s:%u: "
+                                       "Bad code point range: %s",
+                                       UCD_UNICODE_DATA_TXT, line_num, cp_name));
+                               return;
+                       }
+               }
+       }
+
+       /* Parse General_Category */
+
+       uint8_t general_category =
+               (uint8_t)unicode_general_category_from_string(columns[2]);
+       if (general_category == UNICODE_GENERAL_CATEGORY_INVALID) {
+               test_failed(t_strdup_printf(
+                           "Invalid data at %s:%u: "
+                           "Bad General_Category for code point %"PRIu32": %s",
+                           UCD_UNICODE_DATA_TXT, line_num, cp, columns[2]));
+               return;
+       }
+       test_assert(!unicode_general_category_is_group(general_category));
+
+       /* Parse Simple_*case_Mapping */
+
+       /* An empty column leaves the corresponding mapping at 0. The upper-
+          and lowercase values are only validated for syntax here; the loop
+          below compares just the titlecase mapping. */
+       uint32_t simple_uppercase_mapping = 0;
+       uint32_t simple_lowercase_mapping = 0;
+       uint32_t simple_titlecase_mapping = 0;
+
+       if (*columns[12] != '\0' &&
+           str_to_uint32_hex(columns[12], &simple_uppercase_mapping) < 0) {
+               test_failed(t_strdup_printf(
+                           "Invalid data at %s:%u: "
+                           "Bad Simple_Uppercase_Mapping for code point %"PRIu32": %s",
+                           UCD_UNICODE_DATA_TXT, line_num, cp, columns[12]));
+               return;
+       }
+       if (*columns[13] != '\0' &&
+           str_to_uint32_hex(columns[13], &simple_lowercase_mapping) < 0) {
+               test_failed(t_strdup_printf(
+                           "Invalid data at %s:%u: "
+                           "Bad Simple_Lowercase_Mapping for code point %"PRIu32": %s",
+                           UCD_UNICODE_DATA_TXT, line_num, cp, columns[13]));
+               return;
+       }
+       if (*columns[14] != '\0' &&
+           str_to_uint32_hex(columns[14], &simple_titlecase_mapping) < 0) {
+               test_failed(t_strdup_printf(
+                           "Invalid data at %s:%u: "
+                           "Bad Simple_Titlecase_Mapping for code point %"PRIu32": %s",
+                           UCD_UNICODE_DATA_TXT, line_num, cp, columns[14]));
+               return;
+       }
+
+       /* Check data */
+
+       uint32_t cp_last = cp;
+
+       /* Without a pending First line this is a single code point */
+       if (cp_first == 0)
+               cp_first = cp;
+       /* Every code point in [cp_first, cp_last] must carry this line's
+          values; stop at the first failure. */
+       for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+               const struct unicode_code_point_data *cp_data =
+                       unicode_code_point_get_data(cp);
+
+               test_assert_idx(
+                       cp_data->general_category == general_category, cp);
+
+               test_assert_idx(
+                       cp_data->simple_titlecase_mapping == simple_titlecase_mapping,
+                       cp);
+       }
+
+       /* Range (or single code point) handled; clear the pending start */
+       cp_first = 0;
+}
+
+/* Run test_line() on every meaningful line of the UCD file `filename` found
+   in UCD_DIR. Text from the first '#' onwards is cut off and lines that are
+   empty afterwards are skipped. Reading stops at end of input or as soon as
+   the test has failed. Failure to open the file is fatal. */
+static void
+test_ucd_file(const char *filename,
+             void (*test_line)(const char *line, unsigned int line_num))
+{
+       const char *path = t_strconcat(UCD_DIR, "/", filename, NULL);
+       int fd = open(path, O_RDONLY);
+
+       if (fd < 0)
+               i_fatal("Failed to open '%s': %m", path);
+
+       test_begin(t_strdup_printf("unicode_data - %s", filename));
+
+       struct istream *stream = i_stream_create_fd_autoclose(&fd, 1024);
+       unsigned int lineno = 0;
+       char *text;
+
+       while (!test_has_failed() &&
+              (text = i_stream_read_next_line(stream)) != NULL) {
+               /* Count every line read, including skipped ones, so that
+                  reported numbers match the file. */
+               lineno++;
+
+               char *hash = strchr(text, '#');
+
+               if (hash != NULL)
+                       *hash = '\0';
+               if (*text != '\0') {
+                       T_BEGIN {
+                               test_line(text, lineno);
+                       } T_END;
+               }
+       }
+
+       i_stream_destroy(&stream);
+       test_end();
+}
+
+/* Test entry point: checks the compiled tables against UnicodeData.txt by
+   running test_unicode_data_line() on each line of that file. */
+void test_unicode_data(void)
+{
+       /* Check that UCD data files match with what is compiled. */
+       test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
+}
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h
new file mode 100644 (file)
index 0000000..0258548
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef UNICODE_DATA_STATIC_H
+#define UNICODE_DATA_STATIC_H
+
+/* UAX #44, Section 5.7.1: General Category Values
+
+   Encoding used by this enum: the group constants (LC, L, M, N, P, S, Z, C)
+   only use the upper four bits and leave the low four bits zero. Each
+   specific category is a group constant ORed with a non-zero index in the
+   low four bits. Lu/Ll/Lt are built on LC, whereas Lm/Lo are built on L
+   (L itself includes the LC bit). The value 0 is reserved for "invalid".
+ */
+enum unicode_general_category {
+       UNICODE_GENERAL_CATEGORY_INVALID = 0,
+
+       /* LC - Cased_Letter: Lu | Ll | Lt */
+       UNICODE_GENERAL_CATEGORY_LC = (1 << 4),
+       /* L - Letter: Lu | Ll | Lt | Lm | Lo */
+       UNICODE_GENERAL_CATEGORY_L = (1 << 5) | UNICODE_GENERAL_CATEGORY_LC,
+       /* M - Mark: Mn | Mc | Me */
+       UNICODE_GENERAL_CATEGORY_M = (2 << 5),
+       /* N - Number: Nd | Nl | No */
+       UNICODE_GENERAL_CATEGORY_N = (3 << 5),
+       /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+       UNICODE_GENERAL_CATEGORY_P = (4 << 5),
+       /* S - Symbol: Sm | Sc | Sk | So */
+       UNICODE_GENERAL_CATEGORY_S = (5 << 5),
+       /* Z - Separator: Zs | Zl | Zp */
+       UNICODE_GENERAL_CATEGORY_Z = (6 << 5),
+       /* C - Other: Cc | Cf | Cs | Co | Cn */
+       UNICODE_GENERAL_CATEGORY_C = (7 << 5),
+
+       /* Mask selecting the group part (upper four bits) of a value */
+       UNICODE_GENERAL_CATEGORY_GROUP_MASK = (0xf0),
+
+       /* Lu - Uppercase_Letter */
+       UNICODE_GENERAL_CATEGORY_LU = UNICODE_GENERAL_CATEGORY_LC | 1,
+       /* Ll - Lowercase_Letter */
+       UNICODE_GENERAL_CATEGORY_LL = UNICODE_GENERAL_CATEGORY_LC | 2,
+       /* Lt - Titlecase_Letter */
+       UNICODE_GENERAL_CATEGORY_LT = UNICODE_GENERAL_CATEGORY_LC | 3,
+       /* Lm - Modifier_Letter */
+       UNICODE_GENERAL_CATEGORY_LM = UNICODE_GENERAL_CATEGORY_L | 4,
+       /* Lo - Other_Letter */
+       UNICODE_GENERAL_CATEGORY_LO = UNICODE_GENERAL_CATEGORY_L | 5,
+
+       /* Mn - Nonspacing_Mark */
+       UNICODE_GENERAL_CATEGORY_MN = UNICODE_GENERAL_CATEGORY_M | 1,
+       /* Mc - Spacing_Mark */
+       UNICODE_GENERAL_CATEGORY_MC = UNICODE_GENERAL_CATEGORY_M | 2,
+       /* Me - Enclosing_Mark */
+       UNICODE_GENERAL_CATEGORY_ME = UNICODE_GENERAL_CATEGORY_M | 3,
+
+       /* Nd - Decimal_Number */
+       UNICODE_GENERAL_CATEGORY_ND = UNICODE_GENERAL_CATEGORY_N | 1,
+       /* Nl - Letter_Number */
+       UNICODE_GENERAL_CATEGORY_NL = UNICODE_GENERAL_CATEGORY_N | 2,
+       /* No - Other_Number */
+       UNICODE_GENERAL_CATEGORY_NO = UNICODE_GENERAL_CATEGORY_N | 3,
+
+       /* Pc - Connector_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PC = UNICODE_GENERAL_CATEGORY_P | 1,
+       /* Pd - Dash_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PD = UNICODE_GENERAL_CATEGORY_P | 2,
+       /* Ps - Open_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PS = UNICODE_GENERAL_CATEGORY_P | 3,
+       /* Pe - Close_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PE = UNICODE_GENERAL_CATEGORY_P | 4,
+       /* Pi - Initial_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PI = UNICODE_GENERAL_CATEGORY_P | 5,
+       /* Pf - Final_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PF = UNICODE_GENERAL_CATEGORY_P | 6,
+       /* Po - Other_Punctuation */
+       UNICODE_GENERAL_CATEGORY_PO = UNICODE_GENERAL_CATEGORY_P | 7,
+
+       /* Sm - Math_Symbol */
+       UNICODE_GENERAL_CATEGORY_SM = UNICODE_GENERAL_CATEGORY_S | 1,
+       /* Sc - Currency_Symbol */
+       UNICODE_GENERAL_CATEGORY_SC = UNICODE_GENERAL_CATEGORY_S | 2,
+       /* Sk - Modifier_Symbol */
+       UNICODE_GENERAL_CATEGORY_SK = UNICODE_GENERAL_CATEGORY_S | 3,
+       /* So - Other_Symbol */
+       UNICODE_GENERAL_CATEGORY_SO = UNICODE_GENERAL_CATEGORY_S | 4,
+
+       /* Zs - Space_Separator */
+       UNICODE_GENERAL_CATEGORY_ZS = UNICODE_GENERAL_CATEGORY_Z | 1,
+       /* Zl - Line_Separator */
+       UNICODE_GENERAL_CATEGORY_ZL = UNICODE_GENERAL_CATEGORY_Z | 2,
+       /* Zp - Paragraph_Separator */
+       UNICODE_GENERAL_CATEGORY_ZP = UNICODE_GENERAL_CATEGORY_Z | 3,
+
+       /* Cc - Control */
+       UNICODE_GENERAL_CATEGORY_CC = UNICODE_GENERAL_CATEGORY_C | 1,
+       /* Cf - Format */
+       UNICODE_GENERAL_CATEGORY_CF = UNICODE_GENERAL_CATEGORY_C | 2,
+       /* Cs - Surrogate */
+       UNICODE_GENERAL_CATEGORY_CS = UNICODE_GENERAL_CATEGORY_C | 3,
+       /* Co - Private_Use */
+       UNICODE_GENERAL_CATEGORY_CO = UNICODE_GENERAL_CATEGORY_C | 4,
+       /* Cn - Unassigned */
+       UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
+};
+
+/* Per-code-point record as returned by unicode_code_point_get_data(). */
+struct unicode_code_point_data {
+       /* General_Category, holding an enum unicode_general_category value
+          (all of which fit in 8 bits). */
+       uint8_t general_category; // Not yet used
+
+       /* Simple_Titlecase_Mapping; the unit test expects 0 here when
+          UnicodeData.txt leaves the column empty. */
+       uint32_t simple_titlecase_mapping;
+};
+
+#endif
diff --git a/src/lib/unicode-data.c b/src/lib/unicode-data.c
new file mode 100644 (file)
index 0000000..31e4990
--- /dev/null
@@ -0,0 +1,172 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+
+/* Translate a General_Category abbreviation as used in the UCD files into
+   the matching enum unicode_general_category value.
+
+   Accepted are the two-letter categories ("Lu", "Nd", ...), the two-letter
+   group "LC" and the one-letter groups ("L", "M", "N", "P", "S", "Z", "C").
+   Matching is case-sensitive. Returns UNICODE_GENERAL_CATEGORY_INVALID for
+   NULL, for strings that are not one or two characters long and for unknown
+   abbreviations. */
+uint8_t unicode_general_category_from_string(const char *str)
+{
+       if (str == NULL)
+               return UNICODE_GENERAL_CATEGORY_INVALID;
+
+       /* One-letter group names must pass this check as well; they are
+          recognized below by str[1] being the terminating NUL. */
+       size_t len = strlen(str);
+
+       if (len < 1 || len > 2)
+               return UNICODE_GENERAL_CATEGORY_INVALID;
+
+       switch (str[0]) {
+       case 'L':
+               switch (str[1]) {
+               /* Lu - Uppercase_Letter */
+               case 'u':
+                       return UNICODE_GENERAL_CATEGORY_LU;
+               /* Ll - Lowercase_Letter */
+               case 'l':
+                       return UNICODE_GENERAL_CATEGORY_LL;
+               /* Lt - Titlecase_Letter */
+               case 't':
+                       return UNICODE_GENERAL_CATEGORY_LT;
+               /* LC - Cased_Letter: Lu | Ll | Lt */
+               case 'C':
+                       return UNICODE_GENERAL_CATEGORY_LC;
+               /* Lm - Modifier_Letter */
+               case 'm':
+                       return UNICODE_GENERAL_CATEGORY_LM;
+               /* Lo - Other_Letter */
+               case 'o':
+                       return UNICODE_GENERAL_CATEGORY_LO;
+               /* L - Letter: Lu | Ll | Lt | Lm | Lo */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_L;
+               default:
+                       break;
+               }
+               break;
+       case 'M':
+               switch (str[1]) {
+               /* Mn - Nonspacing_Mark */
+               case 'n':
+                       return UNICODE_GENERAL_CATEGORY_MN;
+               /* Mc - Spacing_Mark */
+               case 'c':
+                       return UNICODE_GENERAL_CATEGORY_MC;
+               /* Me - Enclosing_Mark */
+               case 'e':
+                       return UNICODE_GENERAL_CATEGORY_ME;
+               /* M - Mark: Mn | Mc | Me */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_M;
+               default:
+                       break;
+               }
+               break;
+       case 'N':
+               switch (str[1]) {
+               /* Nd - Decimal_Number */
+               case 'd':
+                       return UNICODE_GENERAL_CATEGORY_ND;
+               /* Nl - Letter_Number */
+               case 'l':
+                       return UNICODE_GENERAL_CATEGORY_NL;
+               /* No - Other_Number */
+               case 'o':
+                       return UNICODE_GENERAL_CATEGORY_NO;
+               /* N - Number: Nd | Nl | No */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_N;
+               default:
+                       break;
+               }
+               break;
+       case 'P':
+               switch (str[1]) {
+               /* Pc - Connector_Punctuation */
+               case 'c':
+                       return UNICODE_GENERAL_CATEGORY_PC;
+               /* Pd - Dash_Punctuation */
+               case 'd':
+                       return UNICODE_GENERAL_CATEGORY_PD;
+               /* Ps - Open_Punctuation */
+               case 's':
+                       return UNICODE_GENERAL_CATEGORY_PS;
+               /* Pe - Close_Punctuation */
+               case 'e':
+                       return UNICODE_GENERAL_CATEGORY_PE;
+               /* Pi - Initial_Punctuation */
+               case 'i':
+                       return UNICODE_GENERAL_CATEGORY_PI;
+               /* Pf - Final_Punctuation */
+               case 'f':
+                       return UNICODE_GENERAL_CATEGORY_PF;
+               /* Po - Other_Punctuation */
+               case 'o':
+                       return UNICODE_GENERAL_CATEGORY_PO;
+               /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_P;
+               default:
+                       break;
+               }
+               break;
+       case 'S':
+               switch (str[1]) {
+               /* Sm - Math_Symbol */
+               case 'm':
+                       return UNICODE_GENERAL_CATEGORY_SM;
+               /* Sc - Currency_Symbol */
+               case 'c':
+                       return UNICODE_GENERAL_CATEGORY_SC;
+               /* Sk - Modifier_Symbol */
+               case 'k':
+                       return UNICODE_GENERAL_CATEGORY_SK;
+               /* So - Other_Symbol */
+               case 'o':
+                       return UNICODE_GENERAL_CATEGORY_SO;
+               /* S - Symbol: Sm | Sc | Sk | So */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_S;
+               default:
+                       break;
+               }
+               break;
+       case 'Z':
+               switch (str[1]) {
+               /* Zs - Space_Separator */
+               case 's':
+                       return UNICODE_GENERAL_CATEGORY_ZS;
+               /* Zl - Line_Separator */
+               case 'l':
+                       return UNICODE_GENERAL_CATEGORY_ZL;
+               /* Zp - Paragraph_Separator */
+               case 'p':
+                       return UNICODE_GENERAL_CATEGORY_ZP;
+               /* Z - Separator: Zs | Zl | Zp */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_Z;
+               default:
+                       break;
+               }
+               break;
+       case 'C':
+               switch (str[1]) {
+               /* Cc - Control */
+               case 'c':
+                       return UNICODE_GENERAL_CATEGORY_CC;
+               /* Cf - Format */
+               case 'f':
+                       return UNICODE_GENERAL_CATEGORY_CF;
+               /* Cs - Surrogate */
+               case 's':
+                       return UNICODE_GENERAL_CATEGORY_CS;
+               /* Co - Private_Use */
+               case 'o':
+                       return UNICODE_GENERAL_CATEGORY_CO;
+               /* Cn - Unassigned */
+               case 'n':
+                       return UNICODE_GENERAL_CATEGORY_CN;
+               /* C - Other: Cc | Cf | Cs | Co | Cn */
+               case '\0':
+                       return UNICODE_GENERAL_CATEGORY_C;
+               default:
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+       return UNICODE_GENERAL_CATEGORY_INVALID;
+}
diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h
new file mode 100644 (file)
index 0000000..6b156f1
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef UNICODE_DATA_H
+#define UNICODE_DATA_H
+
+#include "unicode-data-tables.h"
+
+/* Returns TRUE when gencat has no bits set in its low four bits, i.e. the
+   part that a specific (two-letter) category uses for its index. */
+static inline bool
+unicode_general_category_is_group(enum unicode_general_category gencat)
+{
+       unsigned int category_index = gencat & 0x0f;
+
+       return category_index == 0x00;
+}
+
+/* Look up the data record for code point cp.
+
+   The lookup walks four index tables, consuming one byte of cp per level
+   from the most significant to the least significant. Each table entry is a
+   block number; the block number shifted left by 8 plus the next byte of cp
+   is the index into the next table. The last table yields the index into
+   unicode_code_points[]. */
+static inline const struct unicode_code_point_data *
+unicode_code_point_get_data(uint32_t cp)
+{
+       unsigned int byte3 = cp >> 24;
+       unsigned int byte2 = (cp >> 16) & 0xFF;
+       unsigned int byte1 = (cp >> 8) & 0xFF;
+       unsigned int byte0 = cp & 0xFF;
+       unsigned int block, record;
+
+       block = unicode_code_points_index8[byte3];
+       block = unicode_code_points_index16[(block << 8) + byte2];
+       block = unicode_code_points_index24[(block << 8) + byte1];
+       record = unicode_code_points_index32[(block << 8) + byte0];
+
+       return &unicode_code_points[record];
+}
+
+uint8_t unicode_general_category_from_string(const char *str);
+
+#endif
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py
new file mode 100755 (executable)
index 0000000..acd3653
--- /dev/null
@@ -0,0 +1,805 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Dovecot authors, see the included COPYING file
+
+import argparse
+import bisect
+import copy
+import re
+import sys
+from pathlib import Path
+
+
+# Names of the UCD files opened so far (appended by UCDFileOpen.__enter__).
+source_files = []
+
+# Code point ranges (CodePointRange objects) plus two parallel lists holding
+# each range's first and last code point. CodePointRange keeps the three lists
+# in step and searches the latter two with bisect.
+ud_codepoints = []
+ud_codepoints_first = []
+ud_codepoints_last = []
+ud_codepoints_index = {}
+
+# State for the multi-level index tables. These are filled in by code outside
+# this part of the file; presumably they back the index8/16/24/32 lookup
+# tables of the generated C source -- TODO confirm.
+ud_codepoints_index8 = {}
+ud_codepoints_index16 = {}
+ud_codepoints_index16_reused = {}
+ud_codepoints_index16_offsets = {}
+ud_codepoints_index24 = {}
+ud_codepoints_index24_reused = {}
+ud_codepoints_index24_offsets = {}
+ud_codepoints_index32 = {}
+ud_codepoints_index32_reused = {}
+ud_codepoints_index32_offsets = {}
+ud_codepoints_index16_blocks = 1
+ud_codepoints_index24_blocks = 2
+ud_codepoints_index32_blocks = 2
+
+
+class UCDFileOpen:
+    def __init__(self, filename):
+        self.filename = filename
+
+    def __enter__(self):
+        global ucd_dir
+        global source_files
+
+        self.fd = open(ucd_dir + "/" + self.filename, mode="r", encoding="utf-8")
+        source_files.append(self.filename)
+        return self
+
+    def __exit__(self, exception_type, exception_value, exception_traceback):
+        self.fd.close()
+
+    def __str__(self):
+        return self.filename
+
+
+class CodePointData:
+    def mergeFrom(self, data, default=False):
+        for attr in dir(data):
+            if callable(getattr(data, attr)):
+                continue
+            if attr.startswith("__"):
+                continue
+            if default and hasattr(self, attr):
+                continue
+            setattr(self, attr, getattr(data, attr))
+
+
+class CodePointRange:
+    def insert(self, n):
+        global ud_codepoints
+        global ud_codepoints_first
+        global ud_codepoints_last
+
+        ud_codepoints.insert(n, self)
+        ud_codepoints_first.insert(n, self.cp_first)
+        ud_codepoints_last.insert(n, self.cp_last)
+
+    def modify(self, n):
+        global ud_codepoints
+        global ud_codepoints_first
+        global ud_codepoints_last
+
+        ud_codepoints_first[n] = self.cp_first
+        ud_codepoints_last[n] = self.cp_last
+
+    def __new__(cls, cp_first, cp_last, data, default=False):
+        global ud_codepoints
+        global ud_codepoints_first
+        global ud_codepoints_last
+
+        cprn_first = None
+
+        if len(ud_codepoints) == 0:
+            cprn = super().__new__(cls)
+            cprn.cp_first = cp_first
+            cprn.cp_last = cp_last
+            cprn.data = data
+            cprn.insert(0)
+            return
+
+        idx_first = bisect.bisect_left(ud_codepoints_first, cp_first)
+        idx_last = bisect.bisect_right(ud_codepoints_last, cp_last)
+        rng_first = idx_first - 1
+        rng_last = idx_last + 1
+        if rng_last >= len(ud_codepoints):
+            rng_last = len(ud_codepoints) - 1
+
+        # Check existing ranges
+        nn = None
+        n = rng_first
+        while n <= rng_last:
+            cpr = ud_codepoints[n]
+            pos = n
+            n += 1
+
+            # No overlap with this range
+            if cp_last < cpr.cp_first or cp_first > cpr.cp_last:
+                continue
+            # Exact match
+            if cp_first == cpr.cp_first and cp_last == cpr.cp_last:
+                cpr.data.mergeFrom(data, default)
+                return cpr
+            # New range fully envelops existing
+            if cp_first <= cpr.cp_first and cp_last >= cpr.cp_last:
+                # Split off range before
+                if cp_first < cpr.cp_first:
+                    cprn = super().__new__(cls)
+                    cprn.cp_first = cp_first
+                    cprn.cp_last = cpr.cp_first - 1
+                    cprn.data = copy.deepcopy(data)
+                    cprn.insert(pos)
+                    rng_last += 1
+                    if cprn_first is None:
+                        cprn_first = cprn
+                # Merge with existing
+                cpr.data.mergeFrom(data, default)
+                # Split off range after
+                if cp_last > cpr.cp_last:
+                    cp_first = cpr.cp_last + 1
+                    nn = pos + 1
+                    continue
+                break
+            # New range fully enveloped by existing
+            if cp_first > cpr.cp_first and cp_last < cpr.cp_last:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_last + 1
+                cprn.cp_last = cpr.cp_last
+                cprn.data = cpr.data
+                cprn.insert(pos + 1)
+                rng_last += 1
+                cpr.cp_last = cp_first - 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                return cprn
+            # New range aligns with beginning of existing
+            if cp_first == cpr.cp_first and cp_last < cpr.cp_last:
+                cpr.cp_first = cp_last + 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos)
+                rng_last += 1
+                return cprn
+            # New range aligns with end of existing
+            if cp_first > cpr.cp_first and cp_last == cpr.cp_last:
+                cpr.cp_last = cp_first - 1
+                cpr.modify(pos)
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                return cprn
+            # New range crosses the beginning of existing
+            if cp_first < cpr.cp_first and cp_last >= cpr.cp_first:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cpr.cp_first - 1
+                cprn.data = data
+                cprn.insert(pos)
+                rng_last += 1
+                cprn = super().__new__(cls)
+                cprn.cp_first = cpr.cp_first
+                cprn.cp_last = cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos + 1)
+                rng_last += 1
+                cpr.cp_first = cp_last + 1
+                cpr.modify(pos + 2)
+                return cprn
+            # New range crosses the end of existing
+            if cp_first <= cpr.cp_last and cp_last > cpr.cp_last:
+                cprn = super().__new__(cls)
+                cprn.cp_first = cp_first
+                cprn.cp_last = cpr.cp_last
+                cprn.data = copy.deepcopy(cpr.data)
+                cprn.data.mergeFrom(data, default)
+                cprn.insert(pos)
+                rng_last += 1
+                if cprn_first is None:
+                    cprn_first = cprn
+                tmp = cp_first
+                cp_first = cpr.cp_last + 1
+                cpr.cp_last = tmp - 1
+                cpr.modify(pos + 1)
+                nn = pos + 1
+                continue
+
+        cprn = super().__new__(cls)
+        cprn.cp_first = cp_first
+        cprn.cp_last = cp_last
+        cprn.data = data
+        if nn is None:
+            cprn.insert(idx_first)
+        else:
+            cprn.insert(nn)
+        if cprn_first is None:
+            cprn_first = cprn
+
+        return cprn_first
+
+
+def die(message):
+    module_filename = Path(__file__).name
+    print(f"{module_filename}: {message}", file=sys.stderr)
+    sys.exit(1)
+
+
+def parse_cp_range(column):
+    rng_hex = column.strip()
+    if len(rng_hex) == 0:
+        return None
+    rng = rng_hex.split("..")
+
+    cp_hex = rng[0].strip()
+    cp_first = int(cp_hex, 16)
+    cp_last = cp_first
+
+    if len(rng) > 1:
+        cp_hex = rng[1].strip()
+        cp_last = int(cp_hex, 16)
+
+    return (cp_first, cp_last)
+
+
+def read_ucd_files():
+    """Read UnicodeData.txt and register its code points.
+
+    Each data line (text after '#' removed, blank lines skipped) must have at
+    least 15 ';'-separated columns, otherwise the script dies. For every code
+    point, or every "<..., First>" / "<..., Last>" pair, a CodePointData
+    object is filled in and registered through CodePointRange.
+    """
+    global ud_decomposition_type_names
+    global ud_composition_exclusions
+
+    # UnicodeData.txt
+    with UCDFileOpen("UnicodeData.txt") as ucd:
+        # Start of a range announced by a "<..., First>" line, if any
+        cp_range_first = None
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 15:
+                die(f"{ucd}:{line_num}: Missing columns")
+
+            # (0) Code point in hex
+
+            cp_first = cp_last = int(cols[0].strip(), 16)
+
+            # (1) Name
+
+            cp_name = cols[1].strip()
+
+            x = re.search("<([^>]*), (First|Last)>", cp_name)
+            if x:
+                if x.group(2) == "First":
+                    # Only remember the start; the range is registered when
+                    # the matching Last line is read.
+                    cp_range_first = cp_first
+                    continue
+                if x.group(2) == "Last" and cp_range_first is not None:
+                    cp_first = cp_range_first
+                    cp_name = "<%s>" % x.group(1)
+                    cp_range_first = None
+
+            cpd = CodePointData()
+            cpd.name = cp_name
+
+            # (2) General_Category
+
+            cpd.general_category = cols[2].strip()
+
+            # (3) Canonical_Combining_Class
+
+            ccc = cols[3].strip()
+            if ccc != "":
+                cpd.canonical_combining_class = int(ccc)
+
+            # (5) Decomposition_Type, Decomposition_Mapping
+
+            # Optional "<type> " prefix followed by space-separated hex code
+            # points; an empty column does not match and sets nothing.
+            x = re.search("(<([^>]*)> )?(.+)", cols[5].strip())
+            if x:
+                if x.group(2) is not None:
+                    cpd.decomposition_type = x.group(2)
+                dcs_txt = x.group(3).split(" ")
+                dcs = []
+                for dc_txt in dcs_txt:
+                    dcs.append(int(dc_txt.strip(), 16))
+                cpd.decomposition_first = dcs
+
+            # (12) Simple_Uppercase_Mapping
+
+            code = cols[12].strip()
+            if code != "":
+                cpd.simple_uppercase_mapping = int(code, 16)
+
+            # (13) Simple_Lowercase_Mapping
+
+            code = cols[13].strip()
+            if code != "":
+                cpd.simple_lowercase_mapping = int(code, 16)
+
+            # (14) Simple_Titlecase_Mapping
+
+            code = cols[14].strip()
+            if code != "":
+                cpd.simple_titlecase_mapping = int(code, 16)
+
+            # Add range
+            CodePointRange(cp_first, cp_last, cpd)
+
+
+def create_cp_range_index():
+    """Map the first code point of every range in ud_codepoints to that
+    range in ud_codepoints_index."""
+    global ud_codepoints
+    global ud_codepoints_index
+
+    for cp_range in ud_codepoints:
+        first = cp_range.cp_first
+        ud_codepoints_index[first] = cp_range
+
+
+def update_cp_index_tables(cp_first, cp_last, cp_pos):
+    """Enter every code point of cp_first..cp_last into the index tables.
+
+    The tables form a four-level lookup keyed by successive bytes of the code
+    point, most significant byte first: index8 -> index16 -> index24 ->
+    index32. An entry at one level holds the number of a 256-entry block at
+    the next level; the final index32 entry holds cp_pos.
+
+    cp_first, cp_last: inclusive bounds of the code point range.
+    cp_pos: value stored in ud_codepoints_index32 for each code point.
+    """
+    global ud_codepoints_index8
+    global ud_codepoints_index16
+    global ud_codepoints_index16_reused
+    global ud_codepoints_index16_offsets
+    global ud_codepoints_index16_blocks
+    global ud_codepoints_index24
+    global ud_codepoints_index24_reused
+    global ud_codepoints_index24_offsets
+    global ud_codepoints_index24_blocks
+    global ud_codepoints_index32
+    global ud_codepoints_index32_reused
+    global ud_codepoints_index32_offsets
+    global ud_codepoints_index32_blocks
+
+    cp_range = range(cp_first, cp_last + 1)
+
+    # Block currently in use at each level; None until one is looked up or
+    # allocated during this call.
+    id16_block = None
+    id24_block = None
+    id32_block = None
+    # Cleared once this call has allocated a new block at that level; only
+    # then may later entries of this range share a block.
+    first16 = True
+    first24 = True
+    first32 = True
+
+    # Never reassigned below: the reuse checks compare against the last code
+    # point of the range.
+    last_rcp = cp_last
+    for rcp in cp_range:
+        # Index for first 8 bits of code point
+        id8_idx = rcp >> 24
+        if id8_idx in ud_codepoints_index8:
+            # Entry already exists: follow it
+            id16_block = ud_codepoints_index8[id8_idx]
+        elif (
+            id16_block is not None
+            and not first16
+            and ((last_rcp & 0xFFFFFF) == 0xFFFFFF or (rcp >> 24) != (last_rcp >> 24))
+        ):
+            # Share the current block: either the range ends exactly on a
+            # block boundary at this level, or rcp is not in the block that
+            # contains the end of the range.
+            ud_codepoints_index8[id8_idx] = id16_block
+            # The counter starts at 1 for the original use, so the first
+            # reuse leaves it at 2.
+            if id16_block not in ud_codepoints_index16_reused:
+                ud_codepoints_index16_reused[id16_block] = 1
+            ud_codepoints_index16_reused[id16_block] += 1
+        else:
+            # Allocate a new block and record the code point base (rcp with
+            # the low 24 bits cleared) it was created for.
+            first16 = False
+            id16_block = ud_codepoints_index16_blocks
+            ud_codepoints_index8[id8_idx] = id16_block
+            ud_codepoints_index16_offsets[id16_block] = rcp & (((1 << 8) - 1) << 24)
+            ud_codepoints_index16_blocks += 1
+
+        # Index for first 16 bits of code point
+        id16_idx = (id16_block << 8) + ((rcp >> 16) & 0xFF)
+        if id16_idx in ud_codepoints_index16:
+            id24_block = ud_codepoints_index16[id16_idx]
+        elif (
+            id24_block is not None
+            and not first24
+            and ((last_rcp & 0xFFFF) == 0xFFFF or (rcp >> 16) != (last_rcp >> 16))
+        ):
+            # Same sharing rule as above, one level down
+            ud_codepoints_index16[id16_idx] = id24_block
+            if id24_block not in ud_codepoints_index24_reused:
+                ud_codepoints_index24_reused[id24_block] = 1
+            ud_codepoints_index24_reused[id24_block] += 1
+        else:
+            # New block; base is rcp with the low 16 bits cleared
+            first24 = False
+            id24_block = ud_codepoints_index24_blocks
+            ud_codepoints_index16[id16_idx] = id24_block
+            ud_codepoints_index24_offsets[id24_block] = rcp & (((1 << 16) - 1) << 16)
+            ud_codepoints_index24_blocks += 1
+
+        # Index for first 24 bits of code point
+        id24_idx = (id24_block << 8) + ((rcp >> 8) & 0xFF)
+        if id24_idx in ud_codepoints_index24:
+            id32_block = ud_codepoints_index24[id24_idx]
+        elif (
+            id32_block is not None
+            and not first32
+            and ((last_rcp & 0xFF) == 0xFF or (rcp >> 8) != (last_rcp >> 8))
+        ):
+            # Same sharing rule as above, one level down
+            # NOTE(review): the block being shared is the first one this call
+            # allocated; if the range starts mid-block that block is only
+            # partly covered by the range - confirm sharing it cannot mis-map
+            # the uncovered entries.
+            ud_codepoints_index24[id24_idx] = id32_block
+            if id32_block not in ud_codepoints_index32_reused:
+                ud_codepoints_index32_reused[id32_block] = 1
+            ud_codepoints_index32_reused[id32_block] += 1
+        else:
+            # New block; base is rcp with the low 8 bits cleared
+            first32 = False
+            id32_block = ud_codepoints_index32_blocks
+            ud_codepoints_index24[id24_idx] = id32_block
+            ud_codepoints_index32_offsets[id32_block] = rcp & (((1 << 24) - 1) << 8)
+            ud_codepoints_index32_blocks += 1
+
+        # Index for first 32 bits of code point
+        id32_idx = (id32_block << 8) + (rcp & 0xFF)
+        ud_codepoints_index32[id32_idx] = cp_pos
+
+
+def create_cp_index_tables():
+    """Enter every range of ud_codepoints into the index tables, using the
+    range's position in ud_codepoints as the indexed value."""
+    global ud_codepoints
+
+    # Create code point index
+    for pos, cp_range in enumerate(ud_codepoints):
+        update_cp_index_tables(cp_range.cp_first, cp_range.cp_last, pos)
+
+
+def get_general_category_def(gc):
+    """Return the C constant name for General_Category abbreviation gc."""
+    return "UNICODE_GENERAL_CATEGORY_" + gc.upper()
+
+
+def decomposition_type_def(dt):
+    """Return the C constant name for decomposition type dt."""
+    return "UNICODE_DECOMPOSITION_TYPE_" + dt.upper()
+
+
+def print_list(code_list):
+    """Print code_list as comma-separated 5-digit hex literals.
+
+    Output starts with a tab, wraps (newline + tab) after every 8th entry
+    and has no trailing comma or newline. An "// INDEX n" comment line is
+    inserted at a wrap point whenever the entry count n is also a multiple
+    of 10.
+    """
+    final_idx = len(code_list) - 1
+    print("\t", end="")
+    for idx, code in enumerate(code_list):
+        print("0x%05x" % code, end="")
+        if idx == final_idx:
+            break
+
+        count = idx + 1
+        if count % 8 != 0:
+            print(", ", end="")
+            continue
+
+        # End of a row of 8 entries: start a new indented line
+        print(",\n\t", end="")
+        if count % 10 == 0:
+            print("// INDEX %u\n\t" % count, end="")
+
+
+def print_top_message():
+    """Print the "automatically generated" C comment that lists every file
+    in source_files (prefixed with ucd_dir), followed by an empty line."""
+    global ucd_dir
+    global source_files
+
+    header = ["/* This file is automatically generated by unicode-ucd-compile.py from:"]
+    header.extend("     %s/%s" % (ucd_dir, sf) for sf in source_files)
+    header.append(" */")
+    header.append("")
+    print("\n".join(header))
+
+
+def write_tables_h():
+    """Write unicode-data-tables.h into output_dir.
+
+    The header declares the code point data table and the four index arrays
+    as extern. sys.stdout is pointed at the output file while printing and is
+    always restored afterwards, even when printing raises.
+    """
+    global output_dir
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-tables.h", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+        try:
+            print("#ifndef UNICODE_DATA_TABLES_H")
+            print("#define UNICODE_DATA_TABLES_H")
+            print("")
+            print_top_message()
+            print('#include "unicode-data-static.h"')
+            print("")
+            print("extern const struct unicode_code_point_data unicode_code_points[];")
+            print("")
+            print("extern const uint8_t unicode_code_points_index8[];")
+            print("extern const uint8_t unicode_code_points_index16[];")
+            print("extern const uint16_t unicode_code_points_index24[];")
+            print("extern const uint16_t unicode_code_points_index32[];")
+            print("")
+            print("#endif")
+        finally:
+            # Restore before the file is closed so that a failure above does
+            # not leave sys.stdout pointing at a closed file.
+            sys.stdout = orig_stdout
+
+
+def _print_entry_separator(n):
+    """Print the separator after table entry number n: a comma, then a line
+    break plus tab after every 8th entry or a single space otherwise."""
+    print(",", end="")
+    if ((n + 1) % 8) == 0:
+        print("")
+        print("\t", end="")
+    else:
+        print(" ", end="")
+
+
+def _print_constant_block(value):
+    """Print one indented 256-entry block in which every entry is the literal
+    text value, terminated by a comma and a newline."""
+    print("\t", end="")
+    for n in range(0, 256):
+        print(value, end="")
+        if n == 255:
+            break
+        _print_entry_separator(n)
+    print(",")
+
+
+def _print_block_header(fmt, blk_id, blk_offset, blk_size, reused):
+    """Print the comment line that precedes a block: its number, the code
+    point span blk_offset..blk_offset+blk_size-1 and, if the block appears in
+    reused, its use count. Ends by starting the next indented line."""
+    print(fmt % (blk_id, blk_offset, blk_offset + blk_size - 1), end="")
+    if blk_id in reused:
+        print(" (used %u times)" % reused[blk_id], end="")
+    print("")
+    print("\t", end="")
+
+
+def _print_code_points_table():
+    """Print the unicode_code_points[] array: two fixed entries (<invalid>,
+    <unassigned>) followed by one entry per range in ud_codepoints."""
+    print("const struct unicode_code_point_data unicode_code_points[] = {")
+    print("\t{ // [0000] <invalid>")
+    print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_INVALID,")
+    print("\t},")
+    print("\t{ // [0001] <unassigned>")
+    print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_CN,")
+    print("\t},")
+    # Entries 0 and 1 are the fixed ones above, so the ranges start at 2
+    for n, cpr in enumerate(ud_codepoints, start=2):
+        cpd = cpr.data
+
+        if cpr.cp_last > cpr.cp_first:
+            range_str = "U+%04X..U+%04X" % (cpr.cp_first, cpr.cp_last)
+        else:
+            range_str = "U+%04X" % (cpr.cp_first)
+        print("\t{ // [%04X] %s: %s" % (n, range_str, cpd.name))
+
+        print(
+            "\t\t.general_category = %s,"
+            % get_general_category_def(cpd.general_category)
+        )
+        if hasattr(cpd, "simple_titlecase_mapping"):
+            print(
+                "\t\t.simple_titlecase_mapping = 0x%04X,"
+                % cpd.simple_titlecase_mapping
+            )
+        print("\t},")
+    print("};")
+    print("")
+
+
+def _print_index8_table():
+    """Print unicode_code_points_index8[]: 256 entries, 0x00 where
+    ud_codepoints_index8 has no entry."""
+    print("const uint8_t unicode_code_points_index8[] = {")
+    print("\t", end="")
+    for n in range(0, 256):
+        if n in ud_codepoints_index8:
+            print("0x%02x" % ud_codepoints_index8[n], end="")
+        else:
+            print("0x00", end="")
+        if n == 255:
+            break
+        _print_entry_separator(n)
+    print(",")
+    print("};")
+    print("")
+
+
+def _print_index16_table():
+    """Print unicode_code_points_index16[]: the all-zero <invalid> block 0
+    followed by the blocks recorded in ud_codepoints_index16."""
+    print("const uint8_t unicode_code_points_index16[] = {")
+    print("\t// Block 0x00: <invalid>")
+    _print_constant_block("0x00")
+    print("\t", end="")
+    last = (ud_codepoints_index16_blocks << 8) - 1
+    for n in range((1 << 8), last + 1):
+        if (n & 0xFF) == 0:
+            blk_id = n >> 8
+            _print_block_header(
+                "// Block 0x%02X: U+%06X..U+%06X",
+                blk_id,
+                ud_codepoints_index16_offsets[blk_id],
+                1 << 24,
+                ud_codepoints_index16_reused,
+            )
+        if n in ud_codepoints_index16:
+            print("0x%02x" % ud_codepoints_index16[n], end="")
+        elif ud_codepoints_index16_offsets[n >> 8] + ((n & 0xFF) << 16) > 0x10FFFF:
+            # Beyond U+10FFFF: refer to the <invalid> index24 block
+            print("0x00", end="")
+        else:
+            # No data recorded: refer to the <unassigned> index24 block
+            print("0x01", end="")
+        if n == last:
+            break
+        _print_entry_separator(n)
+    print("")
+    print("};")
+    print("")
+
+
+def _print_index24_table():
+    """Print unicode_code_points_index24[]: the fixed <invalid> and
+    <unassigned> blocks followed by the blocks in ud_codepoints_index24."""
+    print("const uint16_t unicode_code_points_index24[] = {")
+    print("\t// Block 0x00: <invalid>")
+    _print_constant_block("0x000")
+    print("\t// Block 0x01: <unassigned>")
+    _print_constant_block("0x001")
+    print("\t", end="")
+    last = (ud_codepoints_index24_blocks << 8) - 1
+    for n in range((2 << 8), last + 1):
+        if (n & 0xFF) == 0:
+            blk_id = n >> 8
+            _print_block_header(
+                "// Block 0x%04X: U+%06X..U+%06X",
+                blk_id,
+                ud_codepoints_index24_offsets[blk_id],
+                1 << 16,
+                ud_codepoints_index24_reused,
+            )
+        if n in ud_codepoints_index24:
+            print("0x%03x" % ud_codepoints_index24[n], end="")
+        else:
+            print("0x001", end="")
+        if n == last:
+            break
+        _print_entry_separator(n)
+    print(",")
+    print("};")
+    print("")
+
+
+def _print_index32_table():
+    """Print unicode_code_points_index32[]: the fixed <invalid> and
+    <unassigned> blocks followed by the blocks in ud_codepoints_index32."""
+    print("const uint16_t unicode_code_points_index32[] = {")
+    print("\t// Block 0x000: <invalid>")
+    _print_constant_block("0x0000")
+    print("\t// Block 0x001: <unassigned>")
+    _print_constant_block("0x0001")
+    print("\t", end="")
+    last = (ud_codepoints_index32_blocks << 8) - 1
+    for n in range(2 << 8, last + 1):
+        if (n & 0xFF) == 0:
+            blk_id = n >> 8
+            _print_block_header(
+                "// Block 0x%04X: U+%06X - U+%06X",
+                blk_id,
+                ud_codepoints_index32_offsets[blk_id],
+                1 << 8,
+                ud_codepoints_index32_reused,
+            )
+        if n in ud_codepoints_index32:
+            # Stored positions index ud_codepoints; the C table has two
+            # fixed entries in front of those, hence the +2.
+            print("0x%04x" % (ud_codepoints_index32[n] + 2), end="")
+        else:
+            print("0x0001", end="")
+        if n == last:
+            break
+        _print_entry_separator(n)
+    print(",")
+    print("};")
+
+
+def write_tables_c():
+    """Write unicode-data-tables.c into output_dir.
+
+    The file defines the code point data table and the four index arrays.
+    sys.stdout is pointed at the output file while printing and is always
+    restored afterwards, even when printing raises.
+    """
+    global output_dir
+    global ud_codepoints
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-tables.c", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+        try:
+            print_top_message()
+
+            print('#include "lib.h"')
+            print('#include "unicode-data-tables.h"')
+            print("")
+            _print_code_points_table()
+            _print_index8_table()
+            _print_index16_table()
+            _print_index24_table()
+            _print_index32_table()
+        finally:
+            # Restore before the file is closed so that a failure above does
+            # not leave sys.stdout pointing at a closed file.
+            sys.stdout = orig_stdout
+
+
+def main():
+    """Entry point.
+
+    Parses the two positional command line arguments (UCD input directory and
+    output directory), reads the UCD files, builds the code point indexes and
+    writes the generated C header and source files.
+    """
+    global ucd_dir
+    global output_dir
+    global source_files
+
+    parser = argparse.ArgumentParser(
+        prog="unicode-ucd-compile.py",
+        description="Compile the Unicode Character Database files into C code",
+    )
+    parser.add_argument(
+        "ucd-dir",
+        type=str,
+        help="Directory containing the UCD files",
+    )
+    parser.add_argument(
+        "output-dir",
+        type=str,
+        help="Output directory where the C header and source files are written",
+    )
+    args = parser.parse_args()
+
+    # The positional names contain a dash, so they are not reachable through
+    # plain attribute syntax.
+    ucd_dir = getattr(args, "ucd-dir")
+    output_dir = getattr(args, "output-dir")
+
+    read_ucd_files()
+    # Sort so that the file list in the generated header comment is stable
+    source_files.sort()
+
+    create_cp_range_index()
+
+    create_cp_index_tables()
+
+    write_tables_h()
+    write_tables_c()
+
+
+# Run the compiler only when this file is executed as a script.
+if __name__ == "__main__":
+    main()