--- /dev/null
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+#include "istream.h"
+#include "unicode-data.h"
+
+#include <fcntl.h>
+
+#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
+
+/* Parse one optional Simple_*case_Mapping column (hexadecimal code point).
+   An empty column leaves *mapping_r at 0. When the column is present but is
+   not valid hex, a test failure naming the property is registered and FALSE
+   is returned. */
+static bool
+test_unicode_data_parse_mapping(const char *column, const char *property,
+				uint32_t cp, unsigned int line_num,
+				uint32_t *mapping_r)
+{
+	*mapping_r = 0;
+	if (*column == '\0' || str_to_uint32_hex(column, mapping_r) >= 0)
+		return TRUE;
+
+	test_failed(t_strdup_printf(
+		"Invalid data at %s:%u: "
+		"Bad %s for code point %"PRIu32": %s",
+		UCD_UNICODE_DATA_TXT, line_num, property, cp, column));
+	return FALSE;
+}
+
+/* Check one non-empty line of UnicodeData.txt against the compiled-in data.
+
+   The line is split on ';' and must have at least 15 columns. A
+   "<..., First>" line only records the start of a range; the matching
+   "<..., Last>" line (or any ordinary line) then verifies General_Category
+   and Simple_Titlecase_Mapping for every code point it covers. Problems are
+   reported through test_failed()/test_assert*(). */
+static void test_unicode_data_line(const char *line, unsigned int line_num)
+{
+	/* Range start remembered between a "First" and its "Last" line. A
+	   separate flag is used instead of treating 0 as "unset", because
+	   U+0000 is itself a valid code point. */
+	static uint32_t cp_first = 0;
+	static bool cp_first_set = FALSE;
+
+	const char *const *columns = t_strsplit(line, ";");
+	if (str_array_length(columns) < 15) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u",
+			UCD_UNICODE_DATA_TXT, line_num));
+		return;
+	}
+
+	/* Parse code point */
+
+	uint32_t cp;
+
+	if (str_to_uint32_hex(columns[0], &cp) < 0) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Bad code point",
+			UCD_UNICODE_DATA_TXT, line_num));
+		return;
+	}
+
+	/* Parse Name; only the "<..., First>" / "<..., Last>" markers matter */
+
+	const char *cp_name = columns[1];
+	size_t cp_name_len = strlen(cp_name);
+
+	/* The '<' test short-circuits for an empty name, so the
+	   cp_name_len - 1 index is never evaluated with a zero length. */
+	if (cp_name[0] == '<' && cp_name[cp_name_len - 1] == '>') {
+		const char *p = strchr(cp_name + 1, ',');
+
+		if (p != NULL && strcmp(p, ", First>") == 0) {
+			cp_first = cp;
+			cp_first_set = TRUE;
+			return;
+		}
+		if (p != NULL && strcmp(p, ", Last>") != 0) {
+			test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad code point range: %s",
+				UCD_UNICODE_DATA_TXT, line_num, cp_name));
+			return;
+		}
+	}
+
+	/* Parse General_Category */
+
+	uint8_t general_category =
+		(uint8_t)unicode_general_category_from_string(columns[2]);
+	if (general_category == UNICODE_GENERAL_CATEGORY_INVALID) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Bad General_Category for code point %"PRIu32": %s",
+			UCD_UNICODE_DATA_TXT, line_num, cp, columns[2]));
+		return;
+	}
+	test_assert(!unicode_general_category_is_group(general_category));
+
+	/* Parse Simple_*case_Mapping. The upper/lowercase values are only
+	   validated for syntax here; the loop below compares the titlecase
+	   mapping alone. */
+
+	uint32_t simple_uppercase_mapping;
+	uint32_t simple_lowercase_mapping;
+	uint32_t simple_titlecase_mapping;
+
+	if (!test_unicode_data_parse_mapping(columns[12],
+					     "Simple_Uppercase_Mapping",
+					     cp, line_num,
+					     &simple_uppercase_mapping) ||
+	    !test_unicode_data_parse_mapping(columns[13],
+					     "Simple_Lowercase_Mapping",
+					     cp, line_num,
+					     &simple_lowercase_mapping) ||
+	    !test_unicode_data_parse_mapping(columns[14],
+					     "Simple_Titlecase_Mapping",
+					     cp, line_num,
+					     &simple_titlecase_mapping))
+		return;
+
+	/* Check data */
+
+	uint32_t cp_last = cp;
+
+	if (!cp_first_set)
+		cp_first = cp;
+	for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+
+		test_assert_idx(
+			cp_data->general_category == general_category, cp);
+
+		test_assert_idx(
+			cp_data->simple_titlecase_mapping == simple_titlecase_mapping,
+			cp);
+	}
+
+	/* The range (if any) has been consumed */
+	cp_first = 0;
+	cp_first_set = FALSE;
+}
+
+/* Run test_line() on every non-empty line of the UCD file `filename`
+   (looked up under UCD_DIR) as one test case named "unicode_data - <file>".
+
+   Everything from the first '#' on a line is discarded as a comment, and
+   lines that are empty afterwards are skipped. line_num counts physical
+   lines, including skipped ones. Processing stops at the first test
+   failure. Failure to open the file is fatal; a read error fails the test. */
+static void
+test_ucd_file(const char *filename,
+	      void (*test_line)(const char *line, unsigned int line_num))
+{
+	const char *file_path = t_strconcat(UCD_DIR, "/", filename, NULL);
+	struct istream *input;
+	int fd;
+
+	fd = open(file_path, O_RDONLY);
+	if (fd < 0)
+		i_fatal("Failed to open '%s': %m", file_path);
+
+	test_begin(t_strdup_printf("unicode_data - %s", filename));
+
+	input = i_stream_create_fd_autoclose(&fd, 1024);
+
+	unsigned int line_num = 0;
+
+	while (!test_has_failed()) {
+		char *line = i_stream_read_next_line(input);
+
+		if (line == NULL)
+			break;
+		line_num++;
+
+		/* Strip trailing comment */
+		char *comment = strchr(line, '#');
+
+		if (comment != NULL)
+			*comment = '\0';
+		if (*line == '\0')
+			continue;
+
+		/* Each line gets its own data stack frame so per-line
+		   allocations are released right away. */
+		T_BEGIN {
+			test_line(line, line_num);
+		} T_END;
+	}
+
+	/* A NULL line is returned for read errors as well as for EOF; do not
+	   let a failed read pass as a successfully verified file. */
+	if (input->stream_errno != 0) {
+		test_failed(t_strdup_printf(
+			"Failed to read '%s': %s",
+			file_path, i_stream_get_error(input)));
+	}
+
+	i_stream_destroy(&input);
+	test_end();
+}
+
+/* Test entry point: runs test_unicode_data_line() over every line of
+   UnicodeData.txt via test_ucd_file(). */
+void test_unicode_data(void)
+{
+ /* Check that UCD data files match with what is compiled. */
+ test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
+}
--- /dev/null
+#ifndef UNICODE_DATA_STATIC_H
+#define UNICODE_DATA_STATIC_H
+
+/* UAX #44, Section 5.7.1: General Category Values
+
+   Encoding used here: the upper four bits
+   (UNICODE_GENERAL_CATEGORY_GROUP_MASK, 0xf0) hold the group and the lower
+   four bits hold a small per-group index identifying the specific
+   two-letter category. A value whose lower four bits are zero is a group
+   (L, LC, M, N, P, S, Z, C). All values fit in a uint8_t.
+
+   NOTE(review): L is defined as (1 << 5) | LC, i.e. 0x30, while Lu/Ll/Lt are
+   built on LC alone (0x11..0x13) and Lm/Lo on L (0x34, 0x35). Masking Lu
+   with the group mask therefore yields LC, not L, and Lm/Lo also carry the
+   LC bit. Confirm that the users of these values (not visible here) test
+   group membership accordingly.
+ */
+enum unicode_general_category {
+ /* Not a valid category; also what the string parser returns on error */
+ UNICODE_GENERAL_CATEGORY_INVALID = 0,
+
+ /* LC - Cased_Letter: Lu | Ll | Lt */
+ UNICODE_GENERAL_CATEGORY_LC = (1 << 4),
+ /* L - Letter: Lu | Ll | Lt | Lm | Lo */
+ UNICODE_GENERAL_CATEGORY_L = (1 << 5) | UNICODE_GENERAL_CATEGORY_LC,
+ /* M - Mark: Mn | Mc | Me */
+ UNICODE_GENERAL_CATEGORY_M = (2 << 5),
+ /* N - Number: Nd | Nl | No */
+ UNICODE_GENERAL_CATEGORY_N = (3 << 5),
+ /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+ UNICODE_GENERAL_CATEGORY_P = (4 << 5),
+ /* S - Symbol: Sm | Sc | Sk | So */
+ UNICODE_GENERAL_CATEGORY_S = (5 << 5),
+ /* Z - Separator: Zs | Zl | Zp */
+ UNICODE_GENERAL_CATEGORY_Z = (6 << 5),
+ /* C - Other: Cc | Cf | Cs | Co | Cn */
+ UNICODE_GENERAL_CATEGORY_C = (7 << 5),
+
+ /* Mask selecting the group bits of a category value */
+ UNICODE_GENERAL_CATEGORY_GROUP_MASK = (0xf0),
+
+ /* Lu - Uppercase_Letter */
+ UNICODE_GENERAL_CATEGORY_LU = UNICODE_GENERAL_CATEGORY_LC | 1,
+ /* Ll - Lowercase_Letter */
+ UNICODE_GENERAL_CATEGORY_LL = UNICODE_GENERAL_CATEGORY_LC | 2,
+ /* Lt - Titlecase_Letter */
+ UNICODE_GENERAL_CATEGORY_LT = UNICODE_GENERAL_CATEGORY_LC | 3,
+ /* Lm - Modifier_Letter */
+ UNICODE_GENERAL_CATEGORY_LM = UNICODE_GENERAL_CATEGORY_L | 4,
+ /* Lo - Other_Letter */
+ UNICODE_GENERAL_CATEGORY_LO = UNICODE_GENERAL_CATEGORY_L | 5,
+
+ /* Mn - Nonspacing_Mark */
+ UNICODE_GENERAL_CATEGORY_MN = UNICODE_GENERAL_CATEGORY_M | 1,
+ /* Mc - Spacing_Mark */
+ UNICODE_GENERAL_CATEGORY_MC = UNICODE_GENERAL_CATEGORY_M | 2,
+ /* Me - Enclosing_Mark */
+ UNICODE_GENERAL_CATEGORY_ME = UNICODE_GENERAL_CATEGORY_M | 3,
+
+ /* Nd - Decimal_Number */
+ UNICODE_GENERAL_CATEGORY_ND = UNICODE_GENERAL_CATEGORY_N | 1,
+ /* Nl - Letter_Number */
+ UNICODE_GENERAL_CATEGORY_NL = UNICODE_GENERAL_CATEGORY_N | 2,
+ /* No - Other_Number */
+ UNICODE_GENERAL_CATEGORY_NO = UNICODE_GENERAL_CATEGORY_N | 3,
+
+ /* Pc - Connector_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PC = UNICODE_GENERAL_CATEGORY_P | 1,
+ /* Pd - Dash_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PD = UNICODE_GENERAL_CATEGORY_P | 2,
+ /* Ps - Open_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PS = UNICODE_GENERAL_CATEGORY_P | 3,
+ /* Pe - Close_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PE = UNICODE_GENERAL_CATEGORY_P | 4,
+ /* Pi - Initial_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PI = UNICODE_GENERAL_CATEGORY_P | 5,
+ /* Pf - Final_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PF = UNICODE_GENERAL_CATEGORY_P | 6,
+ /* Po - Other_Punctuation */
+ UNICODE_GENERAL_CATEGORY_PO = UNICODE_GENERAL_CATEGORY_P | 7,
+
+ /* Sm - Math_Symbol */
+ UNICODE_GENERAL_CATEGORY_SM = UNICODE_GENERAL_CATEGORY_S | 1,
+ /* Sc - Currency_Symbol */
+ UNICODE_GENERAL_CATEGORY_SC = UNICODE_GENERAL_CATEGORY_S | 2,
+ /* Sk - Modifier_Symbol */
+ UNICODE_GENERAL_CATEGORY_SK = UNICODE_GENERAL_CATEGORY_S | 3,
+ /* So - Other_Symbol */
+ UNICODE_GENERAL_CATEGORY_SO = UNICODE_GENERAL_CATEGORY_S | 4,
+
+ /* Zs - Space_Separator */
+ UNICODE_GENERAL_CATEGORY_ZS = UNICODE_GENERAL_CATEGORY_Z | 1,
+ /* Zl - Line_Separator */
+ UNICODE_GENERAL_CATEGORY_ZL = UNICODE_GENERAL_CATEGORY_Z | 2,
+ /* Zp - Paragraph_Separator */
+ UNICODE_GENERAL_CATEGORY_ZP = UNICODE_GENERAL_CATEGORY_Z | 3,
+
+ /* Cc - Control */
+ UNICODE_GENERAL_CATEGORY_CC = UNICODE_GENERAL_CATEGORY_C | 1,
+ /* Cf - Format */
+ UNICODE_GENERAL_CATEGORY_CF = UNICODE_GENERAL_CATEGORY_C | 2,
+ /* Cs - Surrogate */
+ UNICODE_GENERAL_CATEGORY_CS = UNICODE_GENERAL_CATEGORY_C | 3,
+ /* Co - Private_Use */
+ UNICODE_GENERAL_CATEGORY_CO = UNICODE_GENERAL_CATEGORY_C | 4,
+ /* Cn - Unassigned */
+ UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
+};
+
+/* Properties stored per code point (range). Instances are emitted into the
+   generated unicode_code_points[] table by unicode-ucd-compile.py. */
+struct unicode_code_point_data {
+ /* General_Category, stored as an enum unicode_general_category value */
+ uint8_t general_category; // Not yet used
+
+ /* Simple_Titlecase_Mapping code point; left at 0 when UnicodeData.txt
+    gives no mapping (the generator then omits the initializer) */
+ uint32_t simple_titlecase_mapping;
+};
+
+#endif
--- /dev/null
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+
+uint8_t unicode_general_category_from_string(const char *str)
+{
+ if (str == NULL || strlen(str) != 2)
+ return UNICODE_GENERAL_CATEGORY_INVALID;
+
+ switch (str[0]) {
+ case 'L':
+ switch (str[1]) {
+ /* Lu - Uppercase_Letter */
+ case 'u':
+ return UNICODE_GENERAL_CATEGORY_LU;
+ /* Ll - Lowercase_Letter */
+ case 'l':
+ return UNICODE_GENERAL_CATEGORY_LL;
+ /* Lt - Titlecase_Letter */
+ case 't':
+ return UNICODE_GENERAL_CATEGORY_LT;
+ /* LC - Cased_Letter: Lu | Ll | Lt */
+ case 'C':
+ return UNICODE_GENERAL_CATEGORY_LC;
+ /* Lm - Modifier_Letter */
+ case 'm':
+ return UNICODE_GENERAL_CATEGORY_LM;
+ /* Lo - Other_Letter */
+ case 'o':
+ return UNICODE_GENERAL_CATEGORY_LO;
+ /* L - Letter: Lu | Ll | Lt | Lm | Lo */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_L;
+ default:
+ break;
+ }
+ break;
+ case 'M':
+ switch (str[1]) {
+ /* Mn - Nonspacing_Mark */
+ case 'n':
+ return UNICODE_GENERAL_CATEGORY_MN;
+ /* Mc - Spacing_Mark */
+ case 'c':
+ return UNICODE_GENERAL_CATEGORY_MC;
+ /* Me - Enclosing_Mark */
+ case 'e':
+ return UNICODE_GENERAL_CATEGORY_ME;
+ /* M - Mark: Mn | Mc | Me */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_M;
+ default:
+ break;
+ }
+ break;
+ case 'N':
+ switch (str[1]) {
+ /* Nd - Decimal_Number */
+ case 'd':
+ return UNICODE_GENERAL_CATEGORY_ND;
+ /* Nl - Letter_Number */
+ case 'l':
+ return UNICODE_GENERAL_CATEGORY_NL;
+ /* No - Other_Number */
+ case 'o':
+ return UNICODE_GENERAL_CATEGORY_NO;
+ /* N - Number: Nd | Nl | No */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_N;
+ default:
+ break;
+ }
+ break;
+ case 'P':
+ switch (str[1]) {
+ /* Pc - Connector_Punctuation */
+ case 'c':
+ return UNICODE_GENERAL_CATEGORY_PC;
+ /* Pd - Dash_Punctuation */
+ case 'd':
+ return UNICODE_GENERAL_CATEGORY_PD;
+ /* Ps - Open_Punctuation */
+ case 's':
+ return UNICODE_GENERAL_CATEGORY_PS;
+ /* Pe - Close_Punctuation */
+ case 'e':
+ return UNICODE_GENERAL_CATEGORY_PE;
+ /* Pi - Initial_Punctuation */
+ case 'i':
+ return UNICODE_GENERAL_CATEGORY_PI;
+ /* Pf - Final_Punctuation */
+ case 'f':
+ return UNICODE_GENERAL_CATEGORY_PF;
+ /* Po - Other_Punctuation */
+ case 'o':
+ return UNICODE_GENERAL_CATEGORY_PO;
+ /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_P;
+ default:
+ break;
+ }
+ break;
+ case 'S':
+ switch (str[1]) {
+ /* Sm - Math_Symbol */
+ case 'm':
+ return UNICODE_GENERAL_CATEGORY_SM;
+ /* Sc - Currency_Symbol */
+ case 'c':
+ return UNICODE_GENERAL_CATEGORY_SC;
+ /* Sk - Modifier_Symbol */
+ case 'k':
+ return UNICODE_GENERAL_CATEGORY_SK;
+ /* So - Other_Symbol */
+ case 'o':
+ return UNICODE_GENERAL_CATEGORY_SO;
+ /* S - Symbol: Sm | Sc | Sk | So */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_S;
+ default:
+ break;
+ }
+ break;
+ case 'Z':
+ switch (str[1]) {
+ /* Zs - Space_Separator */
+ case 's':
+ return UNICODE_GENERAL_CATEGORY_ZS;
+ /* Zl - Line_Separator */
+ case 'l':
+ return UNICODE_GENERAL_CATEGORY_ZL;
+ /* Zp - Paragraph_Separator */
+ case 'p':
+ return UNICODE_GENERAL_CATEGORY_ZP;
+ /* Z - Separator: Zs | Zl | Zp */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_Z;
+ default:
+ break;
+ }
+ break;
+ case 'C':
+ switch (str[1]) {
+ /* Cc - Control */
+ case 'c':
+ return UNICODE_GENERAL_CATEGORY_CC;
+ /* Cf - Format */
+ case 'f':
+ return UNICODE_GENERAL_CATEGORY_CF;
+ /* Cs - Surrogate */
+ case 's':
+ return UNICODE_GENERAL_CATEGORY_CS;
+ /* Co - Private_Use */
+ case 'o':
+ return UNICODE_GENERAL_CATEGORY_CO;
+ /* Cn - Unassigned */
+ case 'n':
+ return UNICODE_GENERAL_CATEGORY_CN;
+ /* C - Other: Cc | Cf | Cs | Co | Cn */
+ case '\0':
+ return UNICODE_GENERAL_CATEGORY_C;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ return UNICODE_GENERAL_CATEGORY_INVALID;
+}
--- /dev/null
+#!/usr/bin/env python3
+# Copyright (c) 2025 Dovecot authors, see the included COPYING file
+
+import argparse
+import bisect
+import copy
+import re
+import sys
+from pathlib import Path
+
+
+# Names of the UCD files opened so far (appended by UCDFileOpen.__enter__);
+# listed in the banner of the generated files.
+source_files = []
+
+# All CodePointRange objects, plus parallel lists holding each range's first
+# and last code point. The parallel lists are searched with bisect and are
+# kept in step with ud_codepoints by CodePointRange.insert()/modify().
+ud_codepoints = []
+ud_codepoints_first = []
+ud_codepoints_last = []
+# cp_first -> CodePointRange, filled by create_cp_range_index()
+ud_codepoints_index = {}
+
+# Four-level lookup structure filled by update_cp_index_tables(); each level
+# is keyed by a further 8 bits of the code point:
+#   indexN          slot -> block id of the next level (index32: slot ->
+#                   position of the range in ud_codepoints)
+#   indexN_reused   block id -> number of slots pointing at that block
+#   indexN_offsets  block id -> code point at which the block starts
+#   indexN_blocks   next block id to allocate
+# Block ids start at 1 (index16) or 2 (index24/index32) because
+# write_tables_c() emits fixed blocks itself: block 0 is <invalid> in every
+# table and block 1 is <unassigned> in index24 and index32.
+ud_codepoints_index8 = {}
+ud_codepoints_index16 = {}
+ud_codepoints_index16_reused = {}
+ud_codepoints_index16_offsets = {}
+ud_codepoints_index24 = {}
+ud_codepoints_index24_reused = {}
+ud_codepoints_index24_offsets = {}
+ud_codepoints_index32 = {}
+ud_codepoints_index32_reused = {}
+ud_codepoints_index32_offsets = {}
+ud_codepoints_index16_blocks = 1
+ud_codepoints_index24_blocks = 2
+ud_codepoints_index32_blocks = 2
+
+
+class UCDFileOpen:
+ def __init__(self, filename):
+ self.filename = filename
+
+ def __enter__(self):
+ global ucd_dir
+ global source_files
+
+ self.fd = open(ucd_dir + "/" + self.filename, mode="r", encoding="utf-8")
+ source_files.append(self.filename)
+ return self
+
+ def __exit__(self, exception_type, exception_value, exception_traceback):
+ self.fd.close()
+
+ def __str__(self):
+ return self.filename
+
+
+class CodePointData:
+ def mergeFrom(self, data, default=False):
+ for attr in dir(data):
+ if callable(getattr(data, attr)):
+ continue
+ if attr.startswith("__"):
+ continue
+ if default and hasattr(self, attr):
+ continue
+ setattr(self, attr, getattr(data, attr))
+
+
+class CodePointRange:
+ def insert(self, n):
+ global ud_codepoints
+ global ud_codepoints_first
+ global ud_codepoints_last
+
+ ud_codepoints.insert(n, self)
+ ud_codepoints_first.insert(n, self.cp_first)
+ ud_codepoints_last.insert(n, self.cp_last)
+
+ def modify(self, n):
+ global ud_codepoints
+ global ud_codepoints_first
+ global ud_codepoints_last
+
+ ud_codepoints_first[n] = self.cp_first
+ ud_codepoints_last[n] = self.cp_last
+
+ def __new__(cls, cp_first, cp_last, data, default=False):
+ global ud_codepoints
+ global ud_codepoints_first
+ global ud_codepoints_last
+
+ cprn_first = None
+
+ if len(ud_codepoints) == 0:
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cp_last
+ cprn.data = data
+ cprn.insert(0)
+ return
+
+ idx_first = bisect.bisect_left(ud_codepoints_first, cp_first)
+ idx_last = bisect.bisect_right(ud_codepoints_last, cp_last)
+ rng_first = idx_first - 1
+ rng_last = idx_last + 1
+ if rng_last >= len(ud_codepoints):
+ rng_last = len(ud_codepoints) - 1
+
+ # Check existing ranges
+ nn = None
+ n = rng_first
+ while n <= rng_last:
+ cpr = ud_codepoints[n]
+ pos = n
+ n += 1
+
+ # No overlap with this range
+ if cp_last < cpr.cp_first or cp_first > cpr.cp_last:
+ continue
+ # Exact match
+ if cp_first == cpr.cp_first and cp_last == cpr.cp_last:
+ cpr.data.mergeFrom(data, default)
+ return cpr
+ # New range fully envelops existing
+ if cp_first <= cpr.cp_first and cp_last >= cpr.cp_last:
+ # Split off range before
+ if cp_first < cpr.cp_first:
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cpr.cp_first - 1
+ cprn.data = copy.deepcopy(data)
+ cprn.insert(pos)
+ rng_last += 1
+ if cprn_first is None:
+ cprn_first = cprn
+ # Merge with existing
+ cpr.data.mergeFrom(data, default)
+ # Split off range after
+ if cp_last > cpr.cp_last:
+ cp_first = cpr.cp_last + 1
+ nn = pos + 1
+ continue
+ break
+ # New range fully enveloped by existing
+ if cp_first > cpr.cp_first and cp_last < cpr.cp_last:
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_last + 1
+ cprn.cp_last = cpr.cp_last
+ cprn.data = cpr.data
+ cprn.insert(pos + 1)
+ rng_last += 1
+ cpr.cp_last = cp_first - 1
+ cpr.modify(pos)
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cp_last
+ cprn.data = copy.deepcopy(cpr.data)
+ cprn.data.mergeFrom(data, default)
+ cprn.insert(pos + 1)
+ rng_last += 1
+ return cprn
+ # New range aligns with beginning of existing
+ if cp_first == cpr.cp_first and cp_last < cpr.cp_last:
+ cpr.cp_first = cp_last + 1
+ cpr.modify(pos)
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cp_last
+ cprn.data = copy.deepcopy(cpr.data)
+ cprn.data.mergeFrom(data, default)
+ cprn.insert(pos)
+ rng_last += 1
+ return cprn
+ # New range aligns with end of existing
+ if cp_first > cpr.cp_first and cp_last == cpr.cp_last:
+ cpr.cp_last = cp_first - 1
+ cpr.modify(pos)
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cp_last
+ cprn.data = copy.deepcopy(cpr.data)
+ cprn.data.mergeFrom(data, default)
+ cprn.insert(pos + 1)
+ rng_last += 1
+ return cprn
+ # New range crosses the beginning of existing
+ if cp_first < cpr.cp_first and cp_last >= cpr.cp_first:
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cpr.cp_first - 1
+ cprn.data = data
+ cprn.insert(pos)
+ rng_last += 1
+ cprn = super().__new__(cls)
+ cprn.cp_first = cpr.cp_first
+ cprn.cp_last = cp_last
+ cprn.data = copy.deepcopy(cpr.data)
+ cprn.data.mergeFrom(data, default)
+ cprn.insert(pos + 1)
+ rng_last += 1
+ cpr.cp_first = cp_last + 1
+ cpr.modify(pos + 2)
+ return cprn
+ # New range crosses the end of existing
+ if cp_first <= cpr.cp_last and cp_last > cpr.cp_last:
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cpr.cp_last
+ cprn.data = copy.deepcopy(cpr.data)
+ cprn.data.mergeFrom(data, default)
+ cprn.insert(pos)
+ rng_last += 1
+ if cprn_first is None:
+ cprn_first = cprn
+ tmp = cp_first
+ cp_first = cpr.cp_last + 1
+ cpr.cp_last = tmp - 1
+ cpr.modify(pos + 1)
+ nn = pos + 1
+ continue
+
+ cprn = super().__new__(cls)
+ cprn.cp_first = cp_first
+ cprn.cp_last = cp_last
+ cprn.data = data
+ if nn is None:
+ cprn.insert(idx_first)
+ else:
+ cprn.insert(nn)
+ if cprn_first is None:
+ cprn_first = cprn
+
+ return cprn_first
+
+
+def die(message):
+ module_filename = Path(__file__).name
+ print(f"{module_filename}: {message}", file=sys.stderr)
+ sys.exit(1)
+
+
+def parse_cp_range(column):
+ rng_hex = column.strip()
+ if len(rng_hex) == 0:
+ return None
+ rng = rng_hex.split("..")
+
+ cp_hex = rng[0].strip()
+ cp_first = int(cp_hex, 16)
+ cp_last = cp_first
+
+ if len(rng) > 1:
+ cp_hex = rng[1].strip()
+ cp_last = int(cp_hex, 16)
+
+ return (cp_first, cp_last)
+
+
+# Read UnicodeData.txt from the UCD directory and register one CodePointRange
+# per code point, or per "<..., First>" / "<..., Last>" pair. Aborts via
+# die() when a line has fewer than 15 ';'-separated columns.
+def read_ucd_files():
+ # NOTE(review): neither of these globals is used in this function
+ global ud_decomposition_type_names
+ global ud_composition_exclusions
+
+ # UnicodeData.txt
+ with UCDFileOpen("UnicodeData.txt") as ucd:
+ # Start of a pending "<..., First>" range, or None
+ cp_range_first = None
+ line_num = 0
+ for line in ucd.fd:
+ line_num = line_num + 1
+ # Drop everything from the first '#' on (comment)
+ data = line.split("#")
+ line = data[0].strip()
+ if len(line) == 0:
+ continue
+
+ cols = line.split(";")
+ if len(cols) < 15:
+ die(f"{ucd}:{line_num}: Missing columns")
+
+ # (0) Code point in hex
+
+ cp_first = cp_last = int(cols[0].strip(), 16)
+
+ # (1) Name
+
+ cp_name = cols[1].strip()
+
+ # A "First" line only records where the range starts; the "Last" line
+ # then produces a single range named "<...>" without the marker.
+ x = re.search("<([^>]*), (First|Last)>", cp_name)
+ if x:
+ if x.group(2) == "First":
+ cp_range_first = cp_first
+ continue
+ if x.group(2) == "Last" and cp_range_first is not None:
+ cp_first = cp_range_first
+ cp_name = "<%s>" % x.group(1)
+ cp_range_first = None
+
+ cpd = CodePointData()
+ cpd.name = cp_name
+
+ # (2) General_Category
+
+ cpd.general_category = cols[2].strip()
+
+ # (3) Canonical_Combining_Class
+
+ ccc = cols[3].strip()
+ if ccc != "":
+ cpd.canonical_combining_class = int(ccc)
+
+ # (5) Decomposition_Type, Decomposition_Mapping
+
+ # Optional "<type> " prefix followed by space-separated hex code points;
+ # an empty column does not match and sets nothing.
+ x = re.search("(<([^>]*)> )?(.+)", cols[5].strip())
+ if x:
+ if x.group(2) is not None:
+ cpd.decomposition_type = x.group(2)
+ dcs_txt = x.group(3).split(" ")
+ dcs = []
+ for dc_txt in dcs_txt:
+ dcs.append(int(dc_txt.strip(), 16))
+ cpd.decomposition_first = dcs
+
+ # (12) Simple_Uppercase_Mapping
+
+ code = cols[12].strip()
+ if code != "":
+ cpd.simple_uppercase_mapping = int(code, 16)
+
+ # (13) Simple_Lowercase_Mapping
+
+ code = cols[13].strip()
+ if code != "":
+ cpd.simple_lowercase_mapping = int(code, 16)
+
+ # (14) Simple_Titlecase_Mapping
+
+ code = cols[14].strip()
+ if code != "":
+ cpd.simple_titlecase_mapping = int(code, 16)
+
+ # Add range
+ CodePointRange(cp_first, cp_last, cpd)
+
+
+def create_cp_range_index():
+ global ud_codepoints
+ global ud_codepoints_index
+
+ for cpr in ud_codepoints:
+ ud_codepoints_index[cpr.cp_first] = cpr
+
+
+# Register every code point in [cp_first, cp_last] in the four-level index,
+# so that looking it up ends at cp_pos (the range's position in
+# ud_codepoints). Each level is keyed by 8 bits of the code point: bits
+# 24..31 (index8), 16..23 (index16), 8..15 (index24) and 0..7 (index32).
+# At each level a slot that is already assigned is followed; otherwise the
+# slot either reuses the block chosen earlier in this call or gets a newly
+# allocated block.
+def update_cp_index_tables(cp_first, cp_last, cp_pos):
+ global ud_codepoints_index8
+ global ud_codepoints_index16
+ global ud_codepoints_index16_reused
+ global ud_codepoints_index16_offsets
+ global ud_codepoints_index16_blocks
+ global ud_codepoints_index24
+ global ud_codepoints_index24_reused
+ global ud_codepoints_index24_offsets
+ global ud_codepoints_index24_blocks
+ global ud_codepoints_index32
+ global ud_codepoints_index32_reused
+ global ud_codepoints_index32_offsets
+ global ud_codepoints_index32_blocks
+
+ cp_range = range(cp_first, cp_last + 1)
+
+ # Block currently in use at each level, and whether this call has not yet
+ # allocated a block at that level
+ id16_block = None
+ id24_block = None
+ id32_block = None
+ first16 = True
+ first24 = True
+ first32 = True
+
+ # last_rcp is not updated inside the loop; it stays equal to cp_last
+ last_rcp = cp_last
+ for rcp in cp_range:
+ # Index for first 8 bits of code point
+ id8_idx = rcp >> 24
+ if id8_idx in ud_codepoints_index8:
+ id16_block = ud_codepoints_index8[id8_idx]
+ elif (
+ id16_block is not None
+ and not first16
+ and ((last_rcp & 0xFFFFFF) == 0xFFFFFF or (rcp >> 24) != (last_rcp >> 24))
+ ):
+ # Reuse the block allocated earlier in this call: the range either ends
+ # on a block boundary or rcp is not yet in the range's final block
+ ud_codepoints_index8[id8_idx] = id16_block
+ if id16_block not in ud_codepoints_index16_reused:
+ ud_codepoints_index16_reused[id16_block] = 1
+ ud_codepoints_index16_reused[id16_block] += 1
+ else:
+ # Allocate a new block and remember where it starts
+ first16 = False
+ id16_block = ud_codepoints_index16_blocks
+ ud_codepoints_index8[id8_idx] = id16_block
+ ud_codepoints_index16_offsets[id16_block] = rcp & (((1 << 8) - 1) << 24)
+ ud_codepoints_index16_blocks += 1
+
+ # Index for first 16 bits of code point
+ id16_idx = (id16_block << 8) + ((rcp >> 16) & 0xFF)
+ if id16_idx in ud_codepoints_index16:
+ id24_block = ud_codepoints_index16[id16_idx]
+ elif (
+ id24_block is not None
+ and not first24
+ and ((last_rcp & 0xFFFF) == 0xFFFF or (rcp >> 16) != (last_rcp >> 16))
+ ):
+ ud_codepoints_index16[id16_idx] = id24_block
+ if id24_block not in ud_codepoints_index24_reused:
+ ud_codepoints_index24_reused[id24_block] = 1
+ ud_codepoints_index24_reused[id24_block] += 1
+ else:
+ first24 = False
+ id24_block = ud_codepoints_index24_blocks
+ ud_codepoints_index16[id16_idx] = id24_block
+ ud_codepoints_index24_offsets[id24_block] = rcp & (((1 << 16) - 1) << 16)
+ ud_codepoints_index24_blocks += 1
+
+ # Index for first 24 bits of code point
+ id24_idx = (id24_block << 8) + ((rcp >> 8) & 0xFF)
+ if id24_idx in ud_codepoints_index24:
+ id32_block = ud_codepoints_index24[id24_idx]
+ elif (
+ id32_block is not None
+ and not first32
+ and ((last_rcp & 0xFF) == 0xFF or (rcp >> 8) != (last_rcp >> 8))
+ ):
+ ud_codepoints_index24[id24_idx] = id32_block
+ if id32_block not in ud_codepoints_index32_reused:
+ ud_codepoints_index32_reused[id32_block] = 1
+ ud_codepoints_index32_reused[id32_block] += 1
+ else:
+ first32 = False
+ id32_block = ud_codepoints_index32_blocks
+ ud_codepoints_index24[id24_idx] = id32_block
+ ud_codepoints_index32_offsets[id32_block] = rcp & (((1 << 24) - 1) << 8)
+ ud_codepoints_index32_blocks += 1
+
+ # Index for first 32 bits of code point
+ id32_idx = (id32_block << 8) + (rcp & 0xFF)
+ ud_codepoints_index32[id32_idx] = cp_pos
+
+
+def create_cp_index_tables():
+ global ud_codepoints
+
+ # Create code point index
+ for n in range(0, len(ud_codepoints)):
+ cpr = ud_codepoints[n]
+ cp_first = cpr.cp_first
+ cp_last = cpr.cp_last
+
+ update_cp_index_tables(cp_first, cp_last, n)
+
+
+def get_general_category_def(gc):
+ return "UNICODE_GENERAL_CATEGORY_%s" % gc.upper()
+
+
+def decomposition_type_def(dt):
+ return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper()
+
+
+# Print code_list to stdout as comma-separated "0x%05x" literals, starting
+# with a tab and without a comma or newline after the last element. A line
+# break (plus tab) is emitted after every 8th element, and "// INDEX n"
+# markers are printed for multiples of 10.
+# NOTE(review): the nesting of the two modulo checks and the final else is
+# not recoverable from this listing - confirm against the real file.
+def print_list(code_list):
+ last = len(code_list) - 1
+ n = 0
+ print("\t", end="")
+ for code in code_list:
+ print("0x%05x" % code, end="")
+ # No separator after the final element
+ if n == last:
+ break
+ print(",", end="")
+
+ n += 1
+ if (n % 8) == 0:
+ print("")
+ print("\t", end="")
+ if (n % 10) == 0:
+ print("// INDEX %u" % n)
+ print("\t", end="")
+ else:
+ print(" ", end="")
+
+
+def print_top_message():
+ global ucd_dir
+ global source_files
+
+ print("/* This file is automatically generated by unicode-ucd-compile.py from:")
+ for sf in source_files:
+ print(" %s/%s" % (ucd_dir, sf))
+ print(" */")
+ print("")
+
+
+def write_tables_h():
+ global output_dir
+ global ud_decomposition_max_length
+ global ud_compositions_max_per_starter
+
+ orig_stdout = sys.stdout
+
+ with open(output_dir + "/unicode-data-tables.h", mode="w", encoding="utf-8") as fd:
+ sys.stdout = fd
+
+ print("#ifndef UNICODE_DATA_TABLES_H")
+ print("#define UNICODE_DATA_TABLES_H")
+ print("")
+ print_top_message()
+ print('#include "unicode-data-static.h"')
+ print("")
+ print("extern const struct unicode_code_point_data unicode_code_points[];")
+ print("")
+ print("extern const uint8_t unicode_code_points_index8[];")
+ print("extern const uint8_t unicode_code_points_index16[];")
+ print("extern const uint16_t unicode_code_points_index24[];")
+ print("extern const uint16_t unicode_code_points_index32[];")
+ print("")
+ print("#endif")
+
+ sys.stdout = orig_stdout
+
+
+# Write <output_dir>/unicode-data-tables.c: the unicode_code_points[] table
+# followed by the index8/index16/index24/index32 lookup tables. All output
+# is produced with print() while sys.stdout is pointed at the file.
+# Fixed entries are emitted ahead of the generated ones: code point entry 0
+# and index block 0 are <invalid>, code point entry 1 and (in index24 and
+# index32) block 1 are <unassigned>.
+# NOTE(review): sys.stdout is only restored on the success path.
+def write_tables_c():
+ global output_dir
+ global ud_codepoints
+ # NOTE(review): the next four globals are not used in this function
+ global ud_decompositions
+ global ud_compositions
+ global ud_composition_primaries
+ global ud_case_mappings
+
+ orig_stdout = sys.stdout
+
+ with open(output_dir + "/unicode-data-tables.c", mode="w", encoding="utf-8") as fd:
+ sys.stdout = fd
+ print_top_message()
+
+ print('#include "lib.h"')
+ print('#include "unicode-data-tables.h"')
+ print("")
+ print("const struct unicode_code_point_data unicode_code_points[] = {")
+ print("\t{ // [0000] <invalid>")
+ print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_INVALID,")
+ print("\t},")
+ print("\t{ // [0001] <unassigned>")
+ print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_CN,")
+ print("\t},")
+ # Generated entries follow the two fixed ones, hence numbering from 2
+ n = 2
+ for cpr in ud_codepoints:
+ cpd = cpr.data
+
+ if cpr.cp_last > cpr.cp_first:
+ range_str = "U+%04X..U+%04X" % (cpr.cp_first, cpr.cp_last)
+ else:
+ range_str = "U+%04X" % (cpr.cp_first)
+ print("\t{ // [%04X] %s: %s" % (n, range_str, cpd.name))
+ n = n + 1
+
+ print(
+ "\t\t.general_category = %s,"
+ % get_general_category_def(cpd.general_category)
+ )
+ # Omitted when there is no mapping; the C field is then zero-initialized
+ if hasattr(cpd, "simple_titlecase_mapping"):
+ print(
+ "\t\t.simple_titlecase_mapping = 0x%04X,"
+ % cpd.simple_titlecase_mapping
+ )
+ print("\t},")
+ print("};")
+ print("")
+ # Code points index8
+ print("const uint8_t unicode_code_points_index8[] = {")
+ print("\t", end="")
+ for n in range(0, 256):
+ if n in ud_codepoints_index8:
+ print("0x%02x" % ud_codepoints_index8[n], end="")
+ else:
+ print("0x00", end="")
+ if n == 255:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("\n\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("};")
+ print("")
+ # Code points index16
+ print("const uint8_t unicode_code_points_index16[] = {")
+ print("\t// Block 0x00: <invalid>")
+ print("\t", end="")
+ last = (1 << 8) - 1
+ for n in range(0 << 8, last + 1):
+ print("0x00", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("\n\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("\t", end="")
+ last = (ud_codepoints_index16_blocks << 8) - 1
+ for n in range((1 << 8), last + 1):
+ # First slot of a block: print the block header comment
+ if (n & ((1 << 8) - 1)) == 0:
+ blk_id = n >> 8
+ blk_offset = ud_codepoints_index16_offsets[blk_id]
+ blk_end = blk_offset + (1 << 24) - 1
+ print(
+ "// Block 0x%02X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end),
+ end="",
+ )
+ if blk_id in ud_codepoints_index16_reused:
+ print(
+ " (used %u times)" % ud_codepoints_index16_reused[blk_id], end=""
+ )
+ print("")
+ print("\t", end="")
+ # Unassigned slots: <invalid> (0) beyond U+10FFFF, else <unassigned> (1)
+ if n in ud_codepoints_index16:
+ print("0x%02x" % ud_codepoints_index16[n], end="")
+ elif ud_codepoints_index16_offsets[n >> 8] + ((n & 0xFF) << 16) > 0x10FFFF:
+ print("0x00", end="")
+ else:
+ print("0x01", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print("")
+ print("};")
+ print("")
+ # Code points index24
+ print("const uint16_t unicode_code_points_index24[] = {")
+ print("\t// Block 0x00: <invalid>")
+ print("\t", end="")
+ last = (1 << 8) - 1
+ for n in range((0 << 8), last + 1):
+ print("0x000", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("\t// Block 0x01: <unassigned>")
+ print("\t", end="")
+ last = (2 << 8) - 1
+ for n in range((1 << 8), last + 1):
+ print("0x001", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("\t", end="")
+ last = (ud_codepoints_index24_blocks << 8) - 1
+ for n in range((2 << 8), last + 1):
+ if (n & ((1 << 8) - 1)) == 0:
+ blk_id = n >> 8
+ blk_offset = ud_codepoints_index24_offsets[blk_id]
+ blk_end = blk_offset + (1 << 16) - 1
+ print(
+ "// Block 0x%04X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end),
+ end="",
+ )
+ if blk_id in ud_codepoints_index24_reused:
+ print(
+ " (used %u times)" % ud_codepoints_index24_reused[blk_id], end=""
+ )
+ print("")
+ print("\t", end="")
+ if n in ud_codepoints_index24:
+ print("0x%03x" % ud_codepoints_index24[n], end="")
+ else:
+ print("0x001", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("};")
+ print("")
+ # Code points index32
+ print("const uint16_t unicode_code_points_index32[] = {")
+ print("\t// Block 0x000: <invalid>")
+ print("\t", end="")
+ last = (1 << 8) - 1
+ for n in range(0 << 8, last + 1):
+ print("0x0000", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("\t// Block 0x001: <unassigned>")
+ print("\t", end="")
+ last = (2 << 8) - 1
+ for n in range(1 << 8, last + 1):
+ print("0x0001", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("\t", end="")
+ last = (ud_codepoints_index32_blocks << 8) - 1
+ for n in range(2 << 8, last + 1):
+ if (n & ((1 << 8) - 1)) == 0:
+ blk_id = n >> 8
+ blk_offset = ud_codepoints_index32_offsets[blk_id]
+ blk_end = blk_offset + (1 << 8) - 1
+ print(
+ "// Block 0x%04X: U+%06X - U+%06X" % (blk_id, blk_offset, blk_end),
+ end="",
+ )
+ if blk_id in ud_codepoints_index32_reused:
+ print(
+ " (used %u times)" % ud_codepoints_index32_reused[blk_id], end=""
+ )
+ print("")
+ print("\t", end="")
+ # Stored positions are offset by 2 to skip the two fixed table entries
+ if n in ud_codepoints_index32:
+ print("0x%04x" % (ud_codepoints_index32[n] + 2), end="")
+ else:
+ print("0x0001", end="")
+ if n == last:
+ break
+ print(",", end="")
+
+ if ((n + 1) % 8) == 0:
+ print("")
+ print("\t", end="")
+ else:
+ print(" ", end="")
+ print(",")
+ print("};")
+
+ sys.stdout = orig_stdout
+
+
+def main():
+ global ucd_dir
+ global output_dir
+ global source_files
+
+ """Entry point."""
+ parser = argparse.ArgumentParser(
+ prog="unicode-ucd-compile.py",
+ description="Compile the Unicode Character Database files into C code",
+ )
+ parser.add_argument(
+ "ucd-dir",
+ type=str,
+ help="Directory containing the UCD files",
+ )
+ parser.add_argument(
+ "output-dir",
+ type=str,
+ help="Output directory where the C header and source files are written",
+ )
+ args = parser.parse_args()
+
+ ucd_dir = getattr(args, "ucd-dir")
+ output_dir = getattr(args, "output-dir")
+
+ read_ucd_files()
+ source_files.sort()
+
+ create_cp_range_index()
+
+ create_cp_index_tables()
+
+ write_tables_h()
+ write_tables_c()
+
+
+# Run the compiler only when executed as a script, not when imported
+if __name__ == "__main__":
+ main()