From: Stephan Bosch Date: Fri, 21 Mar 2025 03:26:50 +0000 (+0100) Subject: lib: Start new Unicode Character Database implementation X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e19cfbf8f8c346262bb39fe253d63c4763c350f6;p=thirdparty%2Fdovecot%2Fcore.git lib: Start new Unicode Character Database implementation It uses a pre-compiled trie structure and will in later commits feature full support for the data necessary for Unicode normalization. Stuff needed for lib-language can be migrated here as well. --- diff --git a/.gitignore b/.gitignore index c1158e9ebf..84e994bc96 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,10 @@ src/lib/event-filter-lexer.c src/lib/event-filter-parser.c src/lib/event-filter-parser.h src/lib/unicodemap.c +src/lib/unicode-data-tables.c +src/lib/unicode-data-tables.h +src/lib/unicode-data-types.c +src/lib/unicode-data-types.h src/lib-compression/bench-compression src/lib-language/PropList.txt src/lib-language/WordBreakProperty.txt diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index d0e2715984..1d98248ecf 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -4,6 +4,8 @@ AM_CPPFLAGS = \ noinst_LTLIBRARIES = liblib.la BUILT_SOURCES = $(srcdir)/unicodemap.c \ + $(srcdir)/unicode-data-tables.c \ + $(srcdir)/unicode-data-tables.h \ event-filter-lexer.c \ event-filter-parser.c \ event-filter-parser.h @@ -11,11 +13,17 @@ BUILT_SOURCES = $(srcdir)/unicodemap.c \ UCD_URL = https://dovecot.org/res UCD_DIR = $(srcdir)/ucd UCD_FILES = \ + $(UCD_DIR)/DerivedCoreProperties.txt \ $(UCD_DIR)/UnicodeData.txt -EXTRA_DIST = unicodemap.c unicodemap.pl $(UCD_FILES) -EXTRA_CLEAN = unicodemap.c - +EXTRA_DIST = \ + unicodemap.c \ + unicode-data-tables.c \ + unicode-data-tables.h \ + unicodemap.pl \ + unicode-ucd-compile.py \ + $(UCD_FILES) +EXTRA_CLEAN = unicodemap.c unicode-data-tables.c # Squelch autoconf error about using .[ly] sources but not defining $(LEX) # and $(YACC). Using false here avoids accidental use. 
@@ -37,11 +45,16 @@ YACC=/bin/false # dependency, anything including the header will race the bison process. event-filter-parser.h: event-filter-parser.c +$(UCD_DIR)/DerivedCoreProperties.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt $(UCD_DIR)/UnicodeData.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt $(srcdir)/unicodemap.c: $(srcdir)/unicodemap.pl $(UCD_DIR)/UnicodeData.txt $(AM_V_GEN)$(PERL) $(srcdir)/unicodemap.pl < $(UCD_DIR)/UnicodeData.txt > $@ +$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h &: \ + $(srcdir)/unicode-ucd-compile.py $(UCD_FILES) + $(AM_V_GEN)$(PYTHON) $(srcdir)/unicode-ucd-compile.py $(UCD_DIR) $(srcdir) liblib_la_LIBADD = $(LIBUNWIND_LIBS) liblib_la_SOURCES = \ @@ -200,6 +213,8 @@ liblib_la_SOURCES = \ unlink-directory.c \ unlink-old-files.c \ unichar.c \ + unicode-data-tables.c \ + unicode-data.c \ uri-util.c \ utc-offset.c \ utc-mktime.c \ @@ -360,6 +375,9 @@ headers = \ unlink-directory.h \ unlink-old-files.h \ unichar.h \ + unicode-data-static.h \ + unicode-data-tables.h \ + unicode-data.h \ uri-util.h \ utc-offset.h \ utc-mktime.h \ @@ -370,7 +388,8 @@ test_programs = test-lib noinst_PROGRAMS = $(test_programs) test_lib_CPPFLAGS = \ - -I$(top_srcdir)/src/lib-test + -I$(top_srcdir)/src/lib-test \ + -DUCD_DIR=\"$(UCD_DIR)\" test_libs = \ ../lib-test/libtest.la \ @@ -467,6 +486,7 @@ test_lib_SOURCES = \ test-str-table.c \ test-time-util.c \ test-unichar.c \ + test-unicode-data.c \ test-utc-mktime.c \ test-uri.c \ test-wildcard-match.c @@ -478,7 +498,7 @@ test_headers = \ test_lib_LDADD = $(test_libs) -lm test_lib_DEPENDENCIES = $(test_libs) -check-local: +check-local: $(UCD_FILES) for bin in $(test_programs); do \ if ! 
$(RUN_TEST) ./$$bin; then exit 1; fi; \ done diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc index 2fceca2f9b..d3a1f33669 100644 --- a/src/lib/test-lib.inc +++ b/src/lib/test-lib.inc @@ -107,6 +107,7 @@ TEST(test_str_sanitize) TEST(test_str_table) TEST(test_time_util) TEST(test_unichar) +TEST(test_unicode_data) TEST(test_uri) TEST(test_utc_mktime) TEST(test_wildcard_match) diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c new file mode 100644 index 0000000000..b01b57b658 --- /dev/null +++ b/src/lib/test-unicode-data.c @@ -0,0 +1,176 @@ +/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ + +#include "test-lib.h" +#include "strnum.h" +#include "str.h" +#include "unichar.h" +#include "istream.h" +#include "unicode-data.h" + +#include <fcntl.h> + +#define UCD_UNICODE_DATA_TXT "UnicodeData.txt" + +/* Verifies one UnicodeData.txt line against the compiled tables. A + "<..., First>" line only records the start of a range; the matching + "<..., Last>" line (like any ordinary line) is then checked for every + code point from the recorded start up to its own code point. */ +static void test_unicode_data_line(const char *line, unsigned int line_num) +{ + static uint32_t cp_first = 0; + + const char *const *columns = t_strsplit(line, ";"); + if (str_array_length(columns) < 15) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u", + UCD_UNICODE_DATA_TXT, line_num)); + return; + } + + const char *cp_hex = columns[0]; + uint32_t cp; + + if (str_to_uint32_hex(cp_hex, &cp) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad code point", + UCD_UNICODE_DATA_TXT, line_num)); + return; + } + + /* Parse Name */ + + const char *cp_name = columns[1]; + size_t cp_name_len = strlen(cp_name); + const char *p; + + if (cp_name[0] == '<' && cp_name[cp_name_len - 1] == '>') { + p = strchr(cp_name + 1, ','); + if (p != NULL) { + if (strcmp(p, ", First>") == 0) { + cp_first = cp; + return; + } else if (strcmp(p, ", Last>") != 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad code point range: %s", + UCD_UNICODE_DATA_TXT, line_num, cp_name)); + return; + } + } + } + + /* Parse General_Category */ + + uint8_t general_category = +
(uint8_t)unicode_general_category_from_string(columns[2]); + if (general_category == UNICODE_GENERAL_CATEGORY_INVALID) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad General_Category for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[2])); + return; + } + test_assert(!unicode_general_category_is_group(general_category)); + + /* Parse Simple_*case_Mapping */ + + uint32_t simple_uppercase_mapping = 0; + uint32_t simple_lowercase_mapping = 0; + uint32_t simple_titlecase_mapping = 0; + + if (*columns[12] != '\0' && + str_to_uint32_hex(columns[12], &simple_uppercase_mapping) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad Simple_Uppercase_Mapping for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[12])); + return; + } + if (*columns[13] != '\0' && + str_to_uint32_hex(columns[13], &simple_lowercase_mapping) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad Simple_Lowercase_Mapping for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[13])); + return; + } + if (*columns[14] != '\0' && + str_to_uint32_hex(columns[14], &simple_titlecase_mapping) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad Simple_Titlecase_Mapping for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[14])); + return; + } + + /* Check data */ + + uint32_t cp_last = cp; + + if (cp_first == 0) + cp_first = cp; + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + test_assert_idx( + cp_data->general_category == general_category, cp); + + test_assert_idx( + cp_data->simple_titlecase_mapping == simple_titlecase_mapping, + cp); + } + + cp_first = 0; +} + +/* Opens UCD_DIR/<filename> and passes each line, with any '#' comment + stripped and empty lines skipped, to test_line() together with its + 1-based line number. Stops early once the test has failed. */ +static void +test_ucd_file(const char *filename, + void (*test_line)(const char *line, unsigned int line_num)) +{ + const char *file_path = t_strconcat(UCD_DIR, "/",
filename, NULL); + struct istream *input; + int fd; + + fd = open(file_path, O_RDONLY); + if (fd < 0) + i_fatal("Failed to open '%s': %m", file_path); + + test_begin(t_strdup_printf("unicode_data - %s", filename)); + + input = i_stream_create_fd_autoclose(&fd, 1024); + + unsigned int line_num = 0; + + while (!test_has_failed()) { + char *line = i_stream_read_next_line(input); + + if (line == NULL) + break; + line_num++; + /* Cut the line at the first '#': comments and comment-only lines are ignored */ + char *comment = strchr(line, '#'); + + if (comment != NULL) + *comment = '\0'; + if (*line == '\0') + continue; + + T_BEGIN { + test_line(line, line_num); + } T_END; + } + + i_stream_destroy(&input); + test_end(); +} + +void test_unicode_data(void) +{ + /* Check that UCD data files match with what is compiled. */ + test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line); +} diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h new file mode 100644 index 0000000000..0258548985 --- /dev/null +++ b/src/lib/unicode-data-static.h @@ -0,0 +1,102 @@ +#ifndef UNICODE_DATA_STATIC_H +#define UNICODE_DATA_STATIC_H + +/* UAX #44, Section 5.7.1: General Category Values. The upper four bits of a value identify its group and the lower four bits the category within that group; values with all lower bits zero denote a group. + */ +enum unicode_general_category { + UNICODE_GENERAL_CATEGORY_INVALID = 0, + + /* LC - Cased_Letter: Lu | Ll | Lt */ + UNICODE_GENERAL_CATEGORY_LC = (1 << 4), + /* L - Letter: Lu | Ll | Lt | Lm | Lo */ + UNICODE_GENERAL_CATEGORY_L = (1 << 5) | UNICODE_GENERAL_CATEGORY_LC, + /* M - Mark: Mn | Mc | Me */ + UNICODE_GENERAL_CATEGORY_M = (2 << 5), + /* N - Number: Nd | Nl | No */ + UNICODE_GENERAL_CATEGORY_N = (3 << 5), + /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */ + UNICODE_GENERAL_CATEGORY_P = (4 << 5), + /* S - Symbol: Sm | Sc | Sk | So */ + UNICODE_GENERAL_CATEGORY_S = (5 << 5), + /* Z - Separator: Zs | Zl | Zp */ + UNICODE_GENERAL_CATEGORY_Z = (6 << 5), + /* C - Other: Cc | Cf | Cs | Co | Cn */ + UNICODE_GENERAL_CATEGORY_C = (7 << 5), + /* Mask that extracts the group bits from a category value */ + UNICODE_GENERAL_CATEGORY_GROUP_MASK = (0xf0), + + /* Lu - Uppercase_Letter */ + UNICODE_GENERAL_CATEGORY_LU =
UNICODE_GENERAL_CATEGORY_LC | 1, + /* Ll - Lowercase_Letter */ + UNICODE_GENERAL_CATEGORY_LL = UNICODE_GENERAL_CATEGORY_LC | 2, + /* Lt - Titlecase_Letter */ + UNICODE_GENERAL_CATEGORY_LT = UNICODE_GENERAL_CATEGORY_LC | 3, + /* Lm - Modifier_Letter (a Letter but not a Cased_Letter, hence based on L rather than LC) */ + UNICODE_GENERAL_CATEGORY_LM = UNICODE_GENERAL_CATEGORY_L | 4, + /* Lo - Other_Letter */ + UNICODE_GENERAL_CATEGORY_LO = UNICODE_GENERAL_CATEGORY_L | 5, + + /* Mn - Nonspacing_Mark */ + UNICODE_GENERAL_CATEGORY_MN = UNICODE_GENERAL_CATEGORY_M | 1, + /* Mc - Spacing_Mark */ + UNICODE_GENERAL_CATEGORY_MC = UNICODE_GENERAL_CATEGORY_M | 2, + /* Me - Enclosing_Mark */ + UNICODE_GENERAL_CATEGORY_ME = UNICODE_GENERAL_CATEGORY_M | 3, + + /* Nd - Decimal_Number */ + UNICODE_GENERAL_CATEGORY_ND = UNICODE_GENERAL_CATEGORY_N | 1, + /* Nl - Letter_Number */ + UNICODE_GENERAL_CATEGORY_NL = UNICODE_GENERAL_CATEGORY_N | 2, + /* No - Other_Number */ + UNICODE_GENERAL_CATEGORY_NO = UNICODE_GENERAL_CATEGORY_N | 3, + + /* Pc - Connector_Punctuation */ + UNICODE_GENERAL_CATEGORY_PC = UNICODE_GENERAL_CATEGORY_P | 1, + /* Pd - Dash_Punctuation */ + UNICODE_GENERAL_CATEGORY_PD = UNICODE_GENERAL_CATEGORY_P | 2, + /* Ps - Open_Punctuation */ + UNICODE_GENERAL_CATEGORY_PS = UNICODE_GENERAL_CATEGORY_P | 3, + /* Pe - Close_Punctuation */ + UNICODE_GENERAL_CATEGORY_PE = UNICODE_GENERAL_CATEGORY_P | 4, + /* Pi - Initial_Punctuation */ + UNICODE_GENERAL_CATEGORY_PI = UNICODE_GENERAL_CATEGORY_P | 5, + /* Pf - Final_Punctuation */ + UNICODE_GENERAL_CATEGORY_PF = UNICODE_GENERAL_CATEGORY_P | 6, + /* Po - Other_Punctuation */ + UNICODE_GENERAL_CATEGORY_PO = UNICODE_GENERAL_CATEGORY_P | 7, + + /* Sm - Math_Symbol */ + UNICODE_GENERAL_CATEGORY_SM = UNICODE_GENERAL_CATEGORY_S | 1, + /* Sc - Currency_Symbol */ + UNICODE_GENERAL_CATEGORY_SC = UNICODE_GENERAL_CATEGORY_S | 2, + /* Sk - Modifier_Symbol */ + UNICODE_GENERAL_CATEGORY_SK = UNICODE_GENERAL_CATEGORY_S | 3, + /* So - Other_Symbol */ + UNICODE_GENERAL_CATEGORY_SO = UNICODE_GENERAL_CATEGORY_S | 4, + +
/* Zs - Space_Separator */ + UNICODE_GENERAL_CATEGORY_ZS = UNICODE_GENERAL_CATEGORY_Z | 1, + /* Zl - Line_Separator */ + UNICODE_GENERAL_CATEGORY_ZL = UNICODE_GENERAL_CATEGORY_Z | 2, + /* Zp - Paragraph_Separator */ + UNICODE_GENERAL_CATEGORY_ZP = UNICODE_GENERAL_CATEGORY_Z | 3, + + /* Cc - Control */ + UNICODE_GENERAL_CATEGORY_CC = UNICODE_GENERAL_CATEGORY_C | 1, + /* Cf - Format */ + UNICODE_GENERAL_CATEGORY_CF = UNICODE_GENERAL_CATEGORY_C | 2, + /* Cs - Surrogate */ + UNICODE_GENERAL_CATEGORY_CS = UNICODE_GENERAL_CATEGORY_C | 3, + /* Co - Private_Use */ + UNICODE_GENERAL_CATEGORY_CO = UNICODE_GENERAL_CATEGORY_C | 4, + /* Cn - Unassigned */ + UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5, +}; + +struct unicode_code_point_data { + uint8_t general_category; /* enum unicode_general_category; not yet used */ + + uint32_t simple_titlecase_mapping; /* 0 if there is no mapping */ +}; + +#endif diff --git a/src/lib/unicode-data.c b/src/lib/unicode-data.c new file mode 100644 index 0000000000..31e49903d5 --- /dev/null +++ b/src/lib/unicode-data.c @@ -0,0 +1,178 @@ +/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "unicode-data.h" + +/* Parses a General_Category abbreviation as listed in UAX #44, Section 5.7.1: + a two-letter value such as "Lu" or "LC", or a one-letter group such as "L". + Returns UNICODE_GENERAL_CATEGORY_INVALID for NULL or unrecognized input. */ +uint8_t unicode_general_category_from_string(const char *str) +{ + size_t len = (str == NULL ? 0 : strlen(str)); + + /* One-letter group names are dispatched by the '\0' cases below */ + if (len < 1 || len > 2) + return UNICODE_GENERAL_CATEGORY_INVALID; + + switch (str[0]) { + case 'L': + switch (str[1]) { + /* Lu - Uppercase_Letter */ + case 'u': + return UNICODE_GENERAL_CATEGORY_LU; + /* Ll - Lowercase_Letter */ + case 'l': + return UNICODE_GENERAL_CATEGORY_LL; + /* Lt - Titlecase_Letter */ + case 't': + return UNICODE_GENERAL_CATEGORY_LT; + /* LC - Cased_Letter: Lu | Ll | Lt */ + case 'C': + return UNICODE_GENERAL_CATEGORY_LC; + /* Lm - Modifier_Letter */ + case 'm': + return UNICODE_GENERAL_CATEGORY_LM; + /* Lo - Other_Letter */ + case 'o': + return UNICODE_GENERAL_CATEGORY_LO; + /* L - Letter: Lu | Ll | Lt | Lm | Lo */ + case '\0': + return UNICODE_GENERAL_CATEGORY_L; + default: + break; + } + break; + case 'M': + switch (str[1]) { + /* Mn
- Nonspacing_Mark */ + case 'n': + return UNICODE_GENERAL_CATEGORY_MN; + /* Mc - Spacing_Mark */ + case 'c': + return UNICODE_GENERAL_CATEGORY_MC; + /* Me - Enclosing_Mark */ + case 'e': + return UNICODE_GENERAL_CATEGORY_ME; + /* M - Mark: Mn | Mc | Me */ + case '\0': + return UNICODE_GENERAL_CATEGORY_M; + default: + break; + } + break; + case 'N': + switch (str[1]) { + /* Nd - Decimal_Number */ + case 'd': + return UNICODE_GENERAL_CATEGORY_ND; + /* Nl - Letter_Number */ + case 'l': + return UNICODE_GENERAL_CATEGORY_NL; + /* No - Other_Number */ + case 'o': + return UNICODE_GENERAL_CATEGORY_NO; + /* N - Number: Nd | Nl | No */ + case '\0': + return UNICODE_GENERAL_CATEGORY_N; + default: + break; + } + break; + case 'P': + switch (str[1]) { + /* Pc - Connector_Punctuation */ + case 'c': + return UNICODE_GENERAL_CATEGORY_PC; + /* Pd - Dash_Punctuation */ + case 'd': + return UNICODE_GENERAL_CATEGORY_PD; + /* Ps - Open_Punctuation */ + case 's': + return UNICODE_GENERAL_CATEGORY_PS; + /* Pe - Close_Punctuation */ + case 'e': + return UNICODE_GENERAL_CATEGORY_PE; + /* Pi - Initial_Punctuation */ + case 'i': + return UNICODE_GENERAL_CATEGORY_PI; + /* Pf - Final_Punctuation */ + case 'f': + return UNICODE_GENERAL_CATEGORY_PF; + /* Po - Other_Punctuation */ + case 'o': + return UNICODE_GENERAL_CATEGORY_PO; + /* P - Punctuation: Pc | Pd | Ps | Pe | Pi | Pf | Po */ + case '\0': + return UNICODE_GENERAL_CATEGORY_P; + default: + break; + } + break; + case 'S': + switch (str[1]) { + /* Sm - Math_Symbol */ + case 'm': + return UNICODE_GENERAL_CATEGORY_SM; + /* Sc - Currency_Symbol */ + case 'c': + return UNICODE_GENERAL_CATEGORY_SC; + /* Sk - Modifier_Symbol */ + case 'k': + return UNICODE_GENERAL_CATEGORY_SK; + /* So - Other_Symbol */ + case 'o': + return UNICODE_GENERAL_CATEGORY_SO; + /* S - Symbol: Sm | Sc | Sk | So */ + case '\0': + return UNICODE_GENERAL_CATEGORY_S; + default: + break; + } + break; + case 'Z': + switch (str[1]) { + /* Zs - Space_Separator */ + case 's': +
return UNICODE_GENERAL_CATEGORY_ZS; + /* Zl - Line_Separator */ + case 'l': + return UNICODE_GENERAL_CATEGORY_ZL; + /* Zp - Paragraph_Separator */ + case 'p': + return UNICODE_GENERAL_CATEGORY_ZP; + /* Z - Separator: Zs | Zl | Zp */ + case '\0': + return UNICODE_GENERAL_CATEGORY_Z; + default: + break; + } + break; + case 'C': + switch (str[1]) { + /* Cc - Control */ + case 'c': + return UNICODE_GENERAL_CATEGORY_CC; + /* Cf - Format */ + case 'f': + return UNICODE_GENERAL_CATEGORY_CF; + /* Cs - Surrogate */ + case 's': + return UNICODE_GENERAL_CATEGORY_CS; + /* Co - Private_Use */ + case 'o': + return UNICODE_GENERAL_CATEGORY_CO; + /* Cn - Unassigned */ + case 'n': + return UNICODE_GENERAL_CATEGORY_CN; + /* C - Other: Cc | Cf | Cs | Co | Cn */ + case '\0': + return UNICODE_GENERAL_CATEGORY_C; + default: + break; + } + break; + default: + break; + } + return UNICODE_GENERAL_CATEGORY_INVALID; +} diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h new file mode 100644 index 0000000000..6b156f1b30 --- /dev/null +++ b/src/lib/unicode-data.h @@ -0,0 +1,38 @@ +#ifndef UNICODE_DATA_H +#define UNICODE_DATA_H + +#include "unicode-data-tables.h" + +/* Returns TRUE if gencat denotes a category group (e.g. L, LC, M) rather than + a specific category, i.e. its lower four bits are all zero. This also holds + for UNICODE_GENERAL_CATEGORY_INVALID. */ +static inline bool +unicode_general_category_is_group(enum unicode_general_category gencat) +{ + return ((gencat & 0x0f) == 0x00); +} + +/* Returns the data record for code point cp. The lookup walks four chained + index tables, keyed by the successive bytes of cp starting with the most + significant one; the last table yields the index into + unicode_code_points[]. The result is never NULL. */ +static inline const struct unicode_code_point_data * +unicode_code_point_get_data(uint32_t cp) +{ + unsigned int idx8 = cp >> 24; + unsigned int blk16 = unicode_code_points_index8[idx8]; + unsigned int idx16 = (blk16 << 8) + ((cp >> 16) & 0xFF); + unsigned int blk24 = unicode_code_points_index16[idx16]; + unsigned int idx24 = (blk24 << 8) + ((cp >> 8) & 0xFF); + unsigned int blk32 = unicode_code_points_index24[idx24]; + unsigned int idx32 = (blk32 << 8) + (cp & 0xFF); + unsigned int idxcp = unicode_code_points_index32[idx32]; + + return &unicode_code_points[idxcp]; +} + +/* Parses a General_Category abbreviation (e.g. "Lu", "LC" or "L"). Returns + UNICODE_GENERAL_CATEGORY_INVALID if str is NULL or not recognized. */ +uint8_t unicode_general_category_from_string(const char *str); + +#endif diff --git a/src/lib/unicode-ucd-compile.py
b/src/lib/unicode-ucd-compile.py new file mode 100755 index 0000000000..acd3653983 --- /dev/null +++ b/src/lib/unicode-ucd-compile.py @@ -0,0 +1,805 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Dovecot authors, see the included COPYING file + +import argparse +import bisect +import copy +import re +import sys +from pathlib import Path + + +source_files = [] + +ud_codepoints = [] +ud_codepoints_first = [] +ud_codepoints_last = [] +ud_codepoints_index = {} + +ud_codepoints_index8 = {} +ud_codepoints_index16 = {} +ud_codepoints_index16_reused = {} +ud_codepoints_index16_offsets = {} +ud_codepoints_index24 = {} +ud_codepoints_index24_reused = {} +ud_codepoints_index24_offsets = {} +ud_codepoints_index32 = {} +ud_codepoints_index32_reused = {} +ud_codepoints_index32_offsets = {} +ud_codepoints_index16_blocks = 1 +ud_codepoints_index24_blocks = 2 +ud_codepoints_index32_blocks = 2 + + +class UCDFileOpen: + def __init__(self, filename): + self.filename = filename + + def __enter__(self): + global ucd_dir + global source_files + + self.fd = open(ucd_dir + "/" + self.filename, mode="r", encoding="utf-8") + source_files.append(self.filename) + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + self.fd.close() + + def __str__(self): + return self.filename + + +class CodePointData: + def mergeFrom(self, data, default=False): + for attr in dir(data): + if callable(getattr(data, attr)): + continue + if attr.startswith("__"): + continue + if default and hasattr(self, attr): + continue + setattr(self, attr, getattr(data, attr)) + + +class CodePointRange: + def insert(self, n): + global ud_codepoints + global ud_codepoints_first + global ud_codepoints_last + + ud_codepoints.insert(n, self) + ud_codepoints_first.insert(n, self.cp_first) + ud_codepoints_last.insert(n, self.cp_last) + + def modify(self, n): + global ud_codepoints + global ud_codepoints_first + global ud_codepoints_last + + ud_codepoints_first[n] = self.cp_first + 
ud_codepoints_last[n] = self.cp_last + + def __new__(cls, cp_first, cp_last, data, default=False): + global ud_codepoints + global ud_codepoints_first + global ud_codepoints_last + + cprn_first = None + + if len(ud_codepoints) == 0: + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cp_last + cprn.data = data + cprn.insert(0) + return + + idx_first = bisect.bisect_left(ud_codepoints_first, cp_first) + idx_last = bisect.bisect_right(ud_codepoints_last, cp_last) + rng_first = idx_first - 1 + rng_last = idx_last + 1 + if rng_last >= len(ud_codepoints): + rng_last = len(ud_codepoints) - 1 + + # Check existing ranges + nn = None + n = rng_first + while n <= rng_last: + cpr = ud_codepoints[n] + pos = n + n += 1 + + # No overlap with this range + if cp_last < cpr.cp_first or cp_first > cpr.cp_last: + continue + # Exact match + if cp_first == cpr.cp_first and cp_last == cpr.cp_last: + cpr.data.mergeFrom(data, default) + return cpr + # New range fully envelops existing + if cp_first <= cpr.cp_first and cp_last >= cpr.cp_last: + # Split off range before + if cp_first < cpr.cp_first: + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cpr.cp_first - 1 + cprn.data = copy.deepcopy(data) + cprn.insert(pos) + rng_last += 1 + if cprn_first is None: + cprn_first = cprn + # Merge with existing + cpr.data.mergeFrom(data, default) + # Split off range after + if cp_last > cpr.cp_last: + cp_first = cpr.cp_last + 1 + nn = pos + 1 + continue + break + # New range fully enveloped by existing + if cp_first > cpr.cp_first and cp_last < cpr.cp_last: + cprn = super().__new__(cls) + cprn.cp_first = cp_last + 1 + cprn.cp_last = cpr.cp_last + cprn.data = cpr.data + cprn.insert(pos + 1) + rng_last += 1 + cpr.cp_last = cp_first - 1 + cpr.modify(pos) + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cp_last + cprn.data = copy.deepcopy(cpr.data) + cprn.data.mergeFrom(data, default) + cprn.insert(pos + 1) + rng_last += 1 + return 
cprn + # New range aligns with beginning of existing + if cp_first == cpr.cp_first and cp_last < cpr.cp_last: + cpr.cp_first = cp_last + 1 + cpr.modify(pos) + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cp_last + cprn.data = copy.deepcopy(cpr.data) + cprn.data.mergeFrom(data, default) + cprn.insert(pos) + rng_last += 1 + return cprn + # New range aligns with end of existing + if cp_first > cpr.cp_first and cp_last == cpr.cp_last: + cpr.cp_last = cp_first - 1 + cpr.modify(pos) + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cp_last + cprn.data = copy.deepcopy(cpr.data) + cprn.data.mergeFrom(data, default) + cprn.insert(pos + 1) + rng_last += 1 + return cprn + # New range crosses the beginning of existing + if cp_first < cpr.cp_first and cp_last >= cpr.cp_first: + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cpr.cp_first - 1 + cprn.data = data + cprn.insert(pos) + rng_last += 1 + cprn = super().__new__(cls) + cprn.cp_first = cpr.cp_first + cprn.cp_last = cp_last + cprn.data = copy.deepcopy(cpr.data) + cprn.data.mergeFrom(data, default) + cprn.insert(pos + 1) + rng_last += 1 + cpr.cp_first = cp_last + 1 + cpr.modify(pos + 2) + return cprn + # New range crosses the end of existing + if cp_first <= cpr.cp_last and cp_last > cpr.cp_last: + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cpr.cp_last + cprn.data = copy.deepcopy(cpr.data) + cprn.data.mergeFrom(data, default) + cprn.insert(pos) + rng_last += 1 + if cprn_first is None: + cprn_first = cprn + tmp = cp_first + cp_first = cpr.cp_last + 1 + cpr.cp_last = tmp - 1 + cpr.modify(pos + 1) + nn = pos + 1 + continue + + cprn = super().__new__(cls) + cprn.cp_first = cp_first + cprn.cp_last = cp_last + cprn.data = data + if nn is None: + cprn.insert(idx_first) + else: + cprn.insert(nn) + if cprn_first is None: + cprn_first = cprn + + return cprn_first + + +def die(message): + module_filename = Path(__file__).name + 
print(f"{module_filename}: {message}", file=sys.stderr) + sys.exit(1) + + +def parse_cp_range(column):  # Parse "XXXX" or "XXXX..YYYY" (hex) into (first, last); returns None for a blank column + rng_hex = column.strip() + if len(rng_hex) == 0: + return None + rng = rng_hex.split("..") + + cp_hex = rng[0].strip() + cp_first = int(cp_hex, 16) + cp_last = cp_first + + if len(rng) > 1: + cp_hex = rng[1].strip() + cp_last = int(cp_hex, 16) + + return (cp_first, cp_last) + + +def read_ucd_files():  # Read UnicodeData.txt and register a CodePointRange for each entry + global ud_decomposition_type_names + global ud_composition_exclusions + + # UnicodeData.txt + with UCDFileOpen("UnicodeData.txt") as ucd: + cp_range_first = None + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 15: + die(f"{ucd}:{line_num}: Missing columns") + + # (0) Code point in hex + + cp_first = cp_last = int(cols[0].strip(), 16) + + # (1) Name + + cp_name = cols[1].strip() + + x = re.search("<([^>]*), (First|Last)>", cp_name) + if x: + if x.group(2) == "First":  # only remember the start; the range is added at its "Last" line + cp_range_first = cp_first + continue + if x.group(2) == "Last" and cp_range_first is not None: + cp_first = cp_range_first + cp_name = "<%s>" % x.group(1) + cp_range_first = None + + cpd = CodePointData() + cpd.name = cp_name + + # (2) General_Category + + cpd.general_category = cols[2].strip() + + # (3) Canonical_Combining_Class + + ccc = cols[3].strip() + if ccc != "": + cpd.canonical_combining_class = int(ccc) + + # (5) Decomposition_Type, Decomposition_Mapping + + x = re.search("(<([^>]*)> )?(.+)", cols[5].strip()) + if x: + if x.group(2) is not None: + cpd.decomposition_type = x.group(2) + dcs_txt = x.group(3).split(" ") + dcs = [] + for dc_txt in dcs_txt: + dcs.append(int(dc_txt.strip(), 16)) + cpd.decomposition_first = dcs + + # (12) Simple_Uppercase_Mapping + + code = cols[12].strip() + if code != "": + cpd.simple_uppercase_mapping = int(code, 16) + + # (13) Simple_Lowercase_Mapping + + code = cols[13].strip() + if code != "": +
cpd.simple_lowercase_mapping = int(code, 16) + + # (14) Simple_Titlecase_Mapping + + code = cols[14].strip() + if code != "": + cpd.simple_titlecase_mapping = int(code, 16) + + # Add range + CodePointRange(cp_first, cp_last, cpd) + + +def create_cp_range_index():  # Map the first code point of every range to its CodePointRange + global ud_codepoints + global ud_codepoints_index + + for cpr in ud_codepoints: + ud_codepoints_index[cpr.cp_first] = cpr + + +def update_cp_index_tables(cp_first, cp_last, cp_pos):  # Enter every code point of cp_first..cp_last into the index tables, ending at cp_pos + global ud_codepoints_index8 + global ud_codepoints_index16 + global ud_codepoints_index16_reused + global ud_codepoints_index16_offsets + global ud_codepoints_index16_blocks + global ud_codepoints_index24 + global ud_codepoints_index24_reused + global ud_codepoints_index24_offsets + global ud_codepoints_index24_blocks + global ud_codepoints_index32 + global ud_codepoints_index32_reused + global ud_codepoints_index32_offsets + global ud_codepoints_index32_blocks + + cp_range = range(cp_first, cp_last + 1) + + id16_block = None + id24_block = None + id32_block = None + first16 = True + first24 = True + first32 = True + + last_rcp = cp_last  # NOTE(review): never updated in the loop, so the checks below compare against the end of the range - confirm this is intended + for rcp in cp_range: + # Index for first 8 bits of code point + id8_idx = rcp >> 24 + if id8_idx in ud_codepoints_index8: + id16_block = ud_codepoints_index8[id8_idx] + elif ( + id16_block is not None + and not first16 + and ((last_rcp & 0xFFFFFF) == 0xFFFFFF or (rcp >> 24) != (last_rcp >> 24)) + ): + ud_codepoints_index8[id8_idx] = id16_block + if id16_block not in ud_codepoints_index16_reused: + ud_codepoints_index16_reused[id16_block] = 1 + ud_codepoints_index16_reused[id16_block] += 1 + else: + first16 = False + id16_block = ud_codepoints_index16_blocks + ud_codepoints_index8[id8_idx] = id16_block + ud_codepoints_index16_offsets[id16_block] = rcp & (((1 << 8) - 1) << 24) + ud_codepoints_index16_blocks += 1 + + # Index for first 16 bits of code point + id16_idx = (id16_block << 8) + ((rcp >> 16) & 0xFF) + if id16_idx in ud_codepoints_index16: + id24_block =
ud_codepoints_index16[id16_idx] + elif ( + id24_block is not None + and not first24 + and ((last_rcp & 0xFFFF) == 0xFFFF or (rcp >> 16) != (last_rcp >> 16)) + ): + ud_codepoints_index16[id16_idx] = id24_block + if id24_block not in ud_codepoints_index24_reused: + ud_codepoints_index24_reused[id24_block] = 1 + ud_codepoints_index24_reused[id24_block] += 1 + else: + first24 = False + id24_block = ud_codepoints_index24_blocks + ud_codepoints_index16[id16_idx] = id24_block + ud_codepoints_index24_offsets[id24_block] = rcp & (((1 << 16) - 1) << 16) + ud_codepoints_index24_blocks += 1 + + # Index for first 24 bits of code point + id24_idx = (id24_block << 8) + ((rcp >> 8) & 0xFF) + if id24_idx in ud_codepoints_index24: + id32_block = ud_codepoints_index24[id24_idx] + elif ( + id32_block is not None + and not first32 + and ((last_rcp & 0xFF) == 0xFF or (rcp >> 8) != (last_rcp >> 8)) + ): + ud_codepoints_index24[id24_idx] = id32_block + if id32_block not in ud_codepoints_index32_reused: + ud_codepoints_index32_reused[id32_block] = 1 + ud_codepoints_index32_reused[id32_block] += 1 + else: + first32 = False + id32_block = ud_codepoints_index32_blocks + ud_codepoints_index24[id24_idx] = id32_block + ud_codepoints_index32_offsets[id32_block] = rcp & (((1 << 24) - 1) << 8) + ud_codepoints_index32_blocks += 1 + + # Index for first 32 bits of code point + id32_idx = (id32_block << 8) + (rcp & 0xFF) + ud_codepoints_index32[id32_idx] = cp_pos + + +def create_cp_index_tables():  # Index every range in ud_codepoints under its position in that list + global ud_codepoints + + # Create code point index + for n in range(0, len(ud_codepoints)): + cpr = ud_codepoints[n] + cp_first = cpr.cp_first + cp_last = cpr.cp_last + + update_cp_index_tables(cp_first, cp_last, n) + + +def get_general_category_def(gc):  # e.g. "Lu" -> "UNICODE_GENERAL_CATEGORY_LU" + return "UNICODE_GENERAL_CATEGORY_%s" % gc.upper() + + +def decomposition_type_def(dt):  # C constant name for a decomposition type + return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper() + + +def print_list(code_list):  # Print the codes as comma-separated hex literals, wrapping every eight entries + last = len(code_list) - 1 + n = 0 + print("\t", end="") + for code in code_list:
+ print("0x%05x" % code, end="") + if n == last: + break + print(",", end="") + + n += 1 + if (n % 8) == 0: + print("") + print("\t", end="") + if (n % 10) == 0: + print("// INDEX %u" % n) + print("\t", end="") + else: + print(" ", end="") + + +def print_top_message():  # Print the "automatically generated" banner comment listing the UCD source files + global ucd_dir + global source_files + + print("/* This file is automatically generated by unicode-ucd-compile.py from:") + for sf in source_files: + print(" %s/%s" % (ucd_dir, sf)) + print(" */") + print("") + + +def write_tables_h():  # Write unicode-data-tables.h: extern declarations for the generated tables + global output_dir + global ud_decomposition_max_length + global ud_compositions_max_per_starter + + orig_stdout = sys.stdout + + with open(output_dir + "/unicode-data-tables.h", mode="w", encoding="utf-8") as fd: + sys.stdout = fd  # print() writes into the file until stdout is restored below + + print("#ifndef UNICODE_DATA_TABLES_H") + print("#define UNICODE_DATA_TABLES_H") + print("") + print_top_message() + print('#include "unicode-data-static.h"') + print("") + print("extern const struct unicode_code_point_data unicode_code_points[];") + print("") + print("extern const uint8_t unicode_code_points_index8[];") + print("extern const uint8_t unicode_code_points_index16[];") + print("extern const uint16_t unicode_code_points_index24[];") + print("extern const uint16_t unicode_code_points_index32[];") + print("") + print("#endif") + + sys.stdout = orig_stdout + + +def write_tables_c():  # Write unicode-data-tables.c: the code point records followed by the four index tables + global output_dir + global ud_codepoints + global ud_decompositions + global ud_compositions + global ud_composition_primaries + global ud_case_mappings + + orig_stdout = sys.stdout + + with open(output_dir + "/unicode-data-tables.c", mode="w", encoding="utf-8") as fd: + sys.stdout = fd + print_top_message() + + print('#include "lib.h"') + print('#include "unicode-data-tables.h"') + print("") + print("const struct unicode_code_point_data unicode_code_points[] = {") + print("\t{ // [0000] ") + print("\t\t.general_category = UNICODE_GENERAL_CATEGORY_INVALID,") + print("\t},") + print("\t{ // [0001] ") + print("\t\t.general_category =
UNICODE_GENERAL_CATEGORY_CN,") + print("\t},") + n = 2  # records [0] and [1] are the fixed INVALID and Cn entries printed above + for cpr in ud_codepoints: + cpd = cpr.data + + if cpr.cp_last > cpr.cp_first: + range_str = "U+%04X..U+%04X" % (cpr.cp_first, cpr.cp_last) + else: + range_str = "U+%04X" % (cpr.cp_first) + print("\t{ // [%04X] %s: %s" % (n, range_str, cpd.name)) + n = n + 1 + + print( + "\t\t.general_category = %s," + % get_general_category_def(cpd.general_category) + ) + if hasattr(cpd, "simple_titlecase_mapping"): + print( + "\t\t.simple_titlecase_mapping = 0x%04X," + % cpd.simple_titlecase_mapping + ) + print("\t},") + print("};") + print("") + # Code points index8 + print("const uint8_t unicode_code_points_index8[] = {") + print("\t", end="") + for n in range(0, 256): + if n in ud_codepoints_index8: + print("0x%02x" % ud_codepoints_index8[n], end="") + else: + print("0x00", end="") + if n == 255: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("\n\t", end="") + else: + print(" ", end="") + print(",") + print("};") + print("") + # Code points index16 + print("const uint8_t unicode_code_points_index16[] = {") + print("\t// Block 0x00: ") + print("\t", end="") + last = (1 << 8) - 1 + for n in range(0 << 8, last + 1): + print("0x00", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("\n\t", end="") + else: + print(" ", end="") + print(",") + print("\t", end="") + last = (ud_codepoints_index16_blocks << 8) - 1 + for n in range((1 << 8), last + 1): + if (n & ((1 << 8) - 1)) == 0: + blk_id = n >> 8 + blk_offset = ud_codepoints_index16_offsets[blk_id] + blk_end = blk_offset + (1 << 24) - 1 + print( + "// Block 0x%02X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end), + end="", + ) + if blk_id in ud_codepoints_index16_reused: + print( + " (used %u times)" % ud_codepoints_index16_reused[blk_id], end="" + ) + print("") + print("\t", end="") + if n in ud_codepoints_index16: + print("0x%02x" % ud_codepoints_index16[n], end="") + elif ud_codepoints_index16_offsets[n >> 8] + ((n &
0xFF) << 16) > 0x10FFFF: + print("0x00", end="")  # beyond U+10FFFF: block 0 resolves to the INVALID record + else: + print("0x01", end="")  # not covered by any range: block 1 resolves to the Cn record + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print("") + print("};") + print("") + # Code points index24 + print("const uint16_t unicode_code_points_index24[] = {") + print("\t// Block 0x00: ") + print("\t", end="") + last = (1 << 8) - 1 + for n in range((0 << 8), last + 1): + print("0x000", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("\t// Block 0x01: ") + print("\t", end="") + last = (2 << 8) - 1 + for n in range((1 << 8), last + 1): + print("0x001", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("\t", end="") + last = (ud_codepoints_index24_blocks << 8) - 1 + for n in range((2 << 8), last + 1): + if (n & ((1 << 8) - 1)) == 0: + blk_id = n >> 8 + blk_offset = ud_codepoints_index24_offsets[blk_id] + blk_end = blk_offset + (1 << 16) - 1 + print( + "// Block 0x%04X: U+%06X..U+%06X" % (blk_id, blk_offset, blk_end), + end="", + ) + if blk_id in ud_codepoints_index24_reused: + print( + " (used %u times)" % ud_codepoints_index24_reused[blk_id], end="" + ) + print("") + print("\t", end="") + if n in ud_codepoints_index24: + print("0x%03x" % ud_codepoints_index24[n], end="") + else: + print("0x001", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("};") + print("") + # Code points index32 + print("const uint16_t unicode_code_points_index32[] = {") + print("\t// Block 0x000: ") + print("\t", end="") + last = (1 << 8) - 1 + for n in range(0 << 8, last + 1): + print("0x0000", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: +
print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("\t// Block 0x001: ") + print("\t", end="") + last = (2 << 8) - 1 + for n in range(1 << 8, last + 1): + print("0x0001", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("\t", end="") + last = (ud_codepoints_index32_blocks << 8) - 1 + for n in range(2 << 8, last + 1): + if (n & ((1 << 8) - 1)) == 0: + blk_id = n >> 8 + blk_offset = ud_codepoints_index32_offsets[blk_id] + blk_end = blk_offset + (1 << 8) - 1 + print( + "// Block 0x%04X: U+%06X - U+%06X" % (blk_id, blk_offset, blk_end), + end="", + ) + if blk_id in ud_codepoints_index32_reused: + print( + " (used %u times)" % ud_codepoints_index32_reused[blk_id], end="" + ) + print("") + print("\t", end="") + if n in ud_codepoints_index32:  # the +2 below skips the two fixed records at the start of unicode_code_points[] + print("0x%04x" % (ud_codepoints_index32[n] + 2), end="") + else: + print("0x0001", end="") + if n == last: + break + print(",", end="") + + if ((n + 1) % 8) == 0: + print("") + print("\t", end="") + else: + print(" ", end="") + print(",") + print("};") + + sys.stdout = orig_stdout + + +def main():  # Parse the two directory arguments, read the UCD files and write both generated files + global ucd_dir + global output_dir + global source_files + + """Entry point.""" + parser = argparse.ArgumentParser( + prog="unicode-ucd-compile.py", + description="Compile the Unicode Character Database files into C code", + ) + parser.add_argument( + "ucd-dir", + type=str, + help="Directory containing the UCD files", + ) + parser.add_argument( + "output-dir", + type=str, + help="Output directory where the C header and source files are written", + ) + args = parser.parse_args() + + ucd_dir = getattr(args, "ucd-dir") + output_dir = getattr(args, "output-dir") + + read_ucd_files() + source_files.sort() + + create_cp_range_index() + + create_cp_index_tables() + + write_tables_h() + write_tables_c() + + +if __name__ == "__main__": + main()