git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: unicode-data - Add data for first and full code point decomposition
author: Stephan Bosch <stephan.bosch@open-xchange.com>
Tue, 22 Apr 2025 00:55:05 +0000 (02:55 +0200)
committer: Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am
src/lib/test-unicode-data.c
src/lib/unicode-data-static.h
src/lib/unicode-data.h
src/lib/unicode-ucd-compile.py

index 1d98248ecf9eb6b881b6a8dcb5b10f087eace84f..e3927510f9fbde1acd32147f895711f31305ead0 100644 (file)
@@ -6,6 +6,8 @@ noinst_LTLIBRARIES = liblib.la
 BUILT_SOURCES = $(srcdir)/unicodemap.c \
                $(srcdir)/unicode-data-tables.c \
                $(srcdir)/unicode-data-tables.h \
+               $(srcdir)/unicode-data-types.c \
+               $(srcdir)/unicode-data-types.h \
                event-filter-lexer.c \
                event-filter-parser.c \
                event-filter-parser.h
@@ -14,12 +16,15 @@ UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
        $(UCD_DIR)/DerivedCoreProperties.txt \
+       $(UCD_DIR)/PropertyValueAliases.txt \
        $(UCD_DIR)/UnicodeData.txt
 
 EXTRA_DIST = \
        unicodemap.c \
        unicode-data-tables.c \
        unicode-data-tables.h \
+       unicode-data-types.c \
+       unicode-data-types.h \
        unicodemap.pl \
        unicode-ucd-compile.py \
        $(UCD_FILES)
@@ -47,12 +52,15 @@ event-filter-parser.h: event-filter-parser.c
 
 $(UCD_DIR)/DerivedCoreProperties.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
+$(UCD_DIR)/PropertyValueAliases.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
 $(UCD_DIR)/UnicodeData.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt
 
 $(srcdir)/unicodemap.c: $(srcdir)/unicodemap.pl $(UCD_DIR)/UnicodeData.txt
        $(AM_V_GEN)$(PERL) $(srcdir)/unicodemap.pl < $(UCD_DIR)/UnicodeData.txt > $@
-$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h &: \
+$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h \
+       $(srcdir)/unicode-data-types.c $(srcdir)/unicode-data-types.h &: \
        $(srcdir)/unicode-ucd-compile.py $(UCD_FILES)
        $(AM_V_GEN)$(PYTHON) $(srcdir)/unicode-ucd-compile.py $(UCD_DIR) $(srcdir)
 
@@ -213,6 +221,7 @@ liblib_la_SOURCES = \
        unlink-directory.c \
        unlink-old-files.c \
        unichar.c \
+       unicode-data-types.c \
        unicode-data-tables.c \
        unicode-data.c \
        uri-util.c \
@@ -376,6 +385,7 @@ headers = \
        unlink-old-files.h \
        unichar.h \
        unicode-data-static.h \
+       unicode-data-types.h \
        unicode-data-tables.h \
        unicode-data.h \
        uri-util.h \
index b01b57b6583be1113cdb3e620cee28ba0d623528..3edef44122236b9ee9d3db086b1e2c233534ae31 100644 (file)
@@ -69,6 +69,29 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
        }
        test_assert(!unicode_general_category_is_group(general_category));
 
+       /* Parse Decomposition_* */
+
+       const char *decomp_spec = columns[5];
+       enum unicode_decomposition_type decomp_type =
+               UNICODE_DECOMPOSITION_TYPE_CANONICAL;
+
+       if (*decomp_spec == '<') {
+               const char *p = strchr(decomp_spec + 1, '>');
+
+               if (p == NULL || *(p + 1) != ' ') {
+                       test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad Decomposition for code point %"PRIu32": %s",
+                               UCD_UNICODE_DATA_TXT, line_num, cp, columns[5]));
+                       return;
+               }
+               decomp_type = unicode_decomposition_type_from_string(
+                       t_strdup_until(decomp_spec + 1, p));
+               decomp_spec = p + 2;
+       }
+
+       const char *const *decomp = t_strsplit(decomp_spec, " ");
+
        /* Parse Simple_*case_Mapping */
 
        uint32_t simple_uppercase_mapping = 0;
@@ -113,6 +136,35 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
                test_assert_idx(
                        cp_data->general_category == general_category, cp);
 
+               const uint32_t *cp_decomp;
+               size_t cp_decomp_len, cp_decomp_idx;
+               uint8_t cp_decomp_type;
+
+               cp_decomp_len =
+                       unicode_code_point_data_get_first_decomposition(
+                               cp_data, &cp_decomp_type, &cp_decomp);
+               test_assert(str_array_length(decomp) == cp_decomp_len);
+               if (test_has_failed())
+                       break;
+
+               test_assert_idx(
+                       (cp_decomp_type == decomp_type ||
+                        cp_decomp_type == UNICODE_DECOMPOSITION_TYPE_COMPAT),
+                       cp);
+               cp_decomp_idx = 0;
+               while (*decomp != NULL && !test_has_failed()) {
+                       uint32_t dcp;
+
+                       test_assert_idx(str_to_uint32_hex(*decomp, &dcp) >= 0, cp);
+                       if (test_has_failed())
+                               break;
+                       test_assert_idx(uni_is_valid_ucs4(dcp), cp);
+                       test_assert_idx(dcp == cp_decomp[cp_decomp_idx], cp);
+
+                       cp_decomp_idx++;
+                       decomp++;
+               }
+
                test_assert_idx(
                        cp_data->simple_titlecase_mapping == simple_titlecase_mapping,
                        cp);
index 0258548985045e145ac49a097e6ae23c8ca7363a..ffc61bb8ac2fcdf8135f1162a0905cdd200f2dff 100644 (file)
@@ -96,6 +96,15 @@ enum unicode_general_category {
 struct unicode_code_point_data {
        uint8_t general_category; // Not yet used
 
+       uint8_t decomposition_type; // Not yet used
+       uint8_t decomposition_first_length;
+       uint8_t decomposition_full_length;
+       uint8_t decomposition_full_k_length;
+
+       uint16_t decomposition_first_offset;
+       uint16_t decomposition_full_offset;
+       uint16_t decomposition_full_k_offset;
+
        uint32_t simple_titlecase_mapping;
 };
 
index 6b156f1b30d7faabe597aca4d325aa9342c37d3b..eace84639ffd8b9035385a1b2ef0daff2a706391 100644 (file)
@@ -24,6 +24,48 @@ unicode_code_point_get_data(uint32_t cp)
        return &unicode_code_points[idxcp];
 }
 
+static inline size_t
+unicode_code_point_data_get_first_decomposition(
+       const struct unicode_code_point_data *cp_data,
+       uint8_t *type_r, const uint32_t **decomp_r)
+{
+       uint32_t offset;
+
+       if (type_r != NULL)
+               *type_r = cp_data->decomposition_type;
+       offset = cp_data->decomposition_first_offset;
+       *decomp_r = &unicode_decompositions[offset];
+       return cp_data->decomposition_first_length;
+}
+
+static inline size_t
+unicode_code_point_data_get_full_decomposition(
+       const struct unicode_code_point_data *cp_data, bool canonical,
+       const uint32_t **decomp_r)
+{
+       uint32_t offset;
+
+       if (canonical) {
+               offset = cp_data->decomposition_full_offset;
+               *decomp_r = &unicode_decompositions[offset];
+               return cp_data->decomposition_full_length;
+       }
+       offset = cp_data->decomposition_full_k_offset;
+       *decomp_r = &unicode_decompositions[offset];
+       return cp_data->decomposition_full_k_length;
+}
+
+static inline size_t
+unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical,
+                                         const uint32_t **decomp_r)
+{
+       const struct unicode_code_point_data *cp_data =
+               unicode_code_point_get_data(cp);
+
+       return unicode_code_point_data_get_full_decomposition(
+               cp_data, canonical, decomp_r);
+}
+
 uint8_t unicode_general_category_from_string(const char *str);
 
 #endif
index acd3653983dfce54499a037e8bdfd406dc1781aa..e7283e91f9995cf668e9ceeb020d563ec459cd44 100755 (executable)
@@ -30,6 +30,10 @@ ud_codepoints_index16_blocks = 1
 ud_codepoints_index24_blocks = 2
 ud_codepoints_index32_blocks = 2
 
+ud_decomposition_type_names = []
+ud_decompositions = []
+ud_decomposition_max_length = 0
+
 
 class UCDFileOpen:
     def __init__(self, filename):
@@ -256,6 +260,25 @@ def read_ucd_files():
     global ud_decomposition_type_names
     global ud_composition_exclusions
 
+    # PropertyValueAliases.txt
+    with UCDFileOpen("PropertyValueAliases.txt") as ucd:
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 3:
+                die(f"{ucd}:{line_num}: Missing columns")
+
+            prop = cols[0].strip()
+            if prop == "dt":
+                lval = cols[2].strip()
+                ud_decomposition_type_names.append(lval)
+
     # UnicodeData.txt
     with UCDFileOpen("UnicodeData.txt") as ucd:
         cp_range_first = None
@@ -336,6 +359,142 @@ def read_ucd_files():
             CodePointRange(cp_first, cp_last, cpd)
 
 
+def expand_decompositions():
+    global ud_codepoints
+    global ud_codepoints_index
+    global ud_decompositions
+    global ud_decomposition_max_length
+
+    # Record first decompositions in ud_decompositions table
+    for cpr in ud_codepoints:
+        cpd = cpr.data
+
+        if not hasattr(cpd, "decomposition_first") or len(cpd.decomposition_first) == 0:
+            continue
+
+        dc = cpd.decomposition_first
+        cpd.decomposition_offset = len(ud_decompositions)
+        cpd.decomposition_length = len(dc)
+        ud_decompositions = ud_decompositions + dc
+        if len(dc) > ud_decomposition_max_length:
+            ud_decomposition_max_length = len(dc)
+
+    # Expand all decompositions
+    for cpr in ud_codepoints:
+        if cpr.cp_last > cpr.cp_first:
+            # No decompositions in ranges expected, ever
+            continue
+        cpd = cpr.data
+
+        if not hasattr(cpd, "decomposition_first") or len(cpd.decomposition_first) == 0:
+            continue
+
+        dc_type = None
+        if hasattr(cpd, "decomposition_type"):
+            dc_type = cpd.decomposition_type
+
+        # Canonical
+        dc = []
+
+        finished = False
+        changed = False
+        if dc_type is None:
+            dc = cpd.decomposition_first
+        else:
+            finished = True
+            changed = True
+
+        while not finished:
+            finished = True
+
+            dc_new = []
+            for dcp in dc:
+                if dcp not in ud_codepoints_index:
+                    dc_new.append(dcp)
+                    continue
+
+                scpr = ud_codepoints_index[dcp]
+                scpd = scpr.data
+
+                if (
+                    hasattr(scpd, "decomposition_type")
+                    or not hasattr(scpd, "decomposition_first")
+                    or (
+                        len(scpd.decomposition_first) == 1
+                        and scpd.decomposition_first[0] == dcp
+                    )
+                ):
+                    dc_new.append(dcp)
+                    continue
+
+                finished = False
+                changed = True
+                dc_new = dc_new + scpd.decomposition_first
+
+            if not finished:
+                dc = dc_new
+
+        if not changed:
+            if hasattr(cpd, "decomposition_offset"):
+                cpd.decomposition_full_offset = cpd.decomposition_offset
+                cpd.decomposition_full_length = cpd.decomposition_length
+        elif len(dc) == 0:
+            pass
+        else:
+            cpd.decomposition_full_offset = len(ud_decompositions)
+            cpd.decomposition_full_length = len(dc)
+            ud_decompositions = ud_decompositions + dc
+            if len(dc) > ud_decomposition_max_length:
+                ud_decomposition_max_length = len(dc)
+
+        dc_c = dc
+
+        # Compatibility
+        dc = cpd.decomposition_first
+
+        finished = False
+        changed = False
+        while not finished:
+            finished = True
+
+            dc_new = []
+            for dcp in dc:
+                if dcp not in ud_codepoints_index:
+                    dc_new.append(dcp)
+                    continue
+
+                scpr = ud_codepoints_index[dcp]
+                scpd = scpr.data
+
+                if not hasattr(scpd, "decomposition_first") or (
+                    len(scpd.decomposition_first) == 1
+                    and scpd.decomposition_first[0] == dcp
+                ):
+                    dc_new.append(dcp)
+                    continue
+
+                finished = False
+                changed = True
+                dc_new = dc_new + scpd.decomposition_first
+
+            if not finished:
+                dc = dc_new
+
+        if not changed:
+            if hasattr(cpd, "decomposition_offset"):
+                cpd.decomposition_full_k_offset = cpd.decomposition_offset
+                cpd.decomposition_full_k_length = cpd.decomposition_length
+        elif dc == dc_c:
+            cpd.decomposition_full_k_offset = cpd.decomposition_full_offset
+            cpd.decomposition_full_k_length = cpd.decomposition_full_length
+        else:
+            cpd.decomposition_full_k_offset = len(ud_decompositions)
+            cpd.decomposition_full_k_length = len(dc)
+            ud_decompositions = ud_decompositions + dc
+            if len(dc) > ud_decomposition_max_length:
+                ud_decomposition_max_length = len(dc)
+
+
 def create_cp_range_index():
     global ud_codepoints
     global ud_codepoints_index
@@ -501,7 +660,11 @@ def write_tables_h():
         print("#define UNICODE_DATA_TABLES_H")
         print("")
         print_top_message()
-        print('#include "unicode-data-static.h"')
+        print('#include "unicode-data-types.h"')
+        print("")
+        print(
+            "#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length
+        )
         print("")
         print("extern const struct unicode_code_point_data unicode_code_points[];")
         print("")
@@ -510,6 +673,8 @@ def write_tables_h():
         print("extern const uint16_t unicode_code_points_index24[];")
         print("extern const uint16_t unicode_code_points_index32[];")
         print("")
+        print("extern const uint32_t unicode_decompositions[];")
+        print("")
         print("#endif")
 
     sys.stdout = orig_stdout
@@ -554,6 +719,32 @@ def write_tables_c():
                 "\t\t.general_category = %s,"
                 % get_general_category_def(cpd.general_category)
             )
+            if hasattr(cpd, "decomposition_type"):
+                print(
+                    "\t\t.decomposition_type = %s,"
+                    % decomposition_type_def(cpd.decomposition_type)
+                )
+            if hasattr(cpd, "decomposition_length"):
+                print("\t\t.decomposition_first_length = %u," % cpd.decomposition_length)
+                print("\t\t.decomposition_first_offset = %u," % cpd.decomposition_offset)
+            if hasattr(cpd, "decomposition_full_length"):
+                print(
+                    "\t\t.decomposition_full_length = %u,"
+                    % cpd.decomposition_full_length
+                )
+                print(
+                    "\t\t.decomposition_full_offset = %u,"
+                    % cpd.decomposition_full_offset
+                )
+            if hasattr(cpd, "decomposition_full_k_length"):
+                print(
+                    "\t\t.decomposition_full_k_length = %u,"
+                    % cpd.decomposition_full_k_length
+                )
+                print(
+                    "\t\t.decomposition_full_k_offset = %u,"
+                    % cpd.decomposition_full_k_offset
+                )
             if hasattr(cpd, "simple_titlecase_mapping"):
                 print(
                     "\t\t.simple_titlecase_mapping = 0x%04X,"
@@ -761,6 +952,85 @@ def write_tables_c():
                 print(" ", end="")
         print(",")
         print("};")
+        print("")
+        print("const uint32_t unicode_decompositions[] = {")
+        print_list(ud_decompositions)
+        print(",")
+        print("};")
+
+    sys.stdout = orig_stdout
+
+
+def write_types_h():
+    global output_dir
+    global ud_decomposition_type_names
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-types.h", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+
+        print("#ifndef UNICODE_DATA_TYPES_H")
+        print("#define UNICODE_DATA_TYPES_H")
+        print("")
+        print_top_message()
+        print('#include "unicode-data-static.h"')
+        print("")
+        print("/* Decomposition_Type */")
+        print("enum unicode_decomposition_type {")
+        print("\t/* Canonical */")
+        print("\tUNICODE_DECOMPOSITION_TYPE_CANONICAL = 0,")
+        for dt in ud_decomposition_type_names:
+            dt_uc = dt.upper()
+
+            if dt_uc == "CANONICAL":
+                continue
+
+            print("\t/* <%s> */" % dt)
+            print("\tUNICODE_DECOMPOSITION_TYPE_%s," % dt_uc)
+        print("};")
+        print("")
+        print("/* Decomposition_Type */")
+        print("enum unicode_decomposition_type")
+        print("unicode_decomposition_type_from_string(const char *str);")
+        print("")
+        print("#endif")
+
+    sys.stdout = orig_stdout
+
+
+def write_types_c():
+    global output_dir
+    global ud_decomposition_type_names
+
+    orig_stdout = sys.stdout
+
+    with open(output_dir + "/unicode-data-types.c", mode="w", encoding="utf-8") as fd:
+        sys.stdout = fd
+
+        print_top_message()
+        print('#include "lib.h"')
+        print('#include "unicode-data-types.h"')
+        print("")
+        print("/* Decomposition_Type */")
+        print("enum unicode_decomposition_type")
+        print("unicode_decomposition_type_from_string(const char *str)")
+        print("{")
+        print("\t/* Canonical */")
+        print('\tif (strcasecmp(str, "Canonical") == 0)')
+        print("\t\treturn UNICODE_DECOMPOSITION_TYPE_CANONICAL;")
+        for dt in ud_decomposition_type_names:
+            dt_uc = dt.upper()
+
+            if dt_uc == "CANONICAL":
+                continue
+
+            print("\t/* <%s> */" % dt)
+            print('\telse if (strcasecmp(str, "%s") == 0)' % dt)
+            print("\t\treturn UNICODE_DECOMPOSITION_TYPE_%s;" % dt_uc)
+        print("")
+        print("\treturn UNICODE_DECOMPOSITION_TYPE_CANONICAL;")
+        print("}")
 
     sys.stdout = orig_stdout
 
@@ -794,11 +1064,14 @@ def main():
     source_files.sort()
 
     create_cp_range_index()
+    expand_decompositions()
 
     create_cp_index_tables()
 
     write_tables_h()
     write_tables_c()
+    write_types_h()
+    write_types_c()
 
 
 if __name__ == "__main__":