lib: unicode-data - Add case folding mappings

author Stephan Bosch <stephan.bosch@open-xchange.com>

Tue, 1 Apr 2025 01:07:30 +0000 (03:07 +0200)

committer Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Tue, 1 Apr 2025 01:07:30 +0000 (03:07 +0200)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am

index 8fe2070fd6f4f796a4110b73bd3c78a82ae7f327..61e48817a9bdae217901439a949b2effaae4d852 100644 (file)
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -14,6 +14,7 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \
  UCD_URL = https://dovecot.org/res
  UCD_DIR = $(srcdir)/ucd
  UCD_FILES = \
+       $(UCD_DIR)/CaseFolding.txt \
         $(UCD_DIR)/CompositionExclusions.txt \
         $(UCD_DIR)/DerivedCoreProperties.txt \
         $(UCD_DIR)/DerivedNormalizationProps.txt \
@@ -53,6 +54,8 @@ YACC=/bin/false
  # dependency, anything including the header will race the bison process.
  event-filter-parser.h: event-filter-parser.c
  
+$(UCD_DIR)/CaseFolding.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CaseFolding.txt
  $(UCD_DIR)/CompositionExclusions.txt:
         $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt
  $(UCD_DIR)/DerivedCoreProperties.txt:
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c

index 83ab3a7c72dd10a92434d8a9ef5ebf88655c0de3..d1b1f683094ad1162ec7441e943be3ed84013c88 100644 (file)
--- a/src/lib/test-unicode-data.c
+++ b/src/lib/test-unicode-data.c
@@ -9,6 +9,7 @@
  
  #include <fcntl.h>
  
+#define UCD_CASE_FOLDING_TXT "CaseFolding.txt"
  #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
  #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
  #define UCD_PROP_LIST_TXT "PropList.txt"
@@ -108,6 +109,58 @@ test_case_mapping(uint32_t cp, const char *const *parsed_mapping,
         }
  }
  
+static void test_case_folding_line(const char *line, unsigned int line_num)
+{
+       const char *const *columns = t_strsplit(line, ";");
+       size_t num_columns = str_array_length(columns);
+
+       /* CaseFolding.txt line: <code>; <status>; <mapping>; # <name> */
+
+       if (num_columns < 4) {
+               test_failed(t_strdup_printf(
+                       "Invalid data at %s:%u",
+                       UCD_CASE_FOLDING_TXT, line_num));
+               return;
+       }
+
+       if (num_columns > 4 && strlen(t_str_trim(columns[4], " ")) > 0) {
+               /* Skip lines that carry a non-empty fifth column */
+               return;
+       }
+
+       const char *cp_hex = t_str_trim(columns[0], " ");
+       uint32_t cp;
+
+       if (str_to_uint32_hex(cp_hex, &cp) < 0) {
+               test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad code point",
+                               UCD_CASE_FOLDING_TXT, line_num));
+               return;
+       }
+
+       /* Parse status and mapping; only status C and F lines are checked */
+
+       const char *status = t_str_trim(columns[1], " ");
+
+       if (strcmp(status, "C") != 0 && strcmp(status, "F") != 0)
+               return;
+
+       const char *mapping = t_str_trim(columns[2], " ");
+       const char *const *map = t_strsplit(mapping, " ");
+
+       /* Compare the parsed mapping with the compiled casefold mapping */
+
+       const struct unicode_code_point_data *cp_data =
+               unicode_code_point_get_data(cp);
+       const uint32_t *case_map;
+       size_t case_map_len;
+
+       case_map_len = unicode_code_point_data_get_casefold_mapping(
+               cp_data, &case_map);
+       test_case_mapping(cp, map, case_map, case_map_len);
+}
+
  static void
  test_composition_exclusions_line(const char *line, unsigned int line_num)
  {
@@ -542,6 +595,7 @@ void test_unicode_data(void)
            property files only the positive assignment of properties to the
            code points mentioned in the files is tested, and notably not their
            absence for other code points. */
+       test_ucd_file(UCD_CASE_FOLDING_TXT, test_case_folding_line);
         test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT,
                       test_composition_exclusions_line);
         test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h

index a2a66c5c94e6ce678abd343c87d7196b4568ba65..9ff142e8ba14b1d9ccc82161219acdc0f94d530f 100644 (file)
--- a/src/lib/unicode-data-static.h
+++ b/src/lib/unicode-data-static.h
@@ -129,6 +129,7 @@ struct unicode_code_point_data {
  
         uint8_t uppercase_mapping_length;
         uint8_t lowercase_mapping_length;
+       uint8_t casefold_mapping_length;
  
         uint16_t decomposition_first_offset;
         uint16_t decomposition_full_offset;
@@ -137,6 +138,7 @@ struct unicode_code_point_data {
  
         uint16_t uppercase_mapping_offset;
         uint16_t lowercase_mapping_offset;
+       uint16_t casefold_mapping_offset;
  
         uint32_t simple_titlecase_mapping;
  
diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h

index 0fea0930cbee97f8b01eab062d32440fb6dc9411..801eadd737e4e29d8769e6a0e6cf0f99f63177ac 100644 (file)
--- a/src/lib/unicode-data.h
+++ b/src/lib/unicode-data.h
@@ -115,6 +115,18 @@ unicode_code_point_data_get_lowercase_mapping(
         return cp_data->lowercase_mapping_length;
  }
  
+static inline size_t
+unicode_code_point_data_get_casefold_mapping(
+       const struct unicode_code_point_data *cp_data,
+       const uint32_t **map_r)
+{
+       uint32_t offset; /* index into unicode_case_mappings[] */
+
+       offset = cp_data->casefold_mapping_offset;
+       *map_r = &unicode_case_mappings[offset];
+       return cp_data->casefold_mapping_length; /* entries at *map_r */
+}
+
  uint8_t unicode_general_category_from_string(const char *str);
  
  #endif
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py

index c0f6e415c23397bf04e77601385eb0f3ce8ab8b9..12e0ec76004a9572816e02bc07b4b3d81d2e212d 100755 (executable)
--- a/src/lib/unicode-ucd-compile.py
+++ b/src/lib/unicode-ucd-compile.py
@@ -368,6 +368,44 @@ def read_ucd_files():
              # Add range
              CodePointRange(cp_first, cp_last, cpd)
  
+    # CaseFolding.txt
+    with UCDFileOpen("CaseFolding.txt") as ucd:
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 3:
+                die(f"{ucd}:{line_num}: Missing columns")
+
+            cp_hex = cols[0].strip()
+            if len(cp_hex) == 0:
+                continue
+            cp = int(cp_hex, 16)
+
+            status = cols[1].strip()
+            mapping = cols[2].strip()
+
+            if status != "C" and status != "F":
+                continue
+
+            codes_hex = mapping.split(" ")
+            if len(codes_hex) > 0:
+                first_code_hex = codes_hex[0].strip()
+                first_code = int(first_code_hex, 16)
+                if len(codes_hex) > 1 or first_code != cp:
+                    codes = []
+                    for code_hex in codes_hex:
+                        codes.append(int(code_hex, 16))
+
+                    cpd = CodePointData()
+                    cpd.case_folding = codes
+                    CodePointRange(cp, cp, cpd)
+
      # CompositionExclusions.txt
      with UCDFileOpen("CompositionExclusions.txt") as ucd:
          for line in ucd.fd:
@@ -665,6 +703,23 @@ def resolve_case_mappings():
          if len(lcase_codes) > ud_case_mapping_max_length:
              ud_case_mapping_max_length = len(lcase_codes)
  
+        # Case_Folding
+        cfold_codes = []
+        if hasattr(cpd, "case_folding"):
+            cfold_codes = cpd.case_folding
+        if len(ucase_codes) > 0 and cfold_codes == ucase_codes:
+            cpd.casefold_mapping_length = cpd.uppercase_mapping_length
+            cpd.casefold_mapping_offset = cpd.uppercase_mapping_offset
+        elif len(lcase_codes) > 0 and cfold_codes == lcase_codes:
+            cpd.casefold_mapping_length = cpd.lowercase_mapping_length
+            cpd.casefold_mapping_offset = cpd.lowercase_mapping_offset
+        elif len(cfold_codes) > 0 and (len(cfold_codes) > 1 or cfold_codes[0] != cp):
+            cpd.casefold_mapping_offset = len(ud_case_mappings)
+            cpd.casefold_mapping_length = len(cfold_codes)
+            ud_case_mappings = ud_case_mappings + cfold_codes
+        if len(cfold_codes) > ud_case_mapping_max_length:
+            ud_case_mapping_max_length = len(cfold_codes)
+
  
  def expand_decompositions():
      global ud_codepoints
@@ -1215,6 +1270,12 @@ def write_tables_c():
                  print(
                      "\t\t.uppercase_mapping_offset = %s," % cpd.uppercase_mapping_offset
                  )
+            if (
+                hasattr(cpd, "casefold_mapping_length")
+                and cpd.casefold_mapping_length > 0
+            ):
+                print("\t\t.casefold_mapping_length = %s," % cpd.casefold_mapping_length)
+                print("\t\t.casefold_mapping_offset = %s," % cpd.casefold_mapping_offset)
              if hasattr(cpd, "simple_titlecase_mapping"):
                  print(
                      "\t\t.simple_titlecase_mapping = 0x%04X,"
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Tue, 1 Apr 2025 01:07:30 +0000 (03:07 +0200)
committer	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am		patch \| blob \| blame \| history
src/lib/test-unicode-data.c		patch \| blob \| blame \| history
src/lib/unicode-data-static.h		patch \| blob \| blame \| history
src/lib/unicode-data.h		patch \| blob \| blame \| history
src/lib/unicode-ucd-compile.py		patch \| blob \| blame \| history