# Base URL the Unicode Character Database (UCD) files are downloaded from
# when they are not already present in UCD_DIR.
UCD_URL = https://dovecot.org/res
# Directory below the source directory that holds the UCD input files.
UCD_DIR = $(srcdir)/ucd
# The UCD input files; each one has its own download rule.
UCD_FILES = \
+ $(UCD_DIR)/CompositionExclusions.txt \
$(UCD_DIR)/DerivedCoreProperties.txt \
+ $(UCD_DIR)/DerivedNormalizationProps.txt \
$(UCD_DIR)/PropertyValueAliases.txt \
$(UCD_DIR)/UnicodeData.txt
# dependency, anything including the header will race the bison process.
event-filter-parser.h: event-filter-parser.c
+# Each UCD file is fetched only when it is not already present. The download
+# goes to a temporary name and is renamed into place on success, so that a
+# failed or interrupted transfer cannot leave a truncated file behind that
+# the "test -f" guard would accept on the next run.
+$(UCD_DIR)/CompositionExclusions.txt:
+	$(AM_V_at)test -f $@ || \
+	  { $(WGET) -nv -O $@.tmp $(UCD_URL)/CompositionExclusions.txt && \
+	    mv -f $@.tmp $@; } || \
+	  { rm -f $@.tmp; exit 1; }
$(UCD_DIR)/DerivedCoreProperties.txt:
	$(AM_V_at)test -f $@ || \
	  { $(WGET) -nv -O $@.tmp $(UCD_URL)/DerivedCoreProperties.txt && \
	    mv -f $@.tmp $@; } || \
	  { rm -f $@.tmp; exit 1; }
+$(UCD_DIR)/DerivedNormalizationProps.txt:
+	$(AM_V_at)test -f $@ || \
+	  { $(WGET) -nv -O $@.tmp $(UCD_URL)/DerivedNormalizationProps.txt && \
+	    mv -f $@.tmp $@; } || \
+	  { rm -f $@.tmp; exit 1; }
$(UCD_DIR)/PropertyValueAliases.txt:
	$(AM_V_at)test -f $@ || \
	  { $(WGET) -nv -O $@.tmp $(UCD_URL)/PropertyValueAliases.txt && \
	    mv -f $@.tmp $@; } || \
	  { rm -f $@.tmp; exit 1; }
$(UCD_DIR)/UnicodeData.txt:
#include <fcntl.h>
+#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
+#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
+/* Parse one data line of a UCD property file.
+
+   The first ';'-separated column is a hexadecimal code point or a
+   "first..last" range; it is returned in *cp_first_r and *cp_last_r (both
+   get the same value for a single code point). When prop_r is non-NULL a
+   second column is required and returned, trimmed, in *prop_r; when value_r
+   is non-NULL as well, *value_r receives the trimmed third column, or NULL
+   if the line has none. file and line_num are used only in failure
+   messages.
+
+   Returns FALSE after recording a test failure when the line is malformed,
+   and also when the running test has already failed; TRUE otherwise. */
+static bool
+parse_prop_file_line(const char *line, const char *file, unsigned int line_num,
+		     uint32_t *cp_first_r, uint32_t *cp_last_r,
+		     const char **prop_r, const char **value_r)
+{
+	unsigned int expected_columns = 1;
+
+	if (prop_r != NULL)
+		expected_columns++;
+
+	const char *const *columns = t_strsplit(line, ";");
+	if (str_array_length(columns) < expected_columns) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u", file, line_num));
+		return FALSE;
+	}
+
+	/* Column 0 is either "XXXX" or "XXXX..YYYY" */
+	const char *p = strstr(columns[0], "..");
+	const char *cp_first_hex, *cp_last_hex;
+
+	cp_last_hex = NULL;
+	if (p == NULL) {
+		cp_first_hex = t_str_trim(columns[0], " \t");
+	} else {
+		cp_first_hex = t_str_trim(t_strdup_until(columns[0], p), " \t");
+		cp_last_hex = t_str_trim(p + 2, " \t");
+	}
+	if (str_to_uint32_hex(cp_first_hex, cp_first_r) < 0) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Bad first code point", file, line_num));
+		return FALSE;
+	}
+	if (cp_last_hex == NULL)
+		*cp_last_r = *cp_first_r;
+	else if (str_to_uint32_hex(cp_last_hex, cp_last_r) < 0) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Bad last code point", file, line_num));
+		return FALSE;
+	}
+
+	if (prop_r != NULL) {
+		/* columns[1] exists: the column count was checked above */
+		*prop_r = t_str_trim(columns[1], " \t");
+		if (value_r != NULL) {
+			/* columns[2] is either the value or the array's
+			   NULL terminator */
+			if (columns[2] != NULL)
+				*value_r = t_str_trim(columns[2], " \t");
+			else
+				*value_r = NULL;
+		}
+	}
+	return !test_has_failed();
+}
+
+/* Verify one CompositionExclusions.txt line: every code point in the range
+   listed on the line must have composition_count == 0 in the compiled
+   data. */
+static void
+test_composition_exclusions_line(const char *line, unsigned int line_num)
+{
+	uint32_t range_start, range_end;
+
+	if (!parse_prop_file_line(line, UCD_COMPOSITION_EXCLUSIONS_TXT,
+				  line_num, &range_start, &range_end,
+				  NULL, NULL)) {
+		/* Nothing to check: parsing failed or the test has already
+		   failed. */
+		return;
+	}
+
+	uint32_t cp = range_start;
+	while (cp <= range_end && !test_has_failed()) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+
+		test_assert_idx(cp_data->composition_count == 0, cp);
+		cp++;
+	}
+}
+
+/* Verify one DerivedNormalizationProps.txt line against the compiled data.
+
+   Only the NFD_QC, NFKD_QC, NFC_QC and NFKC_QC properties are checked;
+   lines for any other property are ignored. For value "N" every code point
+   in the range must have the form's *_QUICK_CHECK_NO bits set in
+   nf_quick_check, for value "M" the *_QUICK_CHECK_MAYBE bits. Other values
+   are not checked. */
+static void
+test_derived_normalization_props_line(const char *line, unsigned int line_num)
+{
+	uint32_t cp_first, cp_last, cp;
+	const char *prop, *value;
+	uint8_t qc_mask, qc_no, qc_maybe;
+
+	if (!parse_prop_file_line(line, UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+				  line_num, &cp_first, &cp_last, &prop, &value))
+		return;
+
+	/* The property is the same for the whole range, so pick the bit
+	   field once rather than comparing strings for every code point. */
+	if (strcmp(prop, "NFD_QC") == 0) {
+		qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFD_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFD_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFKD_QC") == 0) {
+		qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFKD_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFC_QC") == 0) {
+		qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFC_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFC_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFKC_QC") == 0) {
+		qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFKC_QUICK_CHECK_MAYBE;
+	} else {
+		/* Not a quick check property */
+		return;
+	}
+
+	/* A quick check line without a value column is malformed data;
+	   report it as a test failure like other parse errors. */
+	if (value == NULL) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Missing value for property %s",
+			UCD_DERIVED_NORMALIZATION_PROPS_TXT, line_num, prop));
+		return;
+	}
+
+	for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+		uint8_t qc = (cp_data->nf_quick_check & qc_mask);
+
+		if (strcmp(value, "N") == 0)
+			test_assert_idx(qc == qc_no, cp);
+		else if (strcmp(value, "M") == 0)
+			test_assert_idx(qc == qc_maybe, cp);
+	}
+}
+
static void test_unicode_data_line(const char *line, unsigned int line_num)
{
static uint32_t cp_first = 0;
}
test_assert(!unicode_general_category_is_group(general_category));
+ /* Parse Canonical_Combining_Class */
+
+ unsigned int ccc = 0;
+ if (*columns[3] != '\0' &&
+ (str_to_uint(columns[3], &ccc) < 0 || ccc > UINT8_MAX)) {
+ test_failed(t_strdup_printf(
+ "Invalid data at %s:%u: "
+ "Bad Canonical_Combining_Class for code point %"PRIu32": %s",
+ UCD_UNICODE_DATA_TXT, line_num, cp, columns[3]));
+ return;
+ }
+
/* Parse Decomposition_* */
const char *decomp_spec = columns[5];
test_assert_idx(
cp_data->general_category == general_category, cp);
+ test_assert_idx(
+ cp_data->canonical_combining_class == ccc, cp);
const uint32_t *cp_decomp;
size_t cp_decomp_len, cp_decomp_idx;
void test_unicode_data(void)
{
- /* Check that UCD data files match with what is compiled. */
+ /* Check that the UCD data files match what is compiled in. Each file
+ is handed to test_ucd_file() together with the function that checks
+ one line of it. For the property files only the positive assignment
+ of properties to the code points mentioned in the files is tested,
+ and notably not their absence for other code points. */
+ test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT,
+ test_composition_exclusions_line);
+ test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+ test_derived_normalization_props_line);
test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
}
UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
};
+/* UAX #44, Section 5.7.5: Decompositions and Normalization
+
+   Quick check values for the four normalization forms, packed two bits per
+   form so that all four fit in the uint8_t nf_quick_check field of struct
+   unicode_code_point_data: NFD in bits 0-1, NFKD in bits 2-3, NFC in
+   bits 4-5 and NFKC in bits 6-7. Within each two-bit field 0 is YES, 1 is
+   NO and 2 is MAYBE; the *_MASK values select one form's field.
+ */
+enum unicode_nf_quick_check {
+ UNICODE_NFKC_QUICK_CHECK_YES = (0x00 << 6),
+ UNICODE_NFKC_QUICK_CHECK_NO = (0x01 << 6),
+ UNICODE_NFKC_QUICK_CHECK_MAYBE = (0x02 << 6),
+ UNICODE_NFC_QUICK_CHECK_YES = (0x00 << 4),
+ UNICODE_NFC_QUICK_CHECK_NO = (0x01 << 4),
+ UNICODE_NFC_QUICK_CHECK_MAYBE = (0x02 << 4),
+ UNICODE_NFKD_QUICK_CHECK_YES = (0x00 << 2),
+ UNICODE_NFKD_QUICK_CHECK_NO = (0x01 << 2),
+ UNICODE_NFKD_QUICK_CHECK_MAYBE = (0x02 << 2),
+ UNICODE_NFD_QUICK_CHECK_YES = (0x00 << 0),
+ UNICODE_NFD_QUICK_CHECK_NO = (0x01 << 0),
+ UNICODE_NFD_QUICK_CHECK_MAYBE = (0x02 << 0),
+
+ UNICODE_NFKC_QUICK_CHECK_MASK = (0x03 << 6),
+ UNICODE_NFC_QUICK_CHECK_MASK = (0x03 << 4),
+ UNICODE_NFKD_QUICK_CHECK_MASK = (0x03 << 2),
+ UNICODE_NFD_QUICK_CHECK_MASK = (0x03 << 0),
+};
+
struct unicode_code_point_data {
uint8_t general_category; // Not yet used
+ uint8_t canonical_combining_class; // Canonical_Combining_Class value
+ uint8_t nf_quick_check; // Combination of enum unicode_nf_quick_check values
uint8_t decomposition_type; // Not yet used
uint8_t decomposition_first_length;
uint8_t decomposition_full_length;
uint8_t decomposition_full_k_length;
+ uint8_t composition_count; // Number of table entries starting at composition_offset
+
uint16_t decomposition_first_offset;
uint16_t decomposition_full_offset;
uint16_t decomposition_full_k_offset;
+ uint16_t composition_offset; // Index into unicode_compositions[] and unicode_composition_primaries[]
uint32_t simple_titlecase_mapping;
};
return cp_data->decomposition_full_k_length;
}
+/* Look up the composition of the code point described by cp_data followed
+   by the code point second.
+
+   The composition_count entries of unicode_compositions[] that start at
+   composition_offset are binary-searched for second; the generator script
+   sorts each code point's entries by that value. On a match the entry at
+   the same index in unicode_composition_primaries[] is returned, otherwise
+   0x0000. */
+static inline uint32_t
+unicode_code_point_data_find_composition(
+	const struct unicode_code_point_data *cp_data, uint32_t second)
+{
+	const size_t base = cp_data->composition_offset;
+	size_t lo = 0, hi = cp_data->composition_count;
+
+	/* Search the half-open interval [lo, hi) */
+	while (lo < hi) {
+		const size_t mid = lo + (hi - lo) / 2;
+		const uint32_t candidate = unicode_compositions[base + mid];
+
+		if (candidate == second)
+			return unicode_composition_primaries[base + mid];
+		if (candidate < second)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+
+	return 0x0000;
+}
+
static inline size_t
unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical,
const uint32_t **decomp_r)
ud_decompositions = []
ud_decomposition_max_length = 0
+ud_composition_pairs = {}  # first code point -> list of (second code point, composite) tuples
+ud_composition_composites = {}  # NOTE(review): not referenced in the code visible here - confirm it is needed
+ud_composition_exclusions = {}  # code points listed in CompositionExclusions.txt (used as a set)
+ud_compositions = []  # flat table of second code points, grouped by first code point
+ud_composition_primaries = []  # composite for the entry at the same index in ud_compositions
+ud_compositions_max_per_starter = 0  # largest number of pairs recorded for one first code point
+
class UCDFileOpen:
def __init__(self, filename):
# Add range
CodePointRange(cp_first, cp_last, cpd)
+ # CompositionExclusions.txt
+ with UCDFileOpen("CompositionExclusions.txt") as ucd:
+ for line in ucd.fd:
+ data = line.split("#")
+
+ cprng = parse_cp_range(data[0])
+ if cprng is None:
+ continue
+
+ for cp in range(cprng[0], cprng[1] + 1):
+ ud_composition_exclusions[cp] = True
+
+ # DerivedNormalizationProps.txt
+ with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
+ line_num = 0
+ for line in ucd.fd:
+ line_num = line_num + 1
+ data = line.split("#")
+ line = data[0].strip()
+ if len(line) == 0:
+ continue
+
+ cols = line.split(";")
+ if len(cols) < 3:
+ if len(cols) < 2:
+ die(f"{ucd}:{line_num}: Missing columns")
+ continue
+
+ cprng = parse_cp_range(cols[0])
+ if cprng is None:
+ continue
+
+ prop = cols[1].strip()
+ value = cols[2].strip()
+ if prop == "NFD_QC":
+ cpd = CodePointData()
+ cpd.nfd_quick_check = value
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "NFKD_QC":
+ cpd = CodePointData()
+ cpd.nfkd_quick_check = value
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "NFC_QC":
+ cpd = CodePointData()
+ cpd.nfc_quick_check = value
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "NFKC_QC":
+ cpd = CodePointData()
+ cpd.nfkc_quick_check = value
+ CodePointRange(cprng[0], cprng[1], cpd)
+
def expand_decompositions():
global ud_codepoints
ud_decomposition_max_length = len(dc)
+def derive_canonical_compositions():
+ global ud_codepoints
+ global ud_decompositions
+ global ud_composition_exclusions
+ global ud_composition_pairs
+ global ud_composition_composites
+ global ud_compositions
+ global ud_composition_primaries
+ global ud_compositions_max_per_starter
+
+ for cpr in ud_codepoints:
+ if cpr.cp_last > cpr.cp_first:
+ # No compositions in ranges expected, ever
+ continue
+ cp = cpr.cp_first
+ cpd = cpr.data
+
+ if not hasattr(cpd, "decomposition_full_offset"):
+ continue
+
+ # Skip singleton decompositions
+ if len(cpd.decomposition_first) < 2:
+ continue
+
+ # Skip non-starter decompositions
+ dc_offset = cpd.decomposition_full_offset
+ dc_len = cpd.decomposition_full_length
+ dc = ud_decompositions[dc_offset:(dc_offset + dc_len)]
+
+ scpr = ud_codepoints_index[dc[0]]
+ scpd = scpr.data
+ if (
+ hasattr(scpd, "canonical_combining_class")
+ and scpd.canonical_combining_class > 0
+ ):
+ continue
+
+ # Skip composition exclusions
+ if cp in ud_composition_exclusions:
+ continue
+
+ dc = cpd.decomposition_first
+
+ # Record all alternative pairs for each starter
+ if not dc[0] in ud_composition_pairs:
+ mp = [(dc[1], cp)]
+ ud_composition_pairs[dc[0]] = mp
+ else:
+ mp = ud_composition_pairs[dc[0]]
+ mp.append((dc[1], cp))
+
+ if len(mp) > ud_compositions_max_per_starter:
+ ud_compositions_max_per_starter = len(mp)
+
+ # Compose lookup tables
+ for cpr in ud_codepoints:
+ if cpr.cp_last > cpr.cp_first:
+ # No compositions in ranges expected, ever
+ continue
+ cp = cpr.cp_first
+ cpd = cpr.data
+
+ if cp not in ud_composition_pairs:
+ continue
+
+ def mp_key_func(a):
+ return a[0]
+
+ mp = ud_composition_pairs[cp]
+ mp.sort(key=mp_key_func)
+
+ cpd.composition_offset = len(ud_compositions)
+ cpd.composition_count = len(mp)
+
+ ud_compositions = ud_compositions + [p[0] for p in mp]
+ ud_composition_primaries = ud_composition_primaries + [p[1] for p in mp]
+
+
def create_cp_range_index():
global ud_codepoints
global ud_codepoints_index
print(
"#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length
)
+ print(
+ "#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s"
+ % ud_compositions_max_per_starter
+ )
print("")
print("extern const struct unicode_code_point_data unicode_code_points[];")
print("")
print("")
print("extern const uint32_t unicode_decompositions[];")
print("")
+ print("extern const uint32_t unicode_compositions[];")
+ print("extern const uint32_t unicode_composition_primaries[];")
+ print("")
print("#endif")
sys.stdout = orig_stdout
"\t\t.general_category = %s,"
% get_general_category_def(cpd.general_category)
)
+ if (
+ hasattr(cpd, "canonical_combining_class")
+ and cpd.canonical_combining_class > 0
+ ):
+ print(
+ "\t\t.canonical_combining_class = %u,"
+ % cpd.canonical_combining_class
+ )
+ if (
+ hasattr(cpd, "nfd_quick_check")
+ or hasattr(cpd, "nfkd_quick_check")
+ or hasattr(cpd, "nfc_quick_check")
+ or hasattr(cpd, "nfkc_quick_check")
+ ):
+ print("\t\t.nf_quick_check = (", end="")
+ if hasattr(cpd, "nfkc_quick_check"):
+ if cpd.nfkc_quick_check == "N":
+ print("UNICODE_NFKC_QUICK_CHECK_NO", end="")
+ elif cpd.nfkc_quick_check == "M":
+ print("UNICODE_NFKC_QUICK_CHECK_MAYBE", end="")
+ if hasattr(cpd, "nfkc_quick_check") and hasattr(cpd, "nfc_quick_check"):
+ print(" |")
+ print("\t\t\t\t ", end="")
+ if hasattr(cpd, "nfc_quick_check"):
+ if cpd.nfc_quick_check == "N":
+ print("UNICODE_NFC_QUICK_CHECK_NO", end="")
+ elif cpd.nfc_quick_check == "M":
+ print("UNICODE_NFC_QUICK_CHECK_MAYBE", end="")
+ if (
+ hasattr(cpd, "nfkc_quick_check") or hasattr(cpd, "nfc_quick_check")
+ ) and hasattr(cpd, "nfkd_quick_check"):
+ print(" |")
+ print("\t\t\t\t ", end="")
+ if hasattr(cpd, "nfkd_quick_check"):
+ if cpd.nfkd_quick_check == "N":
+ print("UNICODE_NFKD_QUICK_CHECK_NO", end="")
+ elif cpd.nfkd_quick_check == "M":
+ print("UNICODE_NFKD_QUICK_CHECK_MAYBE", end="")
+ if (
+ hasattr(cpd, "nfkc_quick_check")
+ or hasattr(cpd, "nfc_quick_check")
+ or hasattr(cpd, "nfkd_quick_check")
+ ) and hasattr(cpd, "nfd_quick_check"):
+ print(" |")
+ print("\t\t\t\t ", end="")
+ if hasattr(cpd, "nfd_quick_check"):
+ if cpd.nfd_quick_check == "N":
+ print("UNICODE_NFD_QUICK_CHECK_NO", end="")
+ elif cpd.nfd_quick_check == "M":
+ print("UNICODE_NFD_QUICK_CHECK_MAYBE", end="")
+ print("),")
if hasattr(cpd, "decomposition_type"):
print(
"\t\t.decomposition_type = %s,"
"\t\t.decomposition_full_k_offset = %u,"
% cpd.decomposition_full_k_offset
)
+ if hasattr(cpd, "composition_count"):
+ print("\t\t.composition_count = %u," % cpd.composition_count)
+ print("\t\t.composition_offset = %u," % cpd.composition_offset)
if hasattr(cpd, "simple_titlecase_mapping"):
print(
"\t\t.simple_titlecase_mapping = 0x%04X,"
print_list(ud_decompositions)
print(",")
print("};")
+ print("")
+ print("const uint32_t unicode_compositions[] = {")
+ print_list(ud_compositions)
+ print(",")
+ print("};")
+ print("")
+ print("const uint32_t unicode_composition_primaries[] = {")
+ print_list(ud_composition_primaries)
+ print(",")
+ print("};")
sys.stdout = orig_stdout
create_cp_range_index()
expand_decompositions()
+ derive_canonical_compositions()
create_cp_index_tables()