$(UCD_DIR)/NormalizationTest.txt \
$(UCD_DIR)/PropertyValueAliases.txt \
$(UCD_DIR)/PropList.txt \
+ $(UCD_DIR)/SpecialCasing.txt \
$(UCD_DIR)/UnicodeData.txt \
$(UCD_DIR)/WordBreakProperty.txt
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
+# Each UCD data file below is fetched from $(UCD_URL) with $(WGET), unless the
+# target file already exists.
$(UCD_DIR)/PropList.txt:
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropList.txt
+$(UCD_DIR)/SpecialCasing.txt:
+ $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/SpecialCasing.txt
$(UCD_DIR)/UnicodeData.txt:
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt
$(UCD_DIR)/WordBreakProperty.txt:
#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
#define UCD_PROP_LIST_TXT "PropList.txt"
+#define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt"
#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
#define UCD_WORD_BREAK_PROPERTY_TXT "WordBreakProperty.txt"
return !test_has_failed();
}
+/* Compare the case mapping compiled for code point cp with the mapping
+   parsed from a UCD file.
+
+   parsed_mapping is a NULL-terminated array of hexadecimal code point
+   strings. case_map points to case_map_len compiled code points. A single
+   parsed entry paired with a compiled length of 0 is accepted when the
+   parsed entry equals cp itself (identity mapping). Every failure is
+   reported through test_assert_idx() with cp as the index. */
+static void
+test_case_mapping(uint32_t cp, const char *const *parsed_mapping,
+		  const uint32_t *case_map, unsigned int case_map_len)
+{
+	unsigned int parsed_mapping_len = str_array_length(parsed_mapping);
+	unsigned int i;
+	uint32_t mcp;
+
+	if (parsed_mapping_len == 1 && case_map_len == 0) {
+		/* Maps to itself (compiled as len == 0) */
+		test_assert_idx(
+			str_to_uint32_hex(parsed_mapping[0], &mcp) >= 0, cp);
+		if (test_has_failed())
+			return;
+		test_assert_idx(mcp == cp, cp);
+		return;
+	}
+
+	/* Explicit mapping: lengths must agree before case_map[] is indexed.
+	   Use the _idx variant so that the failing code point is reported. */
+	test_assert_idx(parsed_mapping_len == case_map_len, cp);
+	if (test_has_failed())
+		return;
+
+	for (i = 0; i < parsed_mapping_len && !test_has_failed(); i++) {
+		test_assert_idx(
+			str_to_uint32_hex(parsed_mapping[i], &mcp) >= 0, cp);
+		if (test_has_failed())
+			return;
+		test_assert_idx(uni_is_valid_ucs4(mcp), cp);
+		test_assert_idx(mcp == case_map[i], cp);
+	}
+}
+
+
static void
test_composition_exclusions_line(const char *line, unsigned int line_num)
{
}
}
+/* Check one SpecialCasing.txt data line against the compiled code point
+   data. Line format:
+
+     <code>; <lower>; <title>; <upper>; (<condition_list>;)?
+
+   Lines with a non-empty condition list are skipped. For all other lines
+   the <upper> and <lower> columns are compared with the compiled uppercase
+   and lowercase mappings; the <title> column is not checked here. */
+static void test_special_casing_line(const char *line, unsigned int line_num)
+{
+	const char *const *fields = t_strsplit(line, ";");
+	size_t field_count = str_array_length(fields);
+	const struct unicode_code_point_data *cp_data;
+	const char *const *expected;
+	const uint32_t *compiled;
+	size_t compiled_len;
+	uint32_t cp;
+
+	if (field_count < 4) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u",
+			UCD_SPECIAL_CASING_TXT, line_num));
+		return;
+	}
+	if (field_count > 4 && *t_str_trim(fields[4], " ") != '\0') {
+		/* Skip lines with condition list */
+		return;
+	}
+
+	if (str_to_uint32_hex(t_str_trim(fields[0], " "), &cp) < 0) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u: "
+			"Bad code point",
+			UCD_SPECIAL_CASING_TXT, line_num));
+		return;
+	}
+	cp_data = unicode_code_point_get_data(cp);
+
+	/* Uppercase mapping: fourth column */
+	expected = t_strsplit(t_str_trim(fields[3], " "), " ");
+	compiled_len = unicode_code_point_data_get_uppercase_mapping(
+		cp_data, &compiled);
+	test_case_mapping(cp, expected, compiled, compiled_len);
+
+	/* Lowercase mapping: second column */
+	expected = t_strsplit(t_str_trim(fields[1], " "), " ");
+	compiled_len = unicode_code_point_data_get_lowercase_mapping(
+		cp_data, &compiled);
+	test_case_mapping(cp, expected, compiled, compiled_len);
+}
+
static void test_unicode_data_line(const char *line, unsigned int line_num)
{
static uint32_t cp_first = 0;
decomp++;
}
+ if (cp_data->uppercase_mapping_length == 1) {
+ const uint32_t *map;
+ size_t map_len =
+ unicode_code_point_data_get_uppercase_mapping(
+ cp_data, &map);
+ test_assert_idx(map_len == 1 &&
+ map[0] == simple_uppercase_mapping, cp);
+ }
+ if (cp_data->lowercase_mapping_length == 1) {
+ const uint32_t *map;
+ size_t map_len =
+ unicode_code_point_data_get_lowercase_mapping(
+ cp_data, &map);
+ test_assert_idx(map_len == 1 &&
+ map[0] == simple_lowercase_mapping, cp);
+ }
test_assert_idx(
cp_data->simple_titlecase_mapping == simple_titlecase_mapping,
cp);
test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
test_derived_normalization_props_line);
test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line);
+ test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line);
test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
test_ucd_file(UCD_WORD_BREAK_PROPERTY_TXT,
test_word_break_property_line);
uint8_t composition_count;
+ uint8_t uppercase_mapping_length;
+ uint8_t lowercase_mapping_length;
+
uint16_t decomposition_first_offset;
uint16_t decomposition_full_offset;
uint16_t decomposition_full_k_offset;
uint16_t composition_offset;
+ uint16_t uppercase_mapping_offset;
+ uint16_t lowercase_mapping_offset;
+
uint32_t simple_titlecase_mapping;
/* Property bits (UAX #44, Section 5.1) */
cp_data, canonical, decomp_r);
}
+/* Look up the uppercase mapping recorded in cp_data. Points *map_r at the
+   entry of unicode_case_mappings[] selected by the stored offset and returns
+   the stored mapping length (number of code points). A return value of 0
+   means that no entries of the table belong to this code point. */
+static inline size_t
+unicode_code_point_data_get_uppercase_mapping(
+	const struct unicode_code_point_data *cp_data,
+	const uint32_t **map_r)
+{
+	*map_r = unicode_case_mappings + cp_data->uppercase_mapping_offset;
+	return cp_data->uppercase_mapping_length;
+}
+
+/* Look up the lowercase mapping recorded in cp_data. Points *map_r at the
+   entry of unicode_case_mappings[] selected by the stored offset and returns
+   the stored mapping length (number of code points). A return value of 0
+   means that no entries of the table belong to this code point. */
+static inline size_t
+unicode_code_point_data_get_lowercase_mapping(
+	const struct unicode_code_point_data *cp_data,
+	const uint32_t **map_r)
+{
+	*map_r = unicode_case_mappings + cp_data->lowercase_mapping_offset;
+	return cp_data->lowercase_mapping_length;
+}
+
uint8_t unicode_general_category_from_string(const char *str);
#endif
ud_composition_primaries = []
ud_compositions_max_per_starter = 0
+ud_case_mappings = []
+ud_case_mapping_max_length = 0
+
class UCDFileOpen:
def __init__(self, filename):
cpd.pb_m_terminal_punctuation = True
CodePointRange(cprng[0], cprng[1], cpd)
+ # SpecialCasing.txt
+ # Collect the lowercase and uppercase mappings of every line that has no
+ # condition list and hand them to CodePointRange for that code point.
+ with UCDFileOpen("SpecialCasing.txt") as ucd:
+ line_num = 0
+ for line in ucd.fd:
+ line_num = line_num + 1
+ # Drop the trailing "#" comment; skip lines that are empty afterwards
+ data = line.split("#")
+ line = data[0].strip()
+ if len(line) == 0:
+ continue
+
+ # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+ cols = line.split(";")
+ if len(cols) < 4:
+ # NOTE(review): {ucd} formats the value bound by the "with"
+ # statement - confirm that it renders as the file name
+ die(f"{ucd}:{line_num}: Missing columns")
+ if len(cols) > 4 and len(cols[4].strip()) > 0:
+ # Skip lines with condition list
+ continue
+
+ cp_hex = cols[0].strip()
+ if len(cp_hex) == 0:
+ continue
+ cp = int(cp_hex, 16)
+
+ # The <title> column (cols[2]) is not used in this section
+ lower = cols[1].strip()
+ upper = cols[3].strip()
+
+ # Created lazily, only when a mapping worth storing is found
+ cpd = None
+
+ # Lowercase_Mapping
+ # NOTE(review): str.split(" ") never returns an empty list, so the
+ # length check below always passes; an empty column would make int()
+ # raise ValueError - confirm that this cannot occur in the data
+ codes_hex = lower.split(" ")
+ if len(codes_hex) > 0:
+ first_code_hex = codes_hex[0].strip()
+ first_code = int(first_code_hex, 16)
+ # Store the mapping unless it is the single code point itself
+ if len(codes_hex) > 1 or first_code != cp:
+ codes = []
+ for code_hex in codes_hex:
+ codes.append(int(code_hex, 16))
+
+ if cpd is None:
+ cpd = CodePointData()
+ cpd.lowercase_mapping = codes
+
+ # Uppercase_Mapping
+ # NOTE(review): same parsing caveats as for the lowercase column
+ codes_hex = upper.split(" ")
+ if len(codes_hex) > 0:
+ first_code_hex = codes_hex[0].strip()
+ first_code = int(first_code_hex, 16)
+ # Store the mapping unless it is the single code point itself
+ if len(codes_hex) > 1 or first_code != cp:
+ codes = []
+ for code_hex in codes_hex:
+ codes.append(int(code_hex, 16))
+
+ if cpd is None:
+ cpd = CodePointData()
+ cpd.uppercase_mapping = codes
+
+ # Only code points with at least one stored mapping are passed on
+ if cpd is not None:
+ CodePointRange(cp, cp, cpd)
+
# WordBreakProperty.txt
with UCDFileOpen("WordBreakProperty.txt") as ucd:
line_num = 0
CodePointRange(cprng[0], cprng[1], cpd)
+def resolve_case_mappings():
+ global ud_codepoints
+ global ud_case_mappings
+ global ud_case_mapping_max_length
+
+ for cpr in ud_codepoints:
+ if cpr.cp_last > cpr.cp_first:
+ # No case mappings in ranges expected, ever
+ continue
+ cp = cpr.cp_first
+ cpd = cpr.data
+
+ # Uppercase_Mapping
+ ucase_codes = []
+ if hasattr(cpd, "uppercase_mapping"):
+ ucase_codes = cpd.uppercase_mapping
+ if len(ucase_codes) > 0 and (len(ucase_codes) > 1 or ucase_codes[0] != cp):
+ cpd.uppercase_mapping_offset = len(ud_case_mappings)
+ cpd.uppercase_mapping_length = len(ucase_codes)
+ ud_case_mappings = ud_case_mappings + ucase_codes
+ elif (
+ hasattr(cpd, "simple_uppercase_mapping")
+ and cpd.simple_uppercase_mapping != cp
+ ):
+ cpd.uppercase_mapping_offset = len(ud_case_mappings)
+ cpd.uppercase_mapping_length = 1
+ ud_case_mappings.append(cpd.simple_uppercase_mapping)
+ ucase_codes = [cpd.simple_uppercase_mapping]
+ else:
+ ucase_codes = []
+ if len(ucase_codes) > ud_case_mapping_max_length:
+ ud_case_mapping_max_length = len(ucase_codes)
+
+ # Lowercase_Mapping
+ lcase_codes = []
+ if hasattr(cpd, "lowercase_mapping"):
+ lcase_codes = cpd.lowercase_mapping
+ if len(lcase_codes) > 0 and (len(lcase_codes) > 1 or lcase_codes[0] != cp):
+ cpd.lowercase_mapping_offset = len(ud_case_mappings)
+ cpd.lowercase_mapping_length = len(lcase_codes)
+ ud_case_mappings = ud_case_mappings + lcase_codes
+ elif (
+ hasattr(cpd, "simple_lowercase_mapping")
+ and cpd.simple_lowercase_mapping != cp
+ ):
+ cpd.lowercase_mapping_offset = len(ud_case_mappings)
+ cpd.lowercase_mapping_length = 1
+ ud_case_mappings.append(cpd.simple_lowercase_mapping)
+ lcase_codes = [cpd.simple_lowercase_mapping]
+ else:
+ lcase_codes = []
+ if len(lcase_codes) > ud_case_mapping_max_length:
+ ud_case_mapping_max_length = len(lcase_codes)
+
+
def expand_decompositions():
global ud_codepoints
global ud_codepoints_index
global output_dir
global ud_decomposition_max_length
global ud_compositions_max_per_starter
+ global ud_case_mapping_max_length
orig_stdout = sys.stdout
"#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s"
% ud_compositions_max_per_starter
)
+ print("#define UNICODE_CASE_MAPPING_MAX_LENGTH %s" % ud_case_mapping_max_length)
print("")
print("extern const struct unicode_code_point_data unicode_code_points[];")
print("")
print("extern const uint32_t unicode_compositions[];")
print("extern const uint32_t unicode_composition_primaries[];")
print("")
+ print("extern const uint32_t unicode_case_mappings[];")
+ print("")
print("#endif")
sys.stdout = orig_stdout
if hasattr(cpd, "composition_count"):
print("\t\t.composition_count = %u," % cpd.composition_count)
print("\t\t.composition_offset = %u," % cpd.composition_offset)
+ if (
+ hasattr(cpd, "lowercase_mapping_length")
+ and cpd.lowercase_mapping_length > 0
+ ):
+ print(
+ "\t\t.lowercase_mapping_length = %s," % cpd.lowercase_mapping_length
+ )
+ print(
+ "\t\t.lowercase_mapping_offset = %s," % cpd.lowercase_mapping_offset
+ )
+ if (
+ hasattr(cpd, "uppercase_mapping_length")
+ and cpd.uppercase_mapping_length > 0
+ ):
+ print(
+ "\t\t.uppercase_mapping_length = %s," % cpd.uppercase_mapping_length
+ )
+ print(
+ "\t\t.uppercase_mapping_offset = %s," % cpd.uppercase_mapping_offset
+ )
if hasattr(cpd, "simple_titlecase_mapping"):
print(
"\t\t.simple_titlecase_mapping = 0x%04X,"
print_list(ud_composition_primaries)
print(",")
print("};")
+ print("")
+ print("const uint32_t unicode_case_mappings[] = {")
+ print_list(ud_case_mappings)
+ print(",")
+ print("};")
sys.stdout = orig_stdout
source_files.sort()
create_cp_range_index()
+ resolve_case_mappings()
expand_decompositions()
derive_canonical_compositions()