From: Stephan Bosch
Date: Tue, 1 Apr 2025 02:23:15 +0000 (+0200)
Subject: lib: unicode-transform - Implement case mapping
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=67f4bd295964b5590aadef9a4996f4047e04665c;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Implement case mapping
---

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 61e48817a9..b968527037 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -516,6 +516,7 @@ test_lib_SOURCES = \
 	test-unichar.c \
 	test-unicode-data.c \
 	test-unicode-nf.c \
+	test-unicode-casemap.c \
 	test-utc-mktime.c \
 	test-uri.c \
 	test-wildcard-match.c
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc
index 1bdc76e367..facdc413a0 100644
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -109,6 +109,7 @@ TEST(test_time_util)
 TEST(test_unichar)
 TEST(test_unicode_data)
 TEST(test_unicode_nf)
+TEST(test_unicode_casemap)
 TEST(test_uri)
 TEST(test_utc_mktime)
 TEST(test_wildcard_match)
diff --git a/src/lib/test-unicode-casemap.c b/src/lib/test-unicode-casemap.c
new file mode 100644
index 0000000000..49fe8ec24d
--- /dev/null
+++ b/src/lib/test-unicode-casemap.c
@@ -0,0 +1,65 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+
+/* Case mapping test vectors. A NULL .casefold means the expected casefold
+   result equals .lowercase. */
+static const struct casemap_test {
+	const char *input;
+	const char *lowercase;
+	const char *uppercase;
+	const char *casefold;
+} tests[] = {
+	{
+		/* Weißkopfseeadler */
+		.input = "\x57\x65\x69\xC3\x9F\x6B\x6F\x70\x66"
+			 "\x73\x65\x65\x61\x64\x6C\x65\x72",
+		/* WEISSKOPFSEEADLER */
+		.uppercase = "WEISSKOPFSEEADLER",
+		/* weißkopfseeadler */
+		.lowercase = "\x77\x65\x69\xC3\x9F\x6B\x6F\x70"
+			     "\x66\x73\x65\x65\x61\x64\x6C\x65\x72",
+		/* weisskopfseeadler */
+		.casefold = "weisskopfseeadler",
+	},
+};
+
+static const unsigned int tests_count = N_ELEMENTS(tests);
+
+/* Run every test vector through the uppercase, lowercase and casefold
+   conversions and compare the results against the expected strings. */
+void test_unicode_casemap(void)
+{
+	unsigned int i;
+
+	test_begin("unicode casemap");
+
+	for (i = 0; i < tests_count; i++) {
+		const struct casemap_test *test = &tests[i];
+		const char *uppercase, *lowercase, *casefold;
+		const char *test_casefold =
+			(test->casefold != NULL ?
+			 test->casefold : test->lowercase);
+		int ret;
+
+		ret = uni_utf8_to_uppercase(test->input, strlen(test->input),
+					    &uppercase);
+		test_assert_idx(ret >= 0, i);
+		test_assert_strcmp_idx(test->uppercase, uppercase, i);
+
+		ret = uni_utf8_to_lowercase(test->input, strlen(test->input),
+					    &lowercase);
+		test_assert_idx(ret >= 0, i);
+		test_assert_strcmp_idx(test->lowercase, lowercase, i);
+
+		ret = uni_utf8_to_casefold(test->input, strlen(test->input),
+					   &casefold);
+		test_assert_idx(ret >= 0, i);
+		test_assert_strcmp_idx(test_casefold, casefold, i);
+	}
+
+	test_end();
+}
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index a8a60e92da..b582e4769d 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -418,6 +418,90 @@ int uni_utf8_is_nfkc(const void *input, size_t size)
 	return uni_utf8_is_nf(input, size, UNICODE_NFKC);
 }
 
+/* Map the UTF-8 input to uppercase and write the result to output. Returns
+   the result of uni_utf8_run_transform(); the error string is discarded. */
+int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output)
+{
+	/* Fully initialized below on every call, so no static storage is
+	   needed (and the function stays reentrant). */
+	struct unicode_casemap map;
+	const char *error;
+
+	unicode_casemap_init_uppercase(&map);
+
+	return uni_utf8_run_transform(_input, size, &map.transform, output,
+				      &error);
+}
+
+/* Map the UTF-8 input to lowercase and write the result to output. Returns
+   the result of uni_utf8_run_transform(); the error string is discarded. */
+int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output)
+{
+	/* Fully initialized below on every call, so no static storage is
+	   needed (and the function stays reentrant). */
+	struct unicode_casemap map;
+	const char *error;
+
+	unicode_casemap_init_lowercase(&map);
+
+	return uni_utf8_run_transform(_input, size, &map.transform, output,
+				      &error);
+}
+
+/* Case-fold the UTF-8 input and write the result to output. Returns the
+   result of uni_utf8_run_transform(); the error string is discarded. */
+int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output)
+{
+	/* Fully initialized below on every call, so no static storage is
+	   needed (and the function stays reentrant). */
+	struct unicode_casemap map;
+	const char *error;
+
+	unicode_casemap_init_casefold(&map);
+
+	return uni_utf8_run_transform(_input, size, &map.transform, output,
+				      &error);
+}
+
+/* Like uni_utf8_write_uppercase(), but the result is written to a buffer
+   created with t_buffer_create() and returned as a C string in *output_r.
+   *output_r is assigned regardless of the return value. */
+int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+	int ret;
+
+	ret = uni_utf8_write_uppercase(input, size, output);
+	*output_r = str_c(output);
+	return ret;
+}
+
+/* Like uni_utf8_write_lowercase(), but the result is written to a buffer
+   created with t_buffer_create() and returned as a C string in *output_r.
+   *output_r is assigned regardless of the return value. */
+int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+	int ret;
+
+	ret = uni_utf8_write_lowercase(input, size, output);
+	*output_r = str_c(output);
+	return ret;
+}
+
+/* Like uni_utf8_write_casefold(), but the result is written to a buffer
+   created with t_buffer_create() and returned as a C string in *output_r.
+   *output_r is assigned regardless of the return value. */
+int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+	int ret;
+
+	ret = uni_utf8_write_casefold(input, size, output);
+	*output_r = str_c(output);
+	return ret;
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 				     buffer_t *output)
 {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h
index 1e1ef09ece..68943a4866 100644
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -157,6 +157,18 @@ int uni_utf8_is_nfkd(const void *input, size_t size);
 int uni_utf8_is_nfc(const void *input, size_t size);
 int uni_utf8_is_nfkc(const void *input, size_t size);
 
+/* Write the input UTF8 string to the provided buffer after mapping it to the
+   requested case. */
+int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output);
+int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output);
+int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output);
+
+/* Same as the uni_utf8_write_*() functions above, but return the result as a
+   C string in *output_r. */
+int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r);
+
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
    output that's compatible with i;unicode-casemap comparator. Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index 8274f1ffd5..d71486001c 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -898,6 +898,194 @@ int unicode_nf_checker_finish(struct unicode_nf_checker *unc)
 	return (ret > 0 ? 1 : 0);
 }
 
+/*
+ * Casemap Transform
+ */
+
+static size_t
+unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
+			     const uint32_t **map_r);
+static size_t
+unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
+			     const uint32_t **map_r);
+static size_t
+unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
+			    const uint32_t **map_r);
+
+static ssize_t
+unicode_casemap_input(struct unicode_transform *trans,
+		      const struct unicode_transform_buffer *buf,
+		      const char **error_r);
+static int
+unicode_casemap_flush(struct unicode_transform *trans, bool finished,
+		      const char **error_r);
+
+static const struct unicode_transform_def unicode_casemap_def = {
+	.input = unicode_casemap_input,
+	.flush = unicode_casemap_flush,
+};
+
+/* Initialize map_r as a transform that maps each code point to uppercase.
+   Any previous contents of map_r are cleared. */
+void unicode_casemap_init_uppercase(struct unicode_casemap *map_r)
+{
+	i_zero(map_r);
+	unicode_transform_init(&map_r->transform, &unicode_casemap_def);
+	map_r->map = unicode_casemap_uppercase_cp;
+}
+
+/* Same as unicode_casemap_init_uppercase(), but for lowercase. */
+void unicode_casemap_init_lowercase(struct unicode_casemap *map_r)
+{
+	i_zero(map_r);
+	unicode_transform_init(&map_r->transform, &unicode_casemap_def);
+	map_r->map = unicode_casemap_lowercase_cp;
+}
+
+/* Same as unicode_casemap_init_uppercase(), but for case folding. */
+void unicode_casemap_init_casefold(struct unicode_casemap *map_r)
+{
+	i_zero(map_r);
+	unicode_transform_init(&map_r->transform, &unicode_casemap_def);
+	map_r->map = unicode_casemap_casefold_cp;
+}
+
+/* Per-mode lookup callbacks. Each returns the result of the corresponding
+   unicode_code_point_data_get_*_mapping() call for cp_data. */
+static size_t
+unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
+			     const uint32_t **map_r)
+{
+	return unicode_code_point_data_get_uppercase_mapping(cp_data, map_r);
+}
+
+static size_t
+unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
+			     const uint32_t **map_r)
+{
+	return unicode_code_point_data_get_lowercase_mapping(cp_data, map_r);
+}
+
+static size_t
+unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
+			    const uint32_t **map_r)
+{
+	return unicode_code_point_data_get_casefold_mapping(cp_data, map_r);
+}
+
+/* Map a single code point and pass the result on through
+   uniform_transform_forward(). A NULL cp_data is looked up from cp. If only
+   part of the mapping could be forwarded, the code point and the position
+   reached are stored in map so that a later call can continue. Returns -1
+   on error (*error_r set), 0 if a previously buffered code point still
+   could not be forwarded completely, and 1 otherwise (also when a new code
+   point was only partially forwarded and is now buffered). */
+static ssize_t
+unicode_casemap_input_cp(struct unicode_casemap *map, uint32_t cp,
+			 const struct unicode_code_point_data *cp_data,
+			 const char **error_r)
+{
+	bool was_buffered = map->cp_buffered;
+	ssize_t sret;
+
+	if (cp_data == NULL)
+		cp_data = unicode_code_point_get_data(cp);
+
+	const uint32_t *map_cps;
+	const struct unicode_code_point_data *const *map_cps_data = NULL;
+	size_t map_cps_len;
+
+	map_cps_len = map->map(cp_data, &map_cps);
+	if (map_cps_len == 0) {
+		/* No mapping: forward the code point itself */
+		map_cps = &cp;
+		map_cps_data = &cp_data;
+		map_cps_len = 1;
+	}
+	i_assert(map_cps_len > map->cp_map_pos);
+
+	/* Skip what was already forwarded in an earlier attempt */
+	map_cps += map->cp_map_pos;
+	map_cps_len -= map->cp_map_pos;
+	sret = uniform_transform_forward(&map->transform,
+					 map_cps, map_cps_data, map_cps_len,
+					 error_r);
+	if (sret < 0) {
+		i_assert(*error_r != NULL);
+		return -1;
+	}
+	if ((size_t)sret < map_cps_len) {
+		/* Partially forwarded: remember where to continue */
+		map->cp_buffered = TRUE;
+		map->cp = cp;
+		map->cp_data = cp_data;
+		map->cp_map_pos += sret;
+		return (was_buffered ? 0 : 1);
+	}
+
+	map->cp_buffered = FALSE;
+	map->cp_data = NULL;
+	map->cp_map_pos = 0;
+	return 1;
+}
+
+/* Transform input callback. Retries any buffered code point first; if it is
+   still pending, no new input is consumed. Returns the number of code points
+   consumed from buf, or -1 on error. */
+static ssize_t
+unicode_casemap_input(struct unicode_transform *trans,
+		      const struct unicode_transform_buffer *buf,
+		      const char **error_r)
+{
+	struct unicode_casemap *map =
+		container_of(trans, struct unicode_casemap, transform);
+	int ret;
+
+	/* More input is arriving, so this is not the final flush */
+	ret = unicode_casemap_flush(trans, FALSE, error_r);
+	if (ret < 0) {
+		i_assert(*error_r != NULL);
+		return -1;
+	}
+	if (map->cp_buffered)
+		return 0;
+
+	size_t n;
+	for (n = 0; n < buf->cp_count; n++) {
+		if (map->cp_buffered)
+			break;
+		ret = unicode_casemap_input_cp(map, buf->cp[n],
+					       (buf->cp_data != NULL ?
+						buf->cp_data[n] : NULL),
+					       error_r);
+		if (ret < 0) {
+			i_assert(*error_r != NULL);
+			return -1;
+		}
+		if (ret == 0)
+			break;
+	}
+	return n;
+}
+
+/* Transform flush callback. Returns 1 if nothing is (any longer) buffered,
+   0 if the buffered code point is still pending, -1 on error. */
+static int
+unicode_casemap_flush(struct unicode_transform *trans,
+		      bool finished ATTR_UNUSED, const char **error_r)
+{
+	struct unicode_casemap *map =
+		container_of(trans, struct unicode_casemap, transform);
+	int ret;
+
+	if (!map->cp_buffered)
+		return 1;
+
+	ret = unicode_casemap_input_cp(map, map->cp, map->cp_data, error_r);
+	i_assert(ret >= 0 || *error_r != NULL);
+	return ret;
+}
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
index 3e3499f0e5..24e4c2cb37 100644
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -196,6 +196,32 @@ int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
 			     const struct unicode_code_point_data **cp_data);
 int unicode_nf_checker_finish(struct unicode_nf_checker *unc);
 
+/*
+ * Casemap Transform
+ */
+
+struct unicode_casemap {
+	struct unicode_transform transform;
+
+	/* Mapping lookup selected by unicode_casemap_init_*() */
+	size_t (*map)(const struct unicode_code_point_data *cp_data,
+		      const uint32_t **map_r);
+
+	/* Code point whose mapping is only partially forwarded; valid while
+	   cp_buffered is set */
+	uint32_t cp;
+	const struct unicode_code_point_data *cp_data;
+	/* Number of mapped code points already forwarded for cp */
+	unsigned int cp_map_pos;
+
+	bool cp_buffered:1;
+};
+
+/* Initialize a casemap transform for the requested case mapping. */
+void unicode_casemap_init_uppercase(struct unicode_casemap *map);
+void unicode_casemap_init_lowercase(struct unicode_casemap *map);
+void unicode_casemap_init_casefold(struct unicode_casemap *map);
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */