From: Stephan Bosch
Date: Sun, 30 Mar 2025 01:02:58 +0000 (+0200)
Subject: lib: unicode-transform - Add generic Unicode string transformation API
X-Git-Tag: 2.4.2~612
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3cc0190ef90056926e748b7fc82e79a8940732d4;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Add generic Unicode string transformation API
---

diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index 209d1b1e9d..0a7405d2cc 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -250,6 +250,70 @@ static void output_add_replacement_char(buffer_t *output)
 	buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
 }
 
+/* Decode the UTF-8 input and push it, one code point at a time, through the
+   transform chain starting at trans; the result is appended to output as
+   UTF-8. Each run of invalid input bytes is replaced by a single
+   UNICODE_REPLACEMENT_CHAR. Returns 0 if ok, -1 if the input was invalid
+   (*error_r is NULL) or the transform failed (*error_r is set). */
+int uni_utf8_run_transform(const void *_input, size_t size,
+			   struct unicode_transform *trans, buffer_t *output,
+			   const char **error_r)
+{
+	struct unicode_transform *trans_last =
+		unicode_transform_get_last(trans);
+	struct unicode_buffer_sink sink;
+	const unsigned char *input = _input;
+	unichar_t chr = 0;
+	ssize_t sret;
+	bool got_chr = FALSE, bad_cp = FALSE;
+	int ret = 0, fret;
+
+	/* Terminate the chain with a sink that encodes into output. */
+	unicode_buffer_sink_init(&sink, output);
+	unicode_transform_chain(trans_last, &sink.transform);
+
+	while (size > 0 || got_chr) {
+		if (!got_chr) {
+			int bytes = uni_utf8_get_char_n(input, size, &chr);
+			if (bytes <= 0) {
+				/* Invalid input. Try the next byte. */
+				ret = -1;
+				input++; size--;
+				if (bad_cp) {
+					/* This run of invalid bytes was
+					   already replaced. */
+					continue;
+				}
+				chr = UNICODE_REPLACEMENT_CHAR;
+				bad_cp = TRUE;
+			} else {
+				input += bytes;
+				size -= bytes;
+				bad_cp = FALSE;
+			}
+			/* chr stays pending until the transform takes it. */
+			got_chr = TRUE;
+		}
+
+		sret = unicode_transform_input(trans, &chr, 1, error_r);
+		if (sret < 0) {
+			/* The sink is on our stack; unchain it first. */
+			trans_last->next = NULL;
+			return -1;
+		}
+		if (sret > 0)
+			got_chr = FALSE;
+	}
+
+	fret = unicode_transform_flush(trans, error_r);
+	if (fret < 0)
+		i_panic("unicode_transform_flush(): %s", *error_r);
+	i_assert(fret == 1);
+	/* Don't leave the caller's chain pointing at our stack. */
+	trans_last->next = NULL;
+	return ret;
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 				     buffer_t *output)
 {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h
index e26acaf298..31741a8ba9 100644
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -42,6 +42,8 @@
 #define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
 #define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
 
+struct unicode_transform;
+
 typedef uint32_t unichar_t;
 ARRAY_DEFINE_TYPE(unichars, unichar_t);
 
@@ -116,6 +118,14 @@ uni_utf8_char_bytes(unsigned char chr)
 /* Return given character in titlecase. */
 unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
 
+/* Run the UTF8 string through the provided Unicode transform and write the
+   result into the buffer again encoded in UTF8. Returns 0 if ok, -1 if the
+   input was invalid or the transform failed; *error_r is set only when the
+   transform failed. */
+int uni_utf8_run_transform(const void *_input, size_t size,
+			   struct unicode_transform *trans, buffer_t *output,
+			   const char **error_r);
+
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
    output that's compatible with i;unicode-casemap comparator.
Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index cc94c8c359..401e74ec9c 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -1,12 +1,228 @@
 /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "unichar.h"
 #include "unicode-data.h"
 #include "unicode-transform.h"
 
 #define HANGUL_FIRST 0xac00
 #define HANGUL_LAST 0xd7a3
 
+/*
+ * Transform
+ */
+
+/* Pass out_len code points from out (and out_data, handed on unchanged) to
+   the input handler of the transform chained after trans. Returns the number
+   of code points that transform accepted, or < 0 with *error_r set on error.
+   NOTE(review): the "uniform_" prefix looks like a typo for "unicode_", but
+   it is the declared public name, so it is kept. */
+ssize_t uniform_transform_forward(
+	struct unicode_transform *trans, const uint32_t *out,
+	const struct unicode_code_point_data *const *out_data, size_t out_len,
+	const char **error_r)
+{
+	struct unicode_transform_buffer buf_next;
+	ssize_t sret;
+
+	i_zero(&buf_next);
+	buf_next.cp = out;
+	buf_next.cp_data = out_data;
+	buf_next.cp_count = out_len;
+
+	i_assert(trans->next != NULL);
+	i_assert(trans->next->def != NULL);
+	i_assert(trans->next->def->input != NULL);
+	sret = trans->next->def->input(trans->next, &buf_next, error_r);
+
+	i_assert(sret >= 0 || *error_r != NULL);
+	i_assert(sret <= (ssize_t)out_len);
+	return sret;
+}
+
+/* Feed in_len code points to the first transform of the chain. When it
+   accepts nothing, every transform except the last one in the chain is
+   flushed (finished=FALSE) and the input is offered again. Returns the
+   number of code points consumed, which is less than in_len (possibly 0) if
+   flushing didn't help, or -1 with *error_r set on error. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+				const uint32_t *in, size_t in_len,
+				const char **error_r)
+{
+	struct unicode_transform_buffer in_buf;
+	size_t input_total = 0;
+	bool flushed = FALSE;
+
+	*error_r = NULL;
+
+	i_zero(&in_buf);
+	in_buf.cp = in;
+	in_buf.cp_count = in_len;
+
+	while (in_buf.cp_count > 0) {
+		struct unicode_transform *tp;
+		ssize_t sret;
+
+		i_assert(trans->def->input != NULL);
+		sret = trans->def->input(trans, &in_buf, error_r);
+		if (sret < 0) {
+			i_assert(*error_r != NULL);
+			return -1;
+		}
+		if (sret > 0) {
+			i_assert((size_t)sret <= in_buf.cp_count);
+			in_buf.cp += sret;
+			in_buf.cp_count -= sret;
+			input_total += sret;
+			flushed = FALSE;
+			continue;
+		}
+		/* Nothing was accepted. Give up if flushing didn't help. */
+		if (flushed)
+			break;
+
+		for (tp = trans; tp->next != NULL; tp = tp->next) {
+			if (tp->def->flush == NULL)
+				continue;
+			if (tp->def->flush(tp, FALSE, error_r) < 0) {
+				i_assert(*error_r != NULL);
+				return -1;
+			}
+		}
+		flushed = TRUE;
+	}
+
+	return input_total;
+}
+
+/* Flush the chain at the end of input. Each pass walks the remaining chain
+   and calls every flush handler; only the first remaining transform gets
+   finished=TRUE. That first transform leaves the remaining chain once it has
+   no flush handler or its flush returns > 0. Returns 1 when nothing remains,
+   0 if a whole pass made no progress, or -1 with *error_r set on error. */
+int unicode_transform_flush(struct unicode_transform *trans,
+			    const char **error_r)
+{
+	int ret;
+
+	*error_r = NULL;
+
+	while (trans != NULL) {
+		struct unicode_transform *tp = trans;
+		bool progress = FALSE;
+
+		while (tp != NULL) {
+			if (tp->def->flush == NULL) {
+				progress = TRUE;
+				if (tp == trans)
+					trans = trans->next;
+			} else {
+				ret = tp->def->flush(tp, (tp == trans), error_r);
+				if (ret < 0) {
+					i_assert(*error_r != NULL);
+					return -1;
+				}
+				if (ret > 0) {
+					progress = TRUE;
+					if (tp == trans)
+						trans = trans->next;
+				}
+			}
+			tp = tp->next;
+		}
+		if (!progress)
+			return 0;
+	}
+	return 1;
+}
+
+/* Buffer Sink */
+
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+			  const struct unicode_transform_buffer *buf,
+			  const char **error_r);
+
+static const struct unicode_transform_def unicode_buffer_sink_def = {
+	.input = unicode_buffer_sink_input,
+};
+
+/* Initialize a sink transform that appends its input to buffer. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+			      buffer_t *buffer)
+{
+	i_zero(sink);
+	unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
+	sink->buffer = buffer;
+}
+
+/* Append all code points in buf to the sink's buffer using
+   uni_ucs4_to_utf8(). Always consumes the whole input. */
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+			  const struct unicode_transform_buffer *buf,
+			  const char **error_r ATTR_UNUSED)
+{
+	struct unicode_buffer_sink *sink =
+		container_of(trans, struct unicode_buffer_sink, transform);
+
+	uni_ucs4_to_utf8(buf->cp, buf->cp_count, sink->buffer);
+	return buf->cp_count;
+}
+
+/* Static Array Sink */
+
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+				const struct unicode_transform_buffer *buf,
+				const char **error_r);
+
+static const struct unicode_transform_def unicode_static_array_sink_def = {
+	.input = unicode_static_array_sink_input,
+};
+
+/* Initialize a sink transform that stores code points in the caller's array
+   of array_size elements, writing at index *array_pos and advancing
+   *array_pos past what was written. */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+				    uint32_t *array, size_t array_size,
+				    size_t *array_pos)
+{
+	i_zero(sink);
+	unicode_transform_init(&sink->transform,
+			       &unicode_static_array_sink_def);
+	sink->array = array;
+	sink->array_size = array_size;
+	sink->array_pos = array_pos;
+}
+
+/* Copy all code points in buf into the array. Fails with "Output overflow",
+   writing nothing, if they don't fit in the remaining space. */
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+				const struct unicode_transform_buffer *buf,
+				const char **error_r)
+{
+	struct unicode_static_array_sink *sink =
+		container_of(trans, struct unicode_static_array_sink,
+			     transform);
+	size_t pos = *sink->array_pos;
+
+	/* Compare against the remaining space instead of pos + cp_count, so
+	   that a huge cp_count can't wrap around and pass the check. */
+	if (pos > sink->array_size ||
+	    buf->cp_count > sink->array_size - pos) {
+		*error_r = "Output overflow";
+		return -1;
+	}
+	if (buf->cp_count == 0)
+		return 0;
+	memcpy(sink->array + pos, buf->cp,
+	       buf->cp_count * sizeof(*buf->cp));
+	*sink->array_pos = pos + buf->cp_count;
+	return buf->cp_count;
+}
+
 /*
  * Hangul syllable (de)composition
  */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
index ce407a74de..e6e18d3177 100644
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -1,6 +1,107 @@
 #ifndef UNICODE_NF_H
 #define UNICODE_NF_H
 
+/*
+ * Transform API
+ */
+
+struct unicode_transform;
+
+/* A run of code points handed to a transform's input handler. */
+struct unicode_transform_buffer {
+	const uint32_t *cp;
+	/* unicode_transform_input() leaves this NULL. */
+	const struct unicode_code_point_data *const *cp_data;
+	size_t cp_count;
+};
+
+struct unicode_transform_def {
+	/* Consume code points from buf. Returns the number of code points
+	   accepted (can be 0), or -1 with *error_r set on error. */
+	ssize_t (*input)(struct unicode_transform *trans,
+			 const struct unicode_transform_buffer *buf,
+			 const char **error_r);
+	/* Optional (may be NULL). unicode_transform_flush() passes
+	   finished=TRUE only to the first transform that is still
+	   unflushed and treats a return value > 0 as progress. Returns -1
+	   with *error_r set on error. */
+	int (*flush)(struct unicode_transform *trans, bool finished,
+		     const char **error_r);
+};
+
+struct unicode_transform {
+	const struct unicode_transform_def *def;
+	/* Transform that receives this one's output; NULL for the last. */
+	struct unicode_transform *next;
+};
+
+/* Zero the transform and assign its definition. */
+static inline void
+unicode_transform_init(struct unicode_transform *trans,
+		       const struct unicode_transform_def *def)
+{
+	i_zero(trans);
+	trans->def = def;
+}
+
+/* Make next follow trans in the chain. trans must not be chained yet. */
+static inline void
+unicode_transform_chain(struct unicode_transform *trans,
+			struct unicode_transform *next)
+{
+	i_assert(trans->next == NULL);
+	trans->next = next;
+}
+
+/* Return the final transform of the chain that starts at trans. */
+static inline struct unicode_transform *
+unicode_transform_get_last(struct unicode_transform *trans)
+{
+	while (trans->next != NULL)
+		trans = trans->next;
+	return trans;
+}
+
+/* Pass out_len code points on to the transform chained after trans. */
+ssize_t uniform_transform_forward(
+	struct unicode_transform *trans, const uint32_t *out,
+	const struct unicode_code_point_data *const *out_data, size_t out_len,
+	const char **error_r);
+
+/* Feed code points into the chain starting at trans. Returns the number of
+   code points consumed, or -1 with *error_r set on error. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+				const uint32_t *in, size_t in_len,
+				const char **error_r);
+/* Flush the chain at the end of input. Returns 1 when fully flushed, 0 if no
+   progress could be made, or -1 with *error_r set on error. */
+int unicode_transform_flush(struct unicode_transform *trans,
+			    const char **error_r);
+
+/* Buffer Sink: appends the code points it receives to a buffer. */
+
+struct unicode_buffer_sink {
+	struct unicode_transform transform;
+	buffer_t *buffer;
+};
+
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+			      buffer_t *buffer);
+
+/* Static Array Sink: stores the code points it receives in a caller-provided
+   array of array_size elements, advancing *array_pos. */
+
+struct unicode_static_array_sink {
+	struct unicode_transform transform;
+	uint32_t *array;
+	size_t array_size;
+	size_t *array_pos;
+};
+
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+				    uint32_t *array, size_t array_size,
+				    size_t *array_pos);
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */