From: Stephan Bosch Date: Wed, 27 Nov 2024 00:36:43 +0000 (+0100) Subject: lib: unicode-transform - Implement streaming Unicode Normalization X-Git-Tag: 2.4.2~611 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0ce5059db585830ea39adca938339a907b6f0789;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-transform - Implement streaming Unicode Normalization All standard forms are supported: NFD, NFKD, NFC, NFKC. --- diff --git a/src/lib/unichar.c b/src/lib/unichar.c index 0a7405d2cc..a8a60e92da 100644 --- a/src/lib/unichar.c +++ b/src/lib/unichar.c @@ -2,6 +2,7 @@ #include "lib.h" #include "array.h" +#include "str.h" #include "bsearch-insert-pos.h" #include "unicode-data.h" #include "unicode-transform.h" @@ -298,6 +299,125 @@ int uni_utf8_run_transform(const void *_input, size_t size, return ret; } +static inline int +uni_utf8_write_nf_common(const void *_input, size_t size, + enum unicode_nf_type nf_type, buffer_t *output) +{ + static struct unicode_nf_context ctx; + const char *error; + + unicode_nf_init(&ctx, nf_type); + + return uni_utf8_run_transform(_input, size, &ctx.transform, output, + &error); +} + +int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output) +{ + return uni_utf8_write_nf_common(input, size, UNICODE_NFD, output); +} + +int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output) +{ + return uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output); +} + +int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output) +{ + return uni_utf8_write_nf_common(input, size, UNICODE_NFC, output); +} + +int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output) +{ + return uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output); +} + +int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r) +{ + buffer_t *output = t_buffer_create(size); + + if (uni_utf8_write_nf_common(input, size, UNICODE_NFD, output) < 0) + return -1; + *output_r = str_c(output); + return 0; +} + +int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r) +{ + buffer_t *output = t_buffer_create(size); + + if (uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output) < 0) + return -1; + *output_r = str_c(output); + return 0; +} + +int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r) +{ + buffer_t *output = t_buffer_create(size); + + if (uni_utf8_write_nf_common(input, size, UNICODE_NFC, output) < 0) + return -1; + *output_r = str_c(output); + return 0; +} + +int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r) +{ + buffer_t *output = t_buffer_create(size); + + if (uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output) < 0) + return -1; + *output_r = str_c(output); + return 0; +} + +static int +uni_utf8_is_nf(const void *_input, size_t size, enum unicode_nf_type type) +{ + static struct unicode_nf_checker unc; + const unsigned char *input = _input; + unichar_t chr; + int ret; + + unicode_nf_checker_init(&unc, type); + + while (size > 0) { + const struct unicode_code_point_data *cp_data = NULL; + int bytes = uni_utf8_get_char_n(input, size, &chr); + if (bytes <= 0) + return -1; + input += bytes; + size -= bytes; + + ret = unicode_nf_checker_input(&unc, chr, &cp_data); + if (ret <= 0) + return ret; + } + + return unicode_nf_checker_finish(&unc); +} + +int uni_utf8_is_nfd(const void *input, size_t size) +{ + return uni_utf8_is_nf(input, size, UNICODE_NFD); +} + +int uni_utf8_is_nfkd(const void *input, size_t size) +{ + return uni_utf8_is_nf(input, size, UNICODE_NFKD); +} + +int uni_utf8_is_nfc(const void *input, size_t size) +{ + return uni_utf8_is_nf(input, size, UNICODE_NFC); +} + +int uni_utf8_is_nfkc(const void *input, size_t size) +{ + return uni_utf8_is_nf(input, size, UNICODE_NFKC); +} + int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size, buffer_t *output) { diff --git a/src/lib/unichar.h b/src/lib/unichar.h index 31741a8ba9..1e1ef09ece 100644 --- a/src/lib/unichar.h +++ b/src/lib/unichar.h @@ -124,6 +124,39 @@ int uni_utf8_run_transform(const void *_input, size_t size, struct unicode_transform *trans, buffer_t *output, const char **error_r); +/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write + the result to the output buffer. + + Refer to Unicode Standard Annex #15, Section 1.2 for more information. An + excerpt can be found in unicode-nf.h. + + NOTE: Do not blindly use this function to write and append several values + together expecting the result to be NF* normalized as well. This function + does not check whether concatenation preserves the desired normalization nor + does it endeavour to achieve this result. Blind concatination works only in + very specific cases, so make sure you know what you are doing. + */ +int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output); +int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output); +int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output); +int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output); + +/* Same as the write variants, but return the normalized input in the + output_r argument as a C string. + */ +int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r); +int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r); +int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r); +int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r); + +/* Check whether the input is normalized in the indicated form. Returns -1 if + the input is not even valid UTF8 or contains invalid code points. Returns 1 + if the input adheres to the requested normalization form and 0 otherwise. */ +int uni_utf8_is_nfd(const void *input, size_t size); +int uni_utf8_is_nfkd(const void *input, size_t size); +int uni_utf8_is_nfc(const void *input, size_t size); +int uni_utf8_is_nfkc(const void *input, size_t size); + /* Convert UTF-8 input to titlecase and decompose the titlecase characters to output buffer. Returns 0 if ok, -1 if input was invalid. This generates output that's compatible with i;unicode-casemap comparator. Invalid input diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c index 401e74ec9c..8274f1ffd5 100644 --- a/src/lib/unicode-transform.c +++ b/src/lib/unicode-transform.c @@ -35,9 +35,9 @@ ssize_t uniform_transform_forward( return sret; } -ssize_t unicode_transform_input(struct unicode_transform *trans, - const uint32_t *in, size_t in_len, - const char **error_r) +ssize_t unicode_transform_input_buf(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r) { struct unicode_transform_buffer in_buf; size_t input_total = 0; @@ -47,9 +47,7 @@ ssize_t unicode_transform_input(struct unicode_transform *trans, *error_r = NULL; - i_zero(&in_buf); - in_buf.cp = in; - in_buf.cp_count = in_len; + in_buf = *buf; while (in_buf.cp_count > 0) { if (in_buf.cp_count > 0) { @@ -207,10 +205,15 @@ static const uint16_t uni_hangul_s_base = 0xac00; static const uint16_t uni_hangul_l_base = 0x1100; static const uint16_t uni_hangul_v_base = 0x1161; static const uint16_t uni_hangul_t_base = 0x11a7; +static const unsigned int uni_hangul_l_count = 19; static const unsigned int uni_hangul_v_count = 21; static const unsigned int uni_hangul_t_count = 28; static const unsigned int uni_hangul_n_count = uni_hangul_v_count * uni_hangul_t_count; +static const uint16_t uni_hangul_l_end = uni_hangul_l_base + uni_hangul_l_count; +static const uint16_t uni_hangul_v_end = uni_hangul_v_base + uni_hangul_v_count; +static const uint16_t uni_hangul_t_end = uni_hangul_t_base + uni_hangul_t_count; +static const uint16_t uni_hangul_s_end = 0xD7A4; static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3]) { @@ -240,6 +243,661 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3]) return 3; } +static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r) +{ + /* The Unicode Standard, Section 3.12.3: + Hangul Syllable Composition + */ + + /* */ + if (l >= uni_hangul_l_base && l < uni_hangul_l_end && + r >= uni_hangul_v_base && r < uni_hangul_v_end) { + uint32_t l_part = l, v_part = r; + + unsigned int l_index = l_part - uni_hangul_l_base; + unsigned int v_index = v_part - uni_hangul_v_base; + unsigned int lv_index = l_index * uni_hangul_n_count + + v_index * uni_hangul_t_count; + return uni_hangul_s_base + lv_index; + } + /* A sequence */ + if (l >= uni_hangul_s_base && l < uni_hangul_s_end && + r >= (uni_hangul_t_base + 1) && r < uni_hangul_t_end && + ((l - uni_hangul_s_base) % uni_hangul_t_count) == 0) { + uint32_t lv_part = l, t_part = r; + + unsigned int t_index = t_part - uni_hangul_t_base; + return lv_part + t_index; + } + return 0x0000; +} + +/* + * Normalization transform: NFD, NFKD, NFC, NFKC + */ + +static ssize_t +unicode_nf_input(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r); +static int +unicode_nf_flush(struct unicode_transform *trans, bool finished, + const char **error_r); + +static const struct unicode_transform_def unicode_nf_def = { + .input = unicode_nf_input, + .flush = unicode_nf_flush, +}; + +void unicode_nf_init(struct unicode_nf_context *ctx_r, + enum unicode_nf_type type) +{ + i_zero(ctx_r); + unicode_transform_init(&ctx_r->transform, &unicode_nf_def); + + switch (type) { + case UNICODE_NFD: + ctx_r->canonical = TRUE; + ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; + break; + case UNICODE_NFKD: + ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; + break; + case UNICODE_NFC: + ctx_r->compose = TRUE; + ctx_r->canonical = TRUE; + ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; + break; + case UNICODE_NFKC: + ctx_r->compose = TRUE; + ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; + break; + } +} + +void unicode_nf_reset(struct unicode_nf_context *ctx) +{ + enum unicode_nf_type type = + (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) : + (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD)); + struct unicode_transform *next = ctx->transform.next; + + unicode_nf_init(ctx, type); + unicode_transform_chain(&ctx->transform, next); +} + +static void +unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset, + size_t count) +{ + if (count == 0) + return; + + i_assert(offset < ctx->buffer_len); + i_assert(count <= ctx->buffer_len); + i_assert(offset <= (ctx->buffer_len - count)); + + if (count == ctx->buffer_len) { + ctx->buffer_len = 0; + return; + } + + size_t trailer = ctx->buffer_len - (offset + count); + if (trailer > 0) { + memmove(&ctx->cp_buffer[offset], + &ctx->cp_buffer[offset + count], + trailer * sizeof(ctx->cp_buffer[0])); + memmove(&ctx->cpd_buffer[offset], + &ctx->cpd_buffer[offset + count], + trailer * sizeof(ctx->cpd_buffer[0])); + } + ctx->buffer_len -= count; +} + +static void +unicode_nf_buffer_swap(struct unicode_nf_context *ctx, + size_t idx1, size_t idx2) +{ + uint32_t tmp_cp = ctx->cp_buffer[idx2]; + const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2]; + + ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1]; + ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1]; + ctx->cp_buffer[idx1] = tmp_cp; + ctx->cpd_buffer[idx1] = tmp_cpd; +} + +static void +unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, + const struct unicode_code_point_data *cpd) +{ + static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; + uint8_t nf_qc_mask = ctx->nf_qc_mask; + size_t i; + + /* + * Decompose the code point + */ + + const uint32_t *decomp, *decomp_k; + uint32_t decomp_hangul[3]; + size_t len, len_k; + + if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) { + len = len_k = unicode_hangul_decompose(cp, decomp_hangul); + decomp = decomp_k = decomp_hangul; + } else { + if (cpd == NULL) + cpd = unicode_code_point_get_data(cp); + len = unicode_code_point_data_get_full_decomposition( + cpd, ctx->canonical, &decomp); + if (len == 0) { + decomp = &cp; + len = 1; + } + len_k = len; + decomp_k = decomp; + if (ctx->canonical) { + len_k = unicode_code_point_data_get_full_decomposition( + cpd, ctx->canonical, &decomp_k); + if (len_k == 0) { + decomp_k = decomp; + len_k = len; + } + } + if (len > 0) + cpd = NULL; + } + + i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH); + i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH); + + if ((ctx->buffer_len + len) > buffer_size) { + /* Decomposition overflows the buffer. Record and mark it as + pending and come back to it once the buffer is sufficiently + drained. */ + i_assert(ctx->pending_decomp == 0); + ctx->pending_decomp = len; + ctx->pending_cp = cp; + ctx->pending_cpd = cpd; + return; + } + + /* UAX15-D4: Stream-Safe Text Process is the process of producing a + Unicode string in Stream-Safe Text Format by processing that string + from start to finish, inserting U+034F COMBINING GRAPHEME JOINER + (CGJ) within long sequences of non-starters. The exact position o + the inserted CGJs are determined according to the following + algorithm, which describes the generation of an output string from an + input string: + + 1. If the input string is empty, return an empty output string. + 2. Set nonStarterCount to zero. + 3. For each code point C in the input string: + a. Produce the NFKD decomposition S. + b. If nonStarterCount plus the number of initial non-starters in + S is greater than 30, append a CGJ to the output string and + set the nonStarterCount to zero. + c. Append C to the output string. + d. If there are no starters in S, increment nonStarterCount by + the number of code points in S; otherwise, set + nonStarterCount to the number of trailing non-starters in S + (which may be zero). + 4. Return the output string. + */ + + /* Determine number of leading and trailing non-starters in full NFKD + decomposition. */ + const struct unicode_code_point_data * + decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH]; + size_t ns_lead = 0, ns_trail = 0; + bool seen_starter = FALSE; + for (i = 0; i < len_k; i++) { + if (cpd == NULL) + cpd = unicode_code_point_get_data(decomp[i]); + + uint8_t ccc = cpd->canonical_combining_class; + + if (decomp == decomp_k) { + decomp_cpd[i] = cpd; + cpd = NULL; + } + + if (ccc == 0) + seen_starter = TRUE; + else if (!seen_starter) + ns_lead++; + else + ns_trail++; + } + + /* Lookup canonical decomposed code points if necessary (avoid double + lookups). */ + if (decomp != decomp_k) { + for (i = 0; i < len; i++) { + if (cpd == NULL) + cpd = unicode_code_point_get_data(decomp[i]); + decomp_cpd[i] = cpd; + cpd = NULL; + } + } + + ctx->nonstarter_count += ns_lead; + if (ctx->nonstarter_count > 30) { + ctx->nonstarter_count = ns_trail; + + /* Write U+034F COMBINING GRAPHEME JOINER (CGJ) + */ + ctx->cp_buffer[ctx->buffer_len] = 0x034F; + ctx->cpd_buffer[ctx->buffer_len] = + unicode_code_point_get_data(0x034F); + ctx->buffer_len++; + } + + /* + * Buffer the requested decomposition for COA sorting + */ + + i_assert(ctx->buffer_len <= buffer_size); + if ((ctx->buffer_len + len) > buffer_size) { + /* Decomposition now overflows the buffer. Record and mark it as + pending and come back to it once the buffer is sufficiently + drained. */ + i_assert(ctx->pending_decomp == 0); + ctx->pending_decomp = len; + ctx->pending_cp = cp; + ctx->pending_cpd = cpd; + } else { + for (i = 0; i < len; i++) { + ctx->cp_buffer[ctx->buffer_len] = decomp[i]; + ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i]; + ctx->buffer_len++; + } + i_assert(ctx->buffer_len <= buffer_size); + } + + /* + * Apply the Canonical Ordering Algorithm (COA) + */ + + bool changed = TRUE; + size_t last_qc_y; + size_t last_starter; + + while (changed) { + changed = FALSE; + last_qc_y = 0; + last_starter = 0; + + for (i = I_MAX(1, ctx->buffer_output_max); + i < ctx->buffer_len; i++) { + const struct unicode_code_point_data + *cpd_i = ctx->cpd_buffer[i], + *cpd_im1 = ctx->cpd_buffer[i - 1]; + uint8_t ccc_i = cpd_i->canonical_combining_class; + uint8_t ccc_im1 = cpd_im1->canonical_combining_class; + bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0); + + if (ccc_i == 0) { + last_starter = i; + if (nqc) + last_qc_y = i; + } else if (ccc_im1 > ccc_i) { + unicode_nf_buffer_swap(ctx, i - 1, i); + changed = TRUE; + } + } + } + ctx->buffer_output_max = I_MIN(last_qc_y, last_starter); +} + +static bool +unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp, + const struct unicode_code_point_data *cpd) +{ + static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; + + i_assert(ctx->buffer_len <= buffer_size); + if (ctx->buffer_len == buffer_size || + (ctx->pending_decomp > 0 && + ctx->buffer_len > (buffer_size - ctx->pending_decomp))) { + /* Buffer is (still too) full. */ + return FALSE; + } + + if (ctx->pending_decomp > 0) { + /* Earlier, the buffer was too full for the next decomposition + and it was recorded and marked as pending. Now, we have the + opportunity to continue. */ + unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd); + ctx->pending_decomp = 0; + + i_assert(ctx->buffer_len <= buffer_size); + if (ctx->buffer_output_max > 0 && + ctx->buffer_len == buffer_size) { + /* Pending decomposition filled the buffer completely. + */ + return FALSE; + } + } + + /* Normal input of next code point */ + unicode_nf_cp(ctx, cp, cpd); + return TRUE; +} + +static ssize_t +unicode_nf_input(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r ATTR_UNUSED) +{ + struct unicode_nf_context *ctx = + container_of(trans, struct unicode_nf_context, transform); + size_t n; + + for (n = 0; n < buf->cp_count; n++) { + if (!unicode_nf_input_cp(ctx, buf->cp[n], + (buf->cp_data == NULL ? + NULL : buf->cp_data[n]))) + break; + } + return n; +} + +static uint32_t +unicode_nf_compose_pair(uint32_t l, uint32_t r, + const struct unicode_code_point_data **l_data) +{ + uint32_t comp = unicode_hangul_compose_pair(l, r); + + if (comp > 0x0000) + return comp; + + if (*l_data == NULL) + *l_data = unicode_code_point_get_data(l); + return unicode_code_point_data_find_composition(*l_data, r); +} + +static int +unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished, + const char **error_r) +{ + struct unicode_transform *trans = &ctx->transform; + + ctx->finished = finished; + + if (ctx->buffer_len == 0) + return 1; + if (!finished && ctx->buffer_output_max == 0) + return 0; + + /* + * Apply the Canonical Composition Algorithm + */ + + if (ctx->finished) + ctx->buffer_output_max = ctx->buffer_len; + i_assert(ctx->buffer_processed <= ctx->buffer_output_max); + if (ctx->compose && ctx->buffer_len > 1) { + size_t in_pos, out_pos, starter; + int last_ccc; + + out_pos = 1; + last_ccc = -1; + starter = 0; + for (in_pos = I_MAX(1, ctx->buffer_processed); + in_pos < ctx->buffer_output_max; in_pos++) { + uint32_t cp = ctx->cp_buffer[in_pos]; + const struct unicode_code_point_data *cpd = + ctx->cpd_buffer[in_pos]; + + if (cpd == NULL) { + ctx->cpd_buffer[in_pos] = cpd = + unicode_code_point_get_data(cp); + } + + uint8_t ccc = cpd->canonical_combining_class; + uint32_t comp = 0x0000; + if (last_ccc < (int)ccc) { + comp = unicode_nf_compose_pair( + ctx->cp_buffer[starter], cp, + &ctx->cpd_buffer[starter]); + } + if (comp > 0x0000) { + ctx->cp_buffer[starter] = comp; + ctx->cpd_buffer[starter] = NULL; + } else if (ccc == 0) { + starter = out_pos; + last_ccc = -1; + ctx->cp_buffer[out_pos] = cp; + ctx->cpd_buffer[out_pos] = cpd; + out_pos++; + } else { + last_ccc = ccc; + ctx->cp_buffer[out_pos] = cp; + ctx->cpd_buffer[out_pos] = cpd; + out_pos++; + } + } + if (finished) { + ctx->buffer_len = ctx->buffer_output_max = out_pos; + } else if (in_pos > out_pos) { + unicode_nf_buffer_delete(ctx, out_pos, + (in_pos - out_pos)); + ctx->buffer_output_max = out_pos; + } + } + ctx->buffer_processed = ctx->buffer_output_max; + + /* + * Forward output + */ + + size_t output_len = ctx->buffer_processed; + ssize_t sret; + + sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer, + output_len, error_r); + if (sret < 0) + return -1; + + i_assert((size_t)sret <= ctx->buffer_processed); + unicode_nf_buffer_delete(ctx, 0, sret); + ctx->buffer_processed -= sret; + ctx->buffer_output_max -= sret; + if ((size_t)sret < output_len) + return 0; + return 1; +} + +static int +unicode_nf_flush(struct unicode_transform *trans, bool finished, + const char **error_r) +{ + struct unicode_nf_context *ctx = + container_of(trans, struct unicode_nf_context, transform); + int ret; + + ret = unicode_nf_flush_more(ctx, finished, error_r); + if (ret <= 0) + return ret; + + if (finished && ctx->pending_decomp > 0) { + unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd); + ctx->pending_decomp = 0; + } + + return unicode_nf_flush_more(ctx, finished, error_r); +} + +/* + * Normalization check + */ + +static ssize_t +unicode_nf_check_sink_input(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r); + +static const struct unicode_transform_def unicode_nf_check_sink_def = { + .input = unicode_nf_check_sink_input, +}; + +void unicode_nf_checker_init(struct unicode_nf_checker *unc_r, + enum unicode_nf_type type) +{ + i_zero(unc_r); + + switch (type) { + case UNICODE_NFD: + unc_r->canonical = TRUE; + unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; + unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES; + unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO; + break; + case UNICODE_NFKD: + unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; + unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES; + unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO; + break; + case UNICODE_NFC: + unc_r->compose = TRUE; + unc_r->canonical = TRUE; + unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; + unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES; + unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO; + break; + case UNICODE_NFKC: + unc_r->compose = TRUE; + unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; + unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES; + unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO; + break; + } + + unicode_nf_init(&unc_r->nf, type); + unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def); + unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink); +} + +void unicode_nf_checker_reset(struct unicode_nf_checker *unc) +{ + enum unicode_nf_type type = + (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) : + (unc->canonical ? UNICODE_NFD : UNICODE_NFKD)); + + unicode_nf_checker_init(unc, type); +} + +static ssize_t +unicode_nf_check_sink_input(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r) +{ + struct unicode_nf_checker *unc = + container_of(trans, struct unicode_nf_checker, sink); + size_t n; + + i_assert(unc->buffer_len > 0); + i_assert(buf->cp_count <= unc->buffer_len); + for (n = 0; n < buf->cp_count; n++) { + if (buf->cp[n] != unc->cp_buffer[n]) { + *error_r = "Not normalized"; + return -1; + } + } + if (buf->cp_count == unc->buffer_len) + unc->buffer_len = 0; + else { + unc->buffer_len -= buf->cp_count; + memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count], + unc->buffer_len); + } + return buf->cp_count; +} + +int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp, + const struct unicode_code_point_data **_cp_data) +{ + const struct unicode_code_point_data *cpd_last = unc->cpd_last; + + if (*_cp_data == NULL) + *_cp_data = unicode_code_point_get_data(cp); + + const struct unicode_code_point_data *cp_data = *_cp_data; + const char *error; + int ret; + + unc->cpd_last = cp_data; + + if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID) + return -1; + if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no) + return 0; + if (cpd_last != NULL && cp_data->canonical_combining_class != 0 && + cpd_last->canonical_combining_class > + cp_data->canonical_combining_class) + return 0; + if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes && + cp_data->canonical_combining_class == 0) { + if (unc->buffer_len > 0) { + ret = unicode_transform_flush(&unc->nf.transform, + &error); + i_assert(ret != 0); + if (ret < 0) + return 0; + unicode_nf_reset(&unc->nf); + } + i_assert(unc->buffer_len == 0); + unc->cp_buffer[0] = cp; + return 1; + } + + struct unicode_transform_buffer buf; + ssize_t sret; + + if (unc->buffer_len == 0 && cpd_last != NULL) { + i_zero(&buf); + buf.cp = &unc->cp_buffer[0]; + buf.cp_data = &cpd_last; + buf.cp_count = 1; + + unc->buffer_len++; + sret = unicode_transform_input_buf(&unc->nf.transform, &buf, + &error); + i_assert(sret != 0); + if (sret < 0) + return 0; + } + + i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE); + unc->cp_buffer[unc->buffer_len] = cp; + unc->buffer_len++; + + i_zero(&buf); + buf.cp = &cp; + buf.cp_data = &cp_data; + buf.cp_count = 1; + sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error); + i_assert(sret != 0); + if (sret < 0) + return 0; + return 1; +} + +int unicode_nf_checker_finish(struct unicode_nf_checker *unc) +{ + if (unc->buffer_len == 0) + return 1; + + const char *error; + int ret; + + ret = unicode_transform_flush(&unc->nf.transform, &error); + i_assert(ret != 0); + return (ret > 0 ? 1 : 0); +} + /* * RFC 5051 - Simple Unicode Collation Algorithm */ diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h index e6e18d3177..3e3499f0e5 100644 --- a/src/lib/unicode-transform.h +++ b/src/lib/unicode-transform.h @@ -1,6 +1,11 @@ #ifndef UNICODE_NF_H #define UNICODE_NF_H +#define UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN 30 +#define UNICODE_NF_BUFFER_SIZE (UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN + 2) + +struct unicode_code_point_data; + /* * Transform API */ @@ -55,9 +60,21 @@ ssize_t uniform_transform_forward( const struct unicode_code_point_data *const *out_data, size_t out_len, const char **error_r); -ssize_t unicode_transform_input(struct unicode_transform *trans, - const uint32_t *in, size_t in_len, - const char **error_r); +ssize_t unicode_transform_input_buf(struct unicode_transform *trans, + const struct unicode_transform_buffer *buf, + const char **error_r); +static inline ssize_t +unicode_transform_input(struct unicode_transform *trans, + const uint32_t *in, size_t in_len, const char **error_r) +{ + struct unicode_transform_buffer buf = { + .cp = in, + .cp_count = in_len, + }; + + return unicode_transform_input_buf(trans, &buf, error_r); +} + int unicode_transform_flush(struct unicode_transform *trans, const char **error_r); @@ -84,6 +101,101 @@ void unicode_static_array_sink_init(struct unicode_static_array_sink *sink, uint32_t *array, size_t array_size, size_t *array_pos); +/* + * NFD, NFKD, NFC, NFKC + */ + +/* Unicode Standard Annex #15, Section 1.2: + + Unicode Normalization Forms are formally defined normalizations of Unicode + strings which make it possible to determine whether any two Unicode strings + are equivalent to each other. Depending on the particular Unicode + Normalization Form, that equivalence can either be a canonical equivalence or + a compatibility equivalence. + + Essentially, the Unicode Normalization Algorithm puts all combining marks in + a specified order, and uses rules for decomposition and composition to + transform each string into one of the Unicode Normalization Forms. A binary + comparison of the transformed strings will then determine equivalence. + + The four Unicode Normalization Forms are summarized as follows: + + Normalization Form D (NFD) - Canonical Decomposition + Normalization Form KD (NFKD) - Compatibility Decomposition + Normalization Form C (NFC) - Canonical Decomposition, followed by + Canonical Composition + Normalization Form KC (NFKC) - Compatibility Decomposition, followed by + Canonical Composition + + There are two forms of normalization that convert to composite characters: + Normalization Form C and Normalization Form KC. The difference between these + depends on whether the resulting text is to be a canonical equivalent to the + original unnormalized text or a compatibility equivalent to the original + unnormalized text. (In NFKC and NFKD, a K is used to stand for compatibility + to avoid confusion with the C standing for composition.) Both types of + normalization can be useful in different circumstances. + */ + +enum unicode_nf_type { + UNICODE_NFD, + UNICODE_NFKD, + UNICODE_NFC, + UNICODE_NFKC, +}; + +struct unicode_nf_context { + struct unicode_transform transform; + + size_t nonstarter_count; + uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE]; + const struct unicode_code_point_data * + cpd_buffer[UNICODE_NF_BUFFER_SIZE]; + size_t buffer_len, buffer_processed, buffer_output_max; + + size_t pending_decomp; + uint32_t pending_cp; + const struct unicode_code_point_data *pending_cpd; + + uint8_t nf_qc_mask; + + bool compose:1; + bool canonical:1; + bool finished:1; +}; + +void unicode_nf_init(struct unicode_nf_context *ctx_r, + enum unicode_nf_type type); +void unicode_nf_reset(struct unicode_nf_context *ctx); + +/* + * Normalization check + */ + +struct unicode_nf_checker { + const struct unicode_code_point_data *cpd_last; + + uint8_t nf_qc_mask; + uint8_t nf_qc_yes; + uint8_t nf_qc_no; + + uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE]; + size_t buffer_len; + struct unicode_nf_context nf; + struct unicode_transform sink; + + bool not_first_cp; + bool compose:1; + bool canonical:1; +}; + +void unicode_nf_checker_init(struct unicode_nf_checker *unc_r, + enum unicode_nf_type type); +void unicode_nf_checker_reset(struct unicode_nf_checker *unc); + +int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp, + const struct unicode_code_point_data **cp_data); +int unicode_nf_checker_finish(struct unicode_nf_checker *unc); + /* * RFC 5051 - Simple Unicode Collation Algorithm */