From fd5a7429ca5c4d38e43f69876aac06a14bd69e98 Mon Sep 17 00:00:00 2001
From: Stephan Bosch
Date: Fri, 16 Oct 2020 13:03:46 +0200
Subject: [PATCH] lib: unichar - Add uni_utf8_get_char_buf()

Unlike the existing alternatives, this will not stop reading input at '\0'.
---
 src/lib/unichar.c | 35 +++++++++++++++++++++++------------
 src/lib/unichar.h |  1 +
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index 8efacffc09..70db1c7350 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -29,22 +29,18 @@ unsigned int uni_strlen(const unichar_t *str)
 	return len;
 }
 
-int uni_utf8_get_char(const char *input, unichar_t *chr_r)
-{
-	return uni_utf8_get_char_n((const unsigned char *)input, SIZE_MAX,
-				   chr_r);
-}
-
-int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
+static int
+uni_utf8_parse_char(const void *_buffer, size_t size, bool cstr,
+		    unichar_t *chr_r)
 {
 	static unichar_t lowest_valid_chr_table[] =
 		{ 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
-	const unsigned char *input = _input;
+	const unsigned char *input = _buffer;
 	unichar_t chr, lowest_valid_chr;
 	unsigned int i, len;
 	int ret;
 
-	i_assert(max_len > 0);
+	i_assert(size > 0);
 
 	if (*input < 0x80) {
 		*chr_r = *input;
@@ -77,20 +73,20 @@ int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
 		return -1;
 	}
 
-	if (len <= max_len) {
+	if (len <= size) {
 		lowest_valid_chr = lowest_valid_chr_table[len];
 		ret = len;
 	} else {
 		/* check first if the input is invalid before returning 0 */
 		lowest_valid_chr = 0;
 		ret = 0;
-		len = max_len;
+		len = size;
 	}
 
 	/* the following bytes must all be 10xxxxxx */
 	for (i = 1; i < len; i++) {
 		if ((input[i] & 0xc0) != 0x80) {
-			return (max_len == SIZE_MAX && input[i] == '\0' ?
+			return (cstr && size == SIZE_MAX && input[i] == '\0' ?
 				0 : -1);
 		}
 
@@ -110,6 +106,21 @@ int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
 	return ret;
 }
 
+int uni_utf8_get_char(const char *input, unichar_t *chr_r)
+{
+	return uni_utf8_parse_char(input, SIZE_MAX, TRUE, chr_r);
+}
+
+int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r)
+{
+	return uni_utf8_parse_char(input, max_len, TRUE, chr_r);
+}
+
+int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r)
+{
+	return uni_utf8_parse_char(buffer, size, FALSE, chr_r);
+}
+
 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output)
 {
 	unichar_t chr;
diff --git a/src/lib/unichar.h b/src/lib/unichar.h
index d75a6ba34c..e26acaf298 100644
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -89,6 +89,7 @@ static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr)
    -1 for invalid input. */
 int uni_utf8_get_char(const char *input, unichar_t *chr_r);
 int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
+int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r);
 /* Returns number of characters in UTF-8 string. */
 unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
 /* Returns number of characters in UTF-8 input of specified size. */
-- 
2.47.3