From f93ef23770a75e9318805284c320f89e42ef8b5e Mon Sep 17 00:00:00 2001
From: Stephan Bosch
Date: Fri, 1 Jun 2018 00:02:53 +0200
Subject: [PATCH] lib: Add str_truncate_utf8().

It works similar to str_truncate(), but it makes sure the truncated string
remains valid UTF-8.
---
 src/lib/str.c      | 10 ++++++++++
 src/lib/str.h      |  6 ++++++
 src/lib/test-str.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/src/lib/str.c b/src/lib/str.c
index 0f6d3b8488..2ec597023d 100644
--- a/src/lib/str.c
+++ b/src/lib/str.c
@@ -3,6 +3,7 @@
 #include "lib.h"
 #include "buffer.h"
 #include "printf-format-fix.h"
+#include "unichar.h"
 #include "str.h"
 
 #include
@@ -146,3 +147,12 @@ void str_vprintfa(string_t *str, const char *fmt, va_list args)
 	/* drop the unused data, including terminating NUL */
 	buffer_set_used_size(str, pos + ret);
 }
+
+void str_truncate_utf8(string_t *str, size_t len)
+{
+	size_t size = str_len(str);
+
+	if (size <= len)
+		return;
+	str_truncate(str, uni_utf8_data_truncate(str_data(str), size, len));
+}
diff --git a/src/lib/str.h b/src/lib/str.h
index 16ccc9314c..e29b9b0cad 100644
--- a/src/lib/str.h
+++ b/src/lib/str.h
@@ -78,4 +78,10 @@ static inline void str_truncate(string_t *str, size_t len)
 	buffer_set_used_size(str, len);
 }
 
+/* Truncate the string to at most len bytes, but also make sure the truncation
+   doesn't happen in the middle of a UTF-8 character sequence. In that case,
+   the string will end up being up to a few bytes shorter than len. If the
+   string is already len bytes or shorter to begin with, do nothing. */
+void str_truncate_utf8(string_t *str, size_t len);
+
 #endif
diff --git a/src/lib/test-str.c b/src/lib/test-str.c
index 79ce5fbc45..e20e7f98f9 100644
--- a/src/lib/test-str.c
+++ b/src/lib/test-str.c
@@ -1,6 +1,7 @@
 /* Copyright (c) 2012-2018 Dovecot authors, see the included COPYING file */
 
 #include "test-lib.h"
+#include "unichar.h"
 #include "str.h"
 
 static void test_str_append(void)
@@ -127,6 +128,48 @@ static void test_str_truncate(void)
 	test_end();
 }
 
+static void test_str_truncate_utf8(void)
+{
+	string_t *str = t_str_new(8);
+	int i;
+
+	test_begin("str_truncate_utf8()");
+	str_append(str, "123456");
+	for (i = 100; i >= 6; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(str_len(str) == 6, i);
+	}
+	for (; i >= 0; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(str_len(str) == (unsigned int)i, i);
+	}
+
+	str_append(str, "\xE4\xB8\x80\xE4\xBa\x8C\xE4\xB8\x89"
+		   "\xE5\x9b\x9b\xE4\xBa\x94\xE5\x85\xAD");
+	for (i = 100; i >= 18; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(str_len(str) == 18, i);
+	}
+	for (; i >= 0; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(str_len(str) % 3 == 0, i);
+		test_assert_idx((str_len(str) / 3) == ((unsigned int)i / 3), i);
+	}
+
+	str_append(str, "\xE4\xB8\x80""1""\xE4\xBa\x8C""2""\xE4\xB8\x89""3"
+		   "\xE5\x9b\x9b""4""\xE4\xBa\x94""5""\xE5\x85\xAD""6");
+	for (i = 100; i >= 24; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(str_len(str) == 24, i);
+	}
+	for (; i >= 0; i--) {
+		str_truncate_utf8(str, i);
+		test_assert_idx(uni_utf8_data_is_valid(str_data(str),
+						       str_len(str)), i);
+	}
+	test_end();
+}
+
 void test_str(void)
 {
 	test_str_append();
@@ -135,4 +178,5 @@ void test_str(void)
 	test_str_delete();
 	test_str_append_max();
 	test_str_truncate();
+	test_str_truncate_utf8();
 }
-- 
2.47.3