From: Zbigniew Jędrzejewski-Szmek Date: Tue, 1 Sep 2020 08:43:21 +0000 (+0200) Subject: shared/utf8: add utf8_is_valid_n() X-Git-Tag: v247-rc1~318^2~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=80ab31a43577ab95eb3ddfac637bd792989555b1;p=thirdparty%2Fsystemd.git shared/utf8: add utf8_is_valid_n() Sometimes we need to check strings without the terminating NUL. Add a variant that does that. --- diff --git a/src/basic/utf8.c b/src/basic/utf8.c index 174075be549..f0233397ef2 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin return true; } -char *utf8_is_valid(const char *str) { - const char *p; +char *utf8_is_valid_n(const char *str, size_t len_bytes) { + /* Check if the string is composed of valid utf8 characters. If length len_bytes is given, stop after + * len_bytes. Otherwise, stop at NUL. */ assert(str); - p = str; - while (*p) { + for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) { int len; - len = utf8_encoded_valid_unichar(p, (size_t) -1); - if (len < 0) - return NULL; + if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1) + return NULL; /* embedded NUL */ + + len = utf8_encoded_valid_unichar(p, + len_bytes != (size_t) -1 ? 
len_bytes - (p - str) : (size_t) -1); + if (_unlikely_(len < 0)) + return NULL; /* invalid character */ p += len; } diff --git a/src/basic/utf8.h b/src/basic/utf8.h index 52b487955b1..f315ea0f1ed 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -14,7 +14,10 @@ bool unichar_is_valid(char32_t c); -char *utf8_is_valid(const char *s) _pure_; +char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_; +static inline char *utf8_is_valid(const char *s) { + return utf8_is_valid_n(s, (size_t) -1); +} char *ascii_is_valid(const char *s) _pure_; char *ascii_is_valid_n(const char *str, size_t len); diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index 8937f56237f..66003ac13eb 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) { assert_se(utf8_is_printable("\t", 1)); } +static void test_utf8_n_is_valid(void) { + log_info("/* %s */", __func__); + + assert_se( utf8_is_valid_n("ascii is valid unicode", 21)); + assert_se( utf8_is_valid_n("ascii is valid unicode", 22)); + assert_se(!utf8_is_valid_n("ascii is valid unicode", 23)); + assert_se( utf8_is_valid_n("\342\204\242", 0)); + assert_se(!utf8_is_valid_n("\342\204\242", 1)); + assert_se(!utf8_is_valid_n("\342\204\242", 2)); + assert_se( utf8_is_valid_n("\342\204\242", 3)); + assert_se(!utf8_is_valid_n("\342\204\242", 4)); + assert_se( utf8_is_valid_n("<ZZ>", 0)); + assert_se( utf8_is_valid_n("<ZZ>", 1)); + assert_se( utf8_is_valid_n("<ZZ>", 2)); + assert_se( utf8_is_valid_n("<ZZ>", 3)); + assert_se( utf8_is_valid_n("<ZZ>", 4)); + assert_se(!utf8_is_valid_n("<ZZ>", 5)); +} + static void test_utf8_is_valid(void) { log_info("/* %s */", __func__); @@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) { } int main(int argc, char *argv[]) { + test_utf8_n_is_valid(); test_utf8_is_valid(); test_utf8_is_printable(); test_ascii_is_valid();