shared/utf8: add utf8_is_valid_n()

author Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>

Tue, 1 Sep 2020 08:43:21 +0000 (10:43 +0200)

committer Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>

Tue, 1 Sep 2020 14:48:40 +0000 (16:48 +0200)
author Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Tue, 1 Sep 2020 08:43:21 +0000 (10:43 +0200)
committer Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Tue, 1 Sep 2020 14:48:40 +0000 (16:48 +0200)
diff --git a/src/basic/utf8.c b/src/basic/utf8.c

index 174075be54953be99e7d71a9f502b9391178324a..f0233397ef2ad344635037c0b8b39f88d34c6a53 100644 (file)
--- a/src/basic/utf8.c
+++ b/src/basic/utf8.c
@@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin
          return true;
  }
  
-char *utf8_is_valid(const char *str) {
-        const char *p;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) {
+        /* Check if the string is composed of valid UTF-8 characters. If len_bytes is not (size_t) -1, stop
+         * after len_bytes bytes and treat an embedded NUL as invalid. Otherwise, stop at the first NUL. */
  
          assert(str);
  
-        p = str;
-        while (*p) {
+        for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) {
                  int len;
  
-                len = utf8_encoded_valid_unichar(p, (size_t) -1);
-                if (len < 0)
-                        return NULL;
+                if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1)
+                        return NULL; /* embedded NUL */
+
+                len = utf8_encoded_valid_unichar(p,
+                                                 len_bytes != (size_t) -1 ? len_bytes - (p - str) : (size_t) -1);
+                if (_unlikely_(len < 0))
+                        return NULL; /* invalid character */
  
                  p += len;
          }
diff --git a/src/basic/utf8.h b/src/basic/utf8.h

index 52b487955b1e387e566cbf68529dd815c132d162..f315ea0f1ed06eb7c73ac084a033157ce688833d 100644 (file)
--- a/src/basic/utf8.h
+++ b/src/basic/utf8.h
@@ -14,7 +14,10 @@
  
  bool unichar_is_valid(char32_t c);
  
-char *utf8_is_valid(const char *s) _pure_;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_;
+static inline char *utf8_is_valid(const char *s) {
+        return utf8_is_valid_n(s, (size_t) -1);
+}
  char *ascii_is_valid(const char *s) _pure_;
  char *ascii_is_valid_n(const char *str, size_t len);
  
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c

index 8937f56237f67bc3a886e55ccbde5ca427e6068e..66003ac13eb112ff65f1df903f98f14d23be4554 100644 (file)
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) {
          assert_se(utf8_is_printable("\t", 1));
  }
  
+static void test_utf8_n_is_valid(void) {
+        log_info("/* %s */", __func__);
+
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 21));
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 22));
+        assert_se(!utf8_is_valid_n("ascii is valid unicode", 23));
+        assert_se( utf8_is_valid_n("\342\204\242", 0));
+        assert_se(!utf8_is_valid_n("\342\204\242", 1));
+        assert_se(!utf8_is_valid_n("\342\204\242", 2));
+        assert_se( utf8_is_valid_n("\342\204\242", 3));
+        assert_se(!utf8_is_valid_n("\342\204\242", 4));
+        assert_se( utf8_is_valid_n("<ZZ>", 0));
+        assert_se( utf8_is_valid_n("<ZZ>", 1));
+        assert_se( utf8_is_valid_n("<ZZ>", 2));
+        assert_se( utf8_is_valid_n("<ZZ>", 3));
+        assert_se( utf8_is_valid_n("<ZZ>", 4));
+        assert_se(!utf8_is_valid_n("<ZZ>", 5));
+}
+
  static void test_utf8_is_valid(void) {
          log_info("/* %s */", __func__);
  
@@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) {
  }
  
  int main(int argc, char *argv[]) {
+        test_utf8_n_is_valid();
          test_utf8_is_valid();
          test_utf8_is_printable();
          test_ascii_is_valid();
author	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
	Tue, 1 Sep 2020 08:43:21 +0000 (10:43 +0200)
committer	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
	Tue, 1 Sep 2020 14:48:40 +0000 (16:48 +0200)
src/basic/utf8.c		patch \| blob \| blame \| history
src/basic/utf8.h		patch \| blob \| blame \| history
src/test/test-utf8.c		patch \| blob \| blame \| history