string: add string_is_utf8() helper

author cypherpunks <cypherpunks@torproject.org>

Wed, 29 Aug 2018 03:22:30 +0000 (03:22 +0000)

committer cypherpunks <cypherpunks@torproject.org>

Mon, 3 Sep 2018 13:54:43 +0000 (13:54 +0000)
author cypherpunks <cypherpunks@torproject.org>
Wed, 29 Aug 2018 03:22:30 +0000 (03:22 +0000)
committer cypherpunks <cypherpunks@torproject.org>
Mon, 3 Sep 2018 13:54:43 +0000 (13:54 +0000)
diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c

index a6b0a3d68a66ca16e89f7c0eaa77299b215a1feb..b2b85d151d2d46de6217b85e0defbe6cc101ad97 100644 (file)
--- a/src/lib/string/util_string.c
+++ b/src/lib/string/util_string.c
@@ -451,3 +451,93 @@ string_is_C_identifier(const char *string)
  
    return 1;
  }
+
+/** A byte with the top <b>x</b> bits set and all lower bits clear; for
+ * example TOP_BITS(2) is 0xC0. <b>x</b> must not exceed 8, because the shift
+ * count is (8 - x). */
+#define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
+/** A byte with the lowest <b>x</b> bits set and all higher bits clear; for
+ * example LOW_BITS(6) is 0x3F. <b>x</b> must not exceed 8, because the shift
+ * count is (8 - x). */
+#define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
+
+/** Given the leading byte <b>b</b>, return the total number of bytes in the
+ * UTF-8 character. Returns 0 if it's an invalid leading byte.
+ *
+ * The length is read from the run of high bits in <b>b</b>: 0xxxxxxx is a
+ * 1-byte character, 110xxxxx starts a 2-byte one, 1110xxxx a 3-byte one and
+ * 11110xxx a 4-byte one. Only <b>b</b> itself is inspected here; the bytes
+ * that follow it are not checked by this function.
+ */
+static uint8_t
+bytes_in_char(uint8_t b)
+{
+  if ((TOP_BITS(1) & b) == 0x00)
+    return 1; // a 1-byte UTF-8 char, aka ASCII: 0xxxxxxx
+  if ((TOP_BITS(3) & b) == TOP_BITS(2))
+    return 2; // a 2-byte UTF-8 char: 110xxxxx
+  if ((TOP_BITS(4) & b) == TOP_BITS(3))
+    return 3; // a 3-byte UTF-8 char: 1110xxxx
+  if ((TOP_BITS(5) & b) == TOP_BITS(4))
+    return 4; // a 4-byte UTF-8 char: 11110xxx
+
+  // Invalid: top 2 bits are 10 (a continuation byte) or top 5 are 11111.
+  return 0;
+}
+
+/** Returns true iff <b>b</b> is a UTF-8 continuation byte, that is, its top
+ * two bits are 10 (a value from 0x80 through 0xBF). */
+static bool
+is_continuation_byte(uint8_t b)
+{
+  uint8_t top2bits = b & TOP_BITS(2); // keep only the two highest bits
+  return top2bits == TOP_BITS(1); // TOP_BITS(1) is 0x80: top two bits are 10
+}
+
+/** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
+ * character.
+ *
+ * <b>len</b> is the sequence length implied by the leading byte c[0]. The
+ * caller visible here, string_is_utf8(), passes the nonzero result of
+ * bytes_in_char() (a value from 1 to 4) and has already checked that
+ * <b>len</b> bytes are available at <b>c</b>.
+ *
+ * When <b>len</b> is 1 the byte is accepted without being read. Otherwise the
+ * codepoint is decoded and the character is rejected if any byte after the
+ * first is not a continuation byte, if the codepoint would have fit in a
+ * shorter encoding, if it lies in the UTF-16 surrogate range 0xd800..0xdfff,
+ * or if it is greater than 0x10ffff.
+ */
+static bool
+validate_char(const uint8_t *c, uint8_t len)
+{
+  if (len == 1)
+    return true; // already validated this is an ASCII char earlier.
+
+  uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
+  uint32_t codepoint = c[0] & mask; // payload bits of the leading byte
+
+  mask = LOW_BITS(6); // bitmask for continuation bytes.
+  for (uint8_t i = 1; i < len; i++) {
+    if (!is_continuation_byte(c[i]))
+      return false;
+    codepoint <<= 6; // make room: each continuation byte carries 6 bits
+    codepoint |= (c[i] & mask);
+  }
+
+  if (len == 2 && codepoint <= 0x7f)
+    return false; // Invalid, overly long encoding, should have fit in 1 byte.
+
+  if (len == 3 && codepoint <= 0x7ff)
+    return false; // Invalid, overly long encoding, should have fit in 2 bytes.
+
+  if (len == 4 && codepoint <= 0xffff)
+    return false; // Invalid, overly long encoding, should have fit in 3 bytes.
+
+  if (codepoint >= 0xd800 && codepoint <= 0xdfff)
+    return false; // Invalid, reserved for UTF-16 surrogate pairs.
+
+  return codepoint <= 0x10ffff; // Check if within maximum.
+}
+
+/** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
+ * valid UTF-8 string.
+ *
+ * Exactly <b>len</b> bytes are examined: a NUL byte is treated as an ordinary
+ * one-byte character rather than as a terminator, and a multi-byte character
+ * that is cut off by <b>len</b> makes the whole string invalid. When
+ * <b>len</b> is 0 the result is true and <b>str</b> is never dereferenced, so
+ * it may be NULL. The return type is int, so the result is 1 for valid input
+ * and 0 for invalid input.
+ */
+int
+string_is_utf8(const char *str, size_t len)
+{
+  for (size_t i = 0; i < len;) { // i advances by one character per pass
+    uint8_t num_bytes = bytes_in_char(str[i]);
+    if (num_bytes == 0) // Invalid leading byte found.
+      return false;
+
+    size_t next_char = i + num_bytes;
+    if (next_char > len) // This character would extend past len.
+      return false;
+
+    // Validate this character (continuation bytes, overlong forms, range),
+    // and advance to the next character in the string.
+    if (!validate_char((const uint8_t*)&str[i], num_bytes))
+      return false;
+    i = next_char;
+  }
+  return true;
+}
diff --git a/src/lib/string/util_string.h b/src/lib/string/util_string.h

index 471613462a547749922a1693dbc564b9a86c81b0..746ece0d33141f42202de96a5399cbb8db55f61e 100644 (file)
--- a/src/lib/string/util_string.h
+++ b/src/lib/string/util_string.h
@@ -52,4 +52,6 @@ const char *find_str_at_start_of_line(const char *haystack,
  
  int string_is_C_identifier(const char *string);
  
+int string_is_utf8(const char *str, size_t len);
+
  #endif /* !defined(TOR_UTIL_STRING_H) */
diff --git a/src/test/test_util.c b/src/test/test_util.c

index 6cbd504e349297a8d64ab35078fab91ecd211a85..bdf8b5dafa52e4dbeea346e8af22081cd2527cbe 100644 (file)
--- a/src/test/test_util.c
+++ b/src/test/test_util.c
@@ -4011,6 +4011,53 @@ test_util_string_is_C_identifier(void *ptr)
    ;
  }
  
+static void
+test_util_string_is_utf8(void *ptr)
+{
+  (void)ptr; // unused
+
+  tt_int_op(1, OP_EQ, string_is_utf8(NULL, 0)); // len 0: str is never read
+  tt_int_op(1, OP_EQ, string_is_utf8("", 1)); // a single NUL byte
+  tt_int_op(1, OP_EQ, string_is_utf8("\uFEFF", 3)); // 3 bytes in UTF-8
+  tt_int_op(1, OP_EQ, string_is_utf8("\uFFFE", 3)); // 3 bytes in UTF-8
+  tt_int_op(1, OP_EQ, string_is_utf8("ascii\x7f\n", 7));
+  tt_int_op(1, OP_EQ, string_is_utf8("Risqu\u00e9=1", 9)); // e-acute: 2 bytes
+
+  // Validate exactly 'len' bytes: no early stop at NUL, no reading past len.
+  tt_int_op(0, OP_EQ, string_is_utf8("\0\x80", 2)); // scan continues past NUL
+  tt_int_op(0, OP_EQ, string_is_utf8("Risqu\u00e9=1", 6)); // cut mid-character
+
+  // Reject sequences with missing bytes.
+  tt_int_op(0, OP_EQ, string_is_utf8("\x80", 1)); // lone continuation byte
+  tt_int_op(0, OP_EQ, string_is_utf8("\xc2", 1)); // lead byte, input ends
+  tt_int_op(0, OP_EQ, string_is_utf8("\xc2 ", 2)); // lead byte, then ASCII
+  tt_int_op(0, OP_EQ, string_is_utf8("\xe1\x80", 2));
+  tt_int_op(0, OP_EQ, string_is_utf8("\xe1\x80 ", 3));
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf1\x80\x80", 3));
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf1\x80\x80 ", 4));
+
+  // Reject overly long encodings (decoded codepoint noted on each line).
+  tt_int_op(0, OP_EQ, string_is_utf8("\xc1\xbf", 2)); // U+007F
+  tt_int_op(1, OP_EQ, string_is_utf8("\xc2\x80", 2)); // U+0080
+  tt_int_op(0, OP_EQ, string_is_utf8("\xe0\x9f\xbf", 3)); // U+07FF
+  tt_int_op(1, OP_EQ, string_is_utf8("\xe0\xa0\x80", 3)); // U+0800
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf0\x8f\xbf\xbf", 4)); // U+FFFF
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x80\x80", 4)); // U+10000
+
+  // Reject UTF-16 surrogate halves (U+D800 through U+DFFF).
+  tt_int_op(1, OP_EQ, string_is_utf8("\xed\x9f\xbf", 3)); // U+D7FF
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x80", 3)); // U+D800
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3)); // U+DFFF
+  tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3)); // U+E000
+
+  // The maximum legal codepoint, 10FFFF.
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4)); // U+10FFFF
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4)); // 0x110000
+
+ done:
+  ;
+}
+
  static void
  test_util_asprintf(void *ptr)
  {
@@ -6398,6 +6445,7 @@ struct testcase_t util_tests[] = {
    UTIL_TEST(clamp_double_to_int64, 0),
    UTIL_TEST(find_str_at_start_of_line, 0),
    UTIL_TEST(string_is_C_identifier, 0),
+  UTIL_TEST(string_is_utf8, 0),
    UTIL_TEST(asprintf, 0),
    UTIL_TEST(listdir, 0),
    UTIL_TEST(parent_dir, 0),
author	cypherpunks <cypherpunks@torproject.org>
	Wed, 29 Aug 2018 03:22:30 +0000 (03:22 +0000)
committer	cypherpunks <cypherpunks@torproject.org>
	Mon, 3 Sep 2018 13:54:43 +0000 (13:54 +0000)
src/lib/string/util_string.c		patch \| blob \| blame \| history
src/lib/string/util_string.h		patch \| blob \| blame \| history
src/test/test_util.c		patch \| blob \| blame \| history