test utf8

author dota17 <chenguopingdota@163.com>

Wed, 8 Jan 2020 11:42:05 +0000 (19:42 +0800)

committer dota17 <chenguopingdota@163.com>

Fri, 10 Jan 2020 10:33:14 +0000 (18:33 +0800)
author dota17 <chenguopingdota@163.com>
Wed, 8 Jan 2020 11:42:05 +0000 (19:42 +0800)
committer dota17 <chenguopingdota@163.com>
Fri, 10 Jan 2020 10:33:14 +0000 (18:33 +0800)
diff --git a/json_tokener.c b/json_tokener.c

index fc8fb65f7eb8a663558add6f22fb1f87534e9c0e..2a8451df1643c7977a46436db6ab60de4d70422b 100644 (file)
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = {
    "object value separator ',' expected",
    "invalid string sequence",
    "expected comment",
+  "invalid utf-8 string",
    "buffer size overflow"
  };
  
@@ -222,8 +223,12 @@ struct json_object* json_tokener_parse_verbose(const char *str,
      :                                          \
      (((tok)->err = json_tokener_continue), 0)  \
      ) :                                                \
-   (((dest) = *str), 1)                                \
-   )
+   (((tok->flags & JSON_TOKENER_STRICT) &&   \
+    (!json_tokener_validate_utf8(*str, nBytesp)))?  \
+    ((tok->err = json_tokener_error_parse_utf8_string), 0)  \
+    :            \
+    (((dest) = *str), 1)                               \
+   ))
  
  /* ADVANCE_CHAR() macro:
   *   Increments str & tok->char_offset.
@@ -242,6 +247,9 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
  {
    struct json_object *obj = NULL;
    char c = '\1';
+  unsigned int nBytes = 0;
+  unsigned int *nBytesp = &nBytes;
+
  #ifdef HAVE_USELOCALE
    locale_t oldlocale = uselocale(NULL);
    locale_t newloc;
@@ -948,6 +956,10 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
    } /* while(PEEK_CHAR) */
  
   out:
+  if ((tok->flags & JSON_TOKENER_STRICT) && (nBytes != 0))
+  {
+    tok->err = json_tokener_error_parse_utf8_string;
+  }
    if (c &&
       (state == json_tokener_state_finish) &&
       (tok->depth == 0) &&
@@ -985,6 +997,37 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
    return NULL;
  }
  
+/*
+ * Incrementally validate a byte stream as UTF-8, one byte per call.
+ *
+ * c       - the next input byte.
+ * nBytes  - in/out state: the number of continuation bytes (10xxxxxx)
+ *           still expected for the sequence in progress.  The caller
+ *           starts with 0; a non-zero value once the input is exhausted
+ *           means the last sequence was truncated.
+ *
+ * Returns 1 if the byte is acceptable at this position, 0 otherwise.
+ * On failure *nBytes is left untouched.
+ *
+ * The legacy 5- and 6-byte forms (lead bytes 0xF8..0xFD) are still
+ * accepted, as before.  NOTE(review): RFC 3629 limits UTF-8 to 4 bytes;
+ * confirm whether these should be rejected too.
+ */
+json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes)
+{
+  unsigned char chr = (unsigned char)c;
+
+  if (*nBytes != 0)
+  {
+    /* In the middle of a sequence: only a continuation byte is valid. */
+    if ((chr & 0xC0) != 0x80)
+      return 0;
+    (*nBytes)--;
+    return 1;
+  }
+
+  /* Start of a new character. */
+  if (chr < 0x80)
+    return 1;               /* single-byte character */
+
+  /* 0xFE and 0xFF never appear in UTF-8; previously they were mistaken
+   * for the lead byte of a 5-byte sequence. */
+  if (chr > 0xFD)
+    return 0;
+
+  /* Record how many continuation bytes must follow the lead byte. */
+  if (chr >= 0xFC)
+    *nBytes = 5;            /* 6-byte form */
+  else if (chr >= 0xF8)
+    *nBytes = 4;            /* 5-byte form */
+  else if (chr >= 0xF0)
+    *nBytes = 3;            /* 4-byte form */
+  else if (chr >= 0xE0)
+    *nBytes = 2;            /* 3-byte form */
+  else if (chr >= 0xC0)
+    *nBytes = 1;            /* 2-byte form */
+  else
+    return 0;               /* stray continuation byte 0x80..0xBF */
+
+  return 1;
+}
+
  void json_tokener_set_flags(struct json_tokener *tok, int flags)
  {
         tok->flags = flags;
diff --git a/json_tokener.h b/json_tokener.h

index da2b24c161ca421887566ef2c683d29e6e9a87fc..061f81bca7582148bc765a08e17646d1a7225e1a 100644 (file)
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -38,6 +38,7 @@ enum json_tokener_error {
    json_tokener_error_parse_object_value_sep,
    json_tokener_error_parse_string,
    json_tokener_error_parse_comment,
+  json_tokener_error_parse_utf8_string,
    json_tokener_error_size
  };
  
@@ -162,6 +163,11 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
  JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
  JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);
  
+/**
+ * Validate UTF-8 input one byte at a time (used in strict mode).
+ * Returns 0 if the byte is not valid UTF-8, 1 otherwise.
+ */
+json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes);
  /**
   * Set flags that control how parsing will be done.
   */
diff --git a/tests/test_parse.c b/tests/test_parse.c

index 807b457d2cc5246040ce90627bcecc05ca4a69bf..14d4b11384c2b58e81a5cc96d7293aae2d554f95 100644 (file)
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -355,6 +355,41 @@ struct incremental_step {
         { "[1,2,3,]",         -1, 7, json_tokener_error_parse_unexpected, 3 },
         { "{\"a\":1,}",         -1, 7, json_tokener_error_parse_unexpected, 3 },
  
+  // utf-8 test
+  // ASCII encoding
+       { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 3 },
+       { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 1 },
+  // utf-8 encoding
+       { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 3 },
+       { "\x22\xe4\xb8",-1, -1, json_tokener_error_parse_utf8_string, 2 },
+       { "\x96\xe7\x95\x8c\x22",-1, 0, json_tokener_error_parse_utf8_string, 3 },
+       { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 1 },
+       { "\x22\xcf\x80\xcf\x86\x22",-1, -1, json_tokener_success, 3 },
+       { "\x22\xf0\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 },
+       { "\x22\xf8\xa5\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 },
+       { "\x22\xfd\xa5\xa5\xa5\x91\x95\x22",-1, -1, json_tokener_success, 3 },
+  // wrong utf-8 encoding
+       { "\x22\xe6\x9d\x4e\x22",-1, 3, json_tokener_error_parse_utf8_string, 3 },
+       { "\x22\xe6\x9d\x4e\x22",-1, 5, json_tokener_success, 1 },
+  // GBK encoding
+       { "\x22\xc0\xee\xc5\xf4\x22",-1, 2, json_tokener_error_parse_utf8_string, 3 },
+       { "\x22\xc0\xee\xc5\xf4\x22",-1, 6, json_tokener_success, 1 },
+  // char after space
+       { "\x20\x20\x22\xe4\xb8\x96\x22",-1, -1, json_tokener_success, 3 },
+       { "\x20\x20\x81\x22\xe4\xb8\x96\x22",-1, 2, json_tokener_error_parse_utf8_string, 3 },
+       { "\x5b\x20\x81\x31\x5d",-1, 2, json_tokener_error_parse_utf8_string, 3 },
+  // char in state inf
+       { "\x49\x6e\x66\x69\x6e\x69\x74\x79",9, 8, json_tokener_success, 1 },
+       { "\x49\x6e\x66\x81\x6e\x69\x74\x79",-1, 3, json_tokener_error_parse_utf8_string, 3 },
+  // char in escape unicode
+       { "\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22",15, 14, json_tokener_success, 3 },
+       { "\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22",-1, 8, json_tokener_error_parse_utf8_string, 3 },
+       { "\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22",-1, 9, json_tokener_error_parse_utf8_string, 3 },
+  // char in number
+       { "\x31\x31\x81\x31\x31",-1, 2, json_tokener_error_parse_utf8_string, 3 },
+  // char in object
+       { "\x7b\x22\x31\x81\x22\x3a\x31\x7d",-1, 3, json_tokener_error_parse_utf8_string, 3 },
+
         { NULL, -1, -1, json_tokener_success, 0 },
  };
  
diff --git a/tests/test_parse.expected b/tests/test_parse.expected

index af075b0368afbcd40d76c0161c4187f192b3e61c..a5c2454a9ea8df3a86342a3a15d718b121c97157 100644 (file)
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -183,5 +183,29 @@ json_tokener_parse_ex(tok, [1,2,3,]    ,   8) ... OK: got object of type [array]
  json_tokener_parse_ex(tok, [1,2,,3,]   ,   9) ... OK: got correct error: unexpected character
  json_tokener_parse_ex(tok, [1,2,3,]    ,   8) ... OK: got correct error: unexpected character
  json_tokener_parse_ex(tok, {"a":1,}    ,   8) ... OK: got correct error: unexpected character
-End Incremental Tests OK=105 ERROR=0
+json_tokener_parse_ex(tok, "123asc$%&" ,  11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "123asc$%&" ,  11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "世界"    ,   8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "ä¸         ,   3) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, \96ç\95\8c"       ,   5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "世界"    ,   8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "πφ"      ,   6) ... OK: got object of type [string]: "πφ"
+json_tokener_parse_ex(tok, "𥑕"      ,   6) ... OK: got object of type [string]: "𥑕"
+json_tokener_parse_ex(tok, "�����"     ,   7) ... OK: got object of type [string]: "�����"
+json_tokener_parse_ex(tok, "������"    ,   8) ... OK: got object of type [string]: "������"
+json_tokener_parse_ex(tok, "æ\9dN"       ,   5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "æ\9dN"       ,   5) ... OK: got object of type [string]: "æ\9dN"
+json_tokener_parse_ex(tok, "ÀîÅô"      ,   6) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "ÀîÅô"      ,   6) ... OK: got object of type [string]: "ÀîÅô"
+json_tokener_parse_ex(tok,   "世"     ,   7) ... OK: got object of type [string]: "世"
+json_tokener_parse_ex(tok,   \81"ä¸\96"    ,   8) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, [ \811]       ,   5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, Infinity    ,   9) ... OK: got object of type [double]: Infinity
+json_tokener_parse_ex(tok, Inf\81nity    ,   8) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "\ud855\udc55",  15) ... OK: got object of type [string]: "𥑕"
+json_tokener_parse_ex(tok, "\ud855Àudc55",  14) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "\ud0031À"  ,  10) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, 11\8111       ,   5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, {"1\81":1}    ,   8) ... OK: got correct error: invalid utf-8 string
+End Incremental Tests OK=129 ERROR=0
  ==================================
author	dota17 <chenguopingdota@163.com>
	Wed, 8 Jan 2020 11:42:05 +0000 (19:42 +0800)
committer	dota17 <chenguopingdota@163.com>
	Fri, 10 Jan 2020 10:33:14 +0000 (18:33 +0800)
json_tokener.c		patch \| blob \| blame \| history
json_tokener.h		patch \| blob \| blame \| history
tests/test_parse.c		patch \| blob \| blame \| history
tests/test_parse.expected		patch \| blob \| blame \| history