Fix JSON error reporting for many cases of erroneous string values.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c

index 6f0fe94d63f26bba74b6853db9e58f8801c644df..c020210e202a41ce9c7abc2c5e3e52bdb41d3a90 100644 (file)
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -750,6 +750,13 @@ json_lex(JsonLexContext *lex)
  
  /*
   * The next token in the input stream is known to be a string; lex it.
+ *
+ * If lex->strval isn't NULL, fill it with the decoded string.
+ * Set lex->token_terminator to the end of the decoded input, and in
+ * success cases, transfer its previous value to lex->prev_token_terminator.
+ *
+ * Note: be careful that all error cases advance lex->token_terminator
+ * to the point after the character we detected the error on.
   */
  static inline void
  json_lex_string(JsonLexContext *lex)
@@ -837,33 +844,42 @@ json_lex_string(JsonLexContext *lex)
                                         if (ch >= 0xd800 && ch <= 0xdbff)
                                         {
                                                 if (hi_surrogate != -1)
+                                               {
+                                                       lex->token_terminator = s + pg_mblen(s);
                                                         ereport(ERROR,
                                                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                                          errmsg("invalid input syntax for type %s",
                                                                                         "json"),
                                                                          errdetail("Unicode high surrogate must not follow a high surrogate."),
                                                                          report_json_context(lex)));
+                                               }
                                                 hi_surrogate = (ch & 0x3ff) << 10;
                                                 continue;
                                         }
                                         else if (ch >= 0xdc00 && ch <= 0xdfff)
                                         {
                                                 if (hi_surrogate == -1)
+                                               {
+                                                       lex->token_terminator = s + pg_mblen(s);
                                                         ereport(ERROR,
                                                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                                          errmsg("invalid input syntax for type %s", "json"),
                                                                          errdetail("Unicode low surrogate must follow a high surrogate."),
                                                                          report_json_context(lex)));
+                                               }
                                                 ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
                                                 hi_surrogate = -1;
                                         }
  
                                         if (hi_surrogate != -1)
+                                       {
+                                               lex->token_terminator = s + pg_mblen(s);
                                                 ereport(ERROR,
                                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                                  errmsg("invalid input syntax for type %s", "json"),
                                                                  errdetail("Unicode low surrogate must follow a high surrogate."),
                                                                  report_json_context(lex)));
+                                       }
  
                                         /*
                                          * For UTF8, replace the escape sequence by the actual
@@ -875,6 +891,7 @@ json_lex_string(JsonLexContext *lex)
                                         if (ch == 0)
                                         {
                                                 /* We can't allow this, since our TEXT type doesn't */
+                                               lex->token_terminator = s + pg_mblen(s);
                                                 ereport(ERROR,
                                                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
                                                                  errmsg("unsupported Unicode escape sequence"),
@@ -898,24 +915,27 @@ json_lex_string(JsonLexContext *lex)
                                         }
                                         else
                                         {
+                                               lex->token_terminator = s + pg_mblen(s);
                                                 ereport(ERROR,
                                                                 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
                                                                  errmsg("unsupported Unicode escape sequence"),
                                                                  errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                                                                  report_json_context(lex)));
                                         }
-
                                 }
                         }
                         else if (lex->strval != NULL)
                         {
                                 if (hi_surrogate != -1)
+                               {
+                                       lex->token_terminator = s + pg_mblen(s);
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                          errmsg("invalid input syntax for type %s",
                                                                         "json"),
                                                          errdetail("Unicode low surrogate must follow a high surrogate."),
                                                          report_json_context(lex)));
+                               }
  
                                 switch (*s)
                                 {
@@ -968,16 +988,18 @@ json_lex_string(JsonLexContext *lex)
                                                                    extract_mb_char(s)),
                                                  report_json_context(lex)));
                         }
-
                 }
                 else if (lex->strval != NULL)
                 {
                         if (hi_surrogate != -1)
+                       {
+                               lex->token_terminator = s + pg_mblen(s);
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                  errmsg("invalid input syntax for type %s", "json"),
                                                  errdetail("Unicode low surrogate must follow a high surrogate."),
                                                  report_json_context(lex)));
+                       }
  
                         appendStringInfoChar(lex->strval, *s);
                 }
@@ -985,11 +1007,14 @@ json_lex_string(JsonLexContext *lex)
         }
  
         if (hi_surrogate != -1)
+       {
+               lex->token_terminator = s + pg_mblen(s);
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                  errmsg("invalid input syntax for type %s", "json"),
                                  errdetail("Unicode low surrogate must follow a high surrogate."),
                                  report_json_context(lex)));
+       }
  
         /* Hooray, we found the end of the string! */
         lex->prev_token_terminator = lex->token_terminator;
diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out

index d8d34f4ff6a834bf194c509bfe3eca0b23458c32..3156c63c6ff4edaf61befa0c00a1e6623f4c34fc 100644 (file)
--- a/src/test/regress/expected/json_encoding.out
+++ b/src/test/regress/expected/json_encoding.out
@@ -41,19 +41,19 @@ select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
  select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
  select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
  select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  --handling of simple unicode escapes
  select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
              correct_in_utf8            
@@ -106,7 +106,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
  select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
  ERROR:  unsupported Unicode escape sequence
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
     not_an_escape    
  --------------------
@@ -144,7 +144,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT '"\u0000"'::jsonb;
                 ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
  -- use octet_length here so we don't get an odd unicode char in the
  -- output
  SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -165,25 +165,25 @@ ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                       ^
  DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
  SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
  SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  -- handling of simple unicode escapes
  SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
          correct_in_utf8        
@@ -208,7 +208,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                       ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
          not_an_escape         
  ------------------------------
@@ -238,7 +238,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                       ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
     not_an_escape    
  --------------------
diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out

index 79ed78e1c5f496e858750379aeeb41ebd743de03..d320bfa8118c534576b5dddc33da6f9900c4ad17 100644 (file)
--- a/src/test/regress/expected/json_encoding_1.out
+++ b/src/test/regress/expected/json_encoding_1.out
@@ -35,23 +35,23 @@ SELECT '"\uaBcD"'::json;            -- OK, uppercase and lower case both OK
  select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
  ERROR:  unsupported Unicode escape sequence
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
  select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
  select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
  select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
  ERROR:  invalid input syntax for type json
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  --handling of simple unicode escapes
  select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
              correct_in_utf8            
@@ -86,7 +86,7 @@ select json '{ "a":  "null \\u0000 escape" }' as not_an_escape;
  select json '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
  ERROR:  unsupported Unicode escape sequence
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
  select json '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
   correct_everywhere 
  --------------------
@@ -102,7 +102,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
  select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
  ERROR:  unsupported Unicode escape sequence
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
     not_an_escape    
  --------------------
@@ -140,7 +140,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT '"\u0000"'::jsonb;
                 ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
  -- use octet_length here so we don't get an odd unicode char in the
  -- output
  SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -148,45 +148,45 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
                              ^
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\uaBcD...
  -- handling of unicode surrogate pairs
  SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
  ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc3...
                                     ^
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
  SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                       ^
  DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
  SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
  SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
  ERROR:  invalid input syntax for type json
  LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                       ^
  DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
  -- handling of simple unicode escapes
  SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
  ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as corr...
                       ^
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
  SELECT jsonb '{ "a":  "dollar \u0024 character" }' as correct_everywhere;
       correct_everywhere      
  -----------------------------
@@ -204,7 +204,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                       ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
          not_an_escape         
  ------------------------------
@@ -216,7 +216,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a'...
                       ^
  DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
  SELECT jsonb '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
   correct_everywhere 
  --------------------
@@ -234,7 +234,7 @@ ERROR:  unsupported Unicode escape sequence
  LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                       ^
  DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
  SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
     not_an_escape    
  --------------------
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 13 Mar 2023 19:19:00 +0000 (15:19 -0400)
src/backend/utils/adt/json.c		patch | blob | blame | history
src/test/regress/expected/json_encoding.out		patch | blob | blame | history
src/test/regress/expected/json_encoding_1.out		patch | blob | blame | history