Apply encoding conversion in COPY TO FORMAT JSON

author Andrew Dunstan <andrew@dunslane.net>

Mon, 20 Apr 2026 05:46:36 +0000 (11:16 +0530)

committer Andrew Dunstan <andrew@dunslane.net>

Sat, 30 May 2026 01:48:45 +0000 (21:48 -0400)
author Andrew Dunstan <andrew@dunslane.net>
Mon, 20 Apr 2026 05:46:36 +0000 (11:16 +0530)
committer Andrew Dunstan <andrew@dunslane.net>
Sat, 30 May 2026 01:48:45 +0000 (21:48 -0400)
diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c

index ffed63a2986bb814fe26709de7ca21065400d412..6755bb698de738c9715cd73ad6ef809a11cbacf8 100644 (file)
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -427,7 +427,25 @@ CopyToJsonOneRow(CopyToState cstate, TupleTableSlot *slot)
                 }
         }
  
-       CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
+       /*
+        * Convert the JSON output to the target encoding if needed.  Unlike the
+        * text and CSV paths, which convert per-attribute via CopyAttributeOut*,
+        * composite_to_json() emits the whole row as one buffer, so we transcode
+        * it here in a single call before sending.
+        */
+       if (cstate->need_transcoding)
+       {
+               char       *converted;
+
+               converted = pg_server_to_any(cstate->json_buf->data,
+                                                                        cstate->json_buf->len,
+                                                                        cstate->file_encoding);
+               CopySendData(cstate, converted, strlen(converted));
+               if (converted != cstate->json_buf->data)
+                       pfree(converted);
+       }
+       else
+               CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
  
         CopySendTextLikeEndOfRow(cstate);
  }
diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out

index 76ea0e7cf04b29b5cf1e82192a7b44a8613dafa8..1e7ebc7d0e6048706a9db002068517ba00ac1d7b 100644 (file)
--- a/src/test/regress/expected/copyencoding.out
+++ b/src/test/regress/expected/copyencoding.out
@@ -50,4 +50,31 @@ COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
  ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
  CONTEXT:  COPY copy_encoding_tab, line 1
  RESET client_encoding;
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea  IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+ has_latin1_e9 | has_utf8_e9 
+---------------+-------------
+ t             | f
+(1 row)
+
+-- Same with implicit encoding inherited from client_encoding (no ENCODING
+-- option).  Covers the case where a client with a non-UTF8 client_encoding
+-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
+\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
+SET client_encoding TO LATIN1;
+COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
+RESET client_encoding;
+SELECT position('\xe9'::bytea  IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
+ has_latin1_e9 | has_utf8_e9 
+---------------+-------------
+ t             | f
+(1 row)
+
  DROP TABLE copy_encoding_tab;
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql

index 64718245b94f10add0c174f7e3f0e641a365db1b..c8905da556789eb8c0f4fa67df2e8c12aa5d9f87 100644 (file)
--- a/src/test/regress/sql/copyencoding.sql
+++ b/src/test/regress/sql/copyencoding.sql
@@ -57,4 +57,23 @@ SET client_encoding TO EUC_JP;
  COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
  RESET client_encoding;
  
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea  IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+
+-- Same with implicit encoding inherited from client_encoding (no ENCODING
+-- option).  Covers the case where a client with a non-UTF8 client_encoding
+-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
+\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
+SET client_encoding TO LATIN1;
+COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
+RESET client_encoding;
+SELECT position('\xe9'::bytea  IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
+
  DROP TABLE copy_encoding_tab;
author	Andrew Dunstan <andrew@dunslane.net>
	Mon, 20 Apr 2026 05:46:36 +0000 (11:16 +0530)
committer	Andrew Dunstan <andrew@dunslane.net>
	Sat, 30 May 2026 01:48:45 +0000 (21:48 -0400)
src/backend/commands/copyto.c		patch | blob | blame | history
src/test/regress/expected/copyencoding.out		patch | blob | blame | history
src/test/regress/sql/copyencoding.sql		patch | blob | blame | history