From: Andrew Dunstan
Date: Mon, 20 Apr 2026 05:46:36 +0000 (+0530)
Subject: Apply encoding conversion in COPY TO FORMAT JSON
X-Git-Tag: REL_19_BETA1~8
X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=7dc5bbcf220be05786a08aad2455c61b6dbd4b78;p=thirdparty%2Fpostgresql.git

Apply encoding conversion in COPY TO FORMAT JSON

CopyToJsonOneRow() sent the output of composite_to_json() directly via
CopySendData() without encoding conversion. The text and CSV paths
convert per-attribute via pg_server_to_any() when need_transcoding is
true, but the JSON path skipped this entirely.

This meant COPY ... TO ... WITH (FORMAT json, ENCODING 'LATIN1') on a
UTF-8 server silently produced UTF-8 output, and COPY TO STDOUT with a
non-UTF-8 client_encoding would send unconverted bytes to the client.

Apply pg_server_to_any() to the whole JSON buffer after
composite_to_json() returns, converting to the requested file encoding
when it differs from the server encoding.

Tests cover both the explicit ENCODING option and the implicit case
where file_encoding is inherited from client_encoding.

Introduced by 7dadd38cda9 (json format for COPY TO).

Author: Ayush Tiwari
Reviewed-by: Andrew Dunstan
Discussion: https://postgr.es/m/CAJTYsWX-jsLzxGRAb-dWnEpGYRPbDYHwce8LctVE92LiDfM2Jw@mail.gmail.com
---

diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index ffed63a2986..6755bb698de 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -427,7 +427,25 @@ CopyToJsonOneRow(CopyToState cstate, TupleTableSlot *slot)
 		}
 	}
 
-	CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
+	/*
+	 * Convert the JSON output to the target encoding if needed. Unlike the
+	 * text and CSV paths which convert per-attribute via CopyAttributeOut*,
+	 * composite_to_json() emits the whole row as one buffer, so we transcode
+	 * it here in a single call before sending.
+	 */
+	if (cstate->need_transcoding)
+	{
+		char *converted;
+
+		converted = pg_server_to_any(cstate->json_buf->data,
+									 cstate->json_buf->len,
+									 cstate->file_encoding);
+		CopySendData(cstate, converted, strlen(converted));
+		if (converted != cstate->json_buf->data)
+			pfree(converted);
+	}
+	else
+		CopySendData(cstate, cstate->json_buf->data, cstate->json_buf->len);
 
 	CopySendTextLikeEndOfRow(cstate);
 }
diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out
index 76ea0e7cf04..1e7ebc7d0e6 100644
--- a/src/test/regress/expected/copyencoding.out
+++ b/src/test/regress/expected/copyencoding.out
@@ -50,4 +50,31 @@ COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
 ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
 CONTEXT:  COPY copy_encoding_tab, line 1
 RESET client_encoding;
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+ has_latin1_e9 | has_utf8_e9 
+---------------+-------------
+ t             | f
+(1 row)
+
+-- Same with implicit encoding inherited from client_encoding (no ENCODING
+-- option). Covers the case where a client with a non-UTF8 client_encoding
+-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
+\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
+SET client_encoding TO LATIN1;
+COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
+RESET client_encoding;
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
+ has_latin1_e9 | has_utf8_e9 
+---------------+-------------
+ t             | f
+(1 row)
+
 DROP TABLE copy_encoding_tab;
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql
index 64718245b94..c8905da5567 100644
--- a/src/test/regress/sql/copyencoding.sql
+++ b/src/test/regress/sql/copyencoding.sql
@@ -57,4 +57,23 @@ SET client_encoding TO EUC_JP;
 COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
 RESET client_encoding;
 
+-- JSON format encoding conversion
+\set json_latin1 :abs_builddir '/results/copyencoding_json_latin1.json'
+COPY (SELECT E'\u00e9' AS c) TO :'json_latin1' WITH (FORMAT json, ENCODING 'LATIN1');
+-- Verify the file contains LATIN1 'é' (single byte 0xe9) and not UTF-8 (0xc3 0xa9).
+-- Done as separate position checks to stay independent of the platform's
+-- end-of-line convention.
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_latin1')) > 0 AS has_utf8_e9;
+
+-- Same with implicit encoding inherited from client_encoding (no ENCODING
+-- option). Covers the case where a client with a non-UTF8 client_encoding
+-- runs COPY ... FORMAT json and would otherwise receive unconverted bytes.
+\set json_implicit :abs_builddir '/results/copyencoding_json_implicit_latin1.json'
+SET client_encoding TO LATIN1;
+COPY (SELECT E'\u00e9' AS c) TO :'json_implicit' WITH (FORMAT json);
+RESET client_encoding;
+SELECT position('\xe9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_latin1_e9,
+       position('\xc3a9'::bytea IN pg_read_binary_file(:'json_implicit')) > 0 AS has_utf8_e9;
+
 DROP TABLE copy_encoding_tab;