Code coverage for most pg_mblen* calls.

author Thomas Munro <tmunro@postgresql.org>

Sun, 11 Jan 2026 21:20:06 +0000 (10:20 +1300)

committer Thomas Munro <tmunro@postgresql.org>

Sun, 8 Feb 2026 23:44:12 +0000 (12:44 +1300)
author Thomas Munro <tmunro@postgresql.org>
Sun, 11 Jan 2026 21:20:06 +0000 (10:20 +1300)
committer Thomas Munro <tmunro@postgresql.org>
Sun, 8 Feb 2026 23:44:12 +0000 (12:44 +1300)
diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile

index 1fbdc9ec1ef43c0dda3b5fca12d14b731c2fa996..c1756993ec7ba1bf5f83d73d633d22d0c97e79a9 100644 (file)
--- a/contrib/pg_trgm/Makefile
+++ b/contrib/pg_trgm/Makefile
@@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \
         pg_trgm--1.0--1.1.sql
  PGFILEDESC = "pg_trgm - trigram matching"
  
-REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
+REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm
  
  ifdef USE_PGXS
  PG_CONFIG = pg_config
diff --git a/contrib/pg_trgm/data/trgm_utf8.data b/contrib/pg_trgm/data/trgm_utf8.data

new file mode 100644 (file)

index 0000000..713856e
--- /dev/null
+++ b/contrib/pg_trgm/data/trgm_utf8.data
@@ -0,0 +1,50 @@
+Mathematics
+数学
+गणित
+Matemáticas
+رياضيات
+Mathématiques
+গণিত
+Matemática
+Математика
+ریاضی
+Matematika
+Mathematik
+数学
+Mathematics
+गणित
+గణితం
+Matematik
+கணிதம்
+數學
+Toán học
+Matematika
+数学
+수학
+ریاضی
+Lissafi
+Hisabati
+Matematika
+Matematica
+ریاضی
+ಗಣಿತ
+ગણિત
+คณิตศาสตร์
+ሂሳብ
+गणित
+ਗਣਿਤ
+數學
+数学
+Iṣiro
+數學
+သင်္ချာ
+Herrega
+رياضي
+गणित
+Математика
+Matematyka
+ഗണിതം
+Matematika
+رياضي
+Matematika
+Matematică
diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out

new file mode 100644 (file)

index 0000000..0768e7d
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out
@@ -0,0 +1,8 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Index 50 translations of the word "Mathematics"
+CREATE TEMP TABLE mb (s text);
+\copy mb from 'data/trgm_utf8.data'
+CREATE INDEX ON mb USING gist(s gist_trgm_ops);
diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out

new file mode 100644 (file)

index 0000000..8505c4f
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out
@@ -0,0 +1,3 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build

index 3cc299d5eaa3f44e10184ff204d93950fd94a15f..3ecf95ba862e9ca5045f8f19be7ca75c4c1b16fa 100644 (file)
--- a/contrib/pg_trgm/meson.build
+++ b/contrib/pg_trgm/meson.build
@@ -39,6 +39,7 @@ tests += {
    'regress': {
      'sql': [
        'pg_trgm',
+      'pg_utf8_trgm',
        'pg_word_trgm',
        'pg_strict_word_trgm',
      ],
diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql

new file mode 100644 (file)

index 0000000..0dd962c
--- /dev/null
+++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql
@@ -0,0 +1,9 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Index 50 translations of the word "Mathematics"
+CREATE TEMP TABLE mb (s text);
+\copy mb from 'data/trgm_utf8.data'
+CREATE INDEX ON mb USING gist(s gist_trgm_ops);
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c

index da68915ee20aa3c05ecb994a8c7f86582d3bb734..734e5fea45e4b1d33946fd03c5f096eeef37b0e4 100644 (file)
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -3736,6 +3736,12 @@ deconstruct_array_builtin(const ArrayType *array,
                         elmalign = TYPALIGN_SHORT;
                         break;
  
+               case INT4OID:
+                       elmlen = sizeof(int32);
+                       elmbyval = true;
+                       elmalign = TYPALIGN_INT;
+                       break;
+
                 case OIDOID:
                         elmlen = sizeof(Oid);
                         elmbyval = true;
diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out

index cfa2ed6df0081c3002d7c9c08a27edc62c867d84..76ea0e7cf04b29b5cf1e82192a7b44a8613dafa8 100644 (file)
--- a/src/test/regress/expected/copyencoding.out
+++ b/src/test/regress/expected/copyencoding.out
@@ -17,6 +17,13 @@ CREATE TABLE copy_encoding_tab (t text);
  COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
  -- Read UTF8 data as LATIN1: no error
  COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Non-server encodings have distinct code paths.
+\set fname :abs_builddir '/results/copyencoding_gb18030.csv'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+\set fname :abs_builddir '/results/copyencoding_gb18030.data'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030');
  -- Use client_encoding
  SET client_encoding TO UTF8;
  -- U+3042 HIRAGANA LETTER A
diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out

new file mode 100644 (file)

index 0000000..ea1f38c
--- /dev/null
+++ b/src/test/regress/expected/encoding.out
@@ -0,0 +1,401 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+\getenv libdir PG_LIBDIR
+\getenv dlsuffix PG_DLSUFFIX
+\set regresslib :libdir '/regress' :dlsuffix
+CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
+INSERT INTO regress_encoding
+VALUES ('café',
+        'caf' || test_bytea_to_text('\xc3'),
+        'café' || test_bytea_to_text('\x00') || 'dcba',
+        'caf' || test_bytea_to_text('\xc300') || 'dcba');
+SELECT good, truncated, with_nul FROM regress_encoding;
+ good | truncated | with_nul 
+------+-----------+----------
+ café | caf       | café
+(1 row)
+
+SELECT length(good) FROM regress_encoding;
+ length 
+--------
+      4
+(1 row)
+
+SELECT substring(good, 3, 1) FROM regress_encoding;
+ substring 
+-----------
+ f
+(1 row)
+
+SELECT substring(good, 4, 1) FROM regress_encoding;
+ substring 
+-----------
+ é
+(1 row)
+
+SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace 
+----------------
+ é
+(1 row)
+
+SELECT reverse(good) FROM regress_encoding;
+ reverse 
+---------
+ éfac
+(1 row)
+
+-- invalid short mb character = error
+SELECT length(truncated) FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+SELECT substring(truncated, 1, 1) FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+SELECT reverse(truncated) FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+-- invalid short mb character = silently dropped
+SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace 
+----------------
+ caf
+(1 row)
+
+-- PostgreSQL doesn't allow strings to contain NUL.  If a corrupted string
+-- contains NUL at a character boundary position, some functions treat it as a
+-- character while others treat it as a terminator, as implementation details.
+-- NUL = terminator
+SELECT length(with_nul) FROM regress_encoding;
+ length 
+--------
+      4
+(1 row)
+
+SELECT substring(with_nul, 3, 1) FROM regress_encoding;
+ substring 
+-----------
+ f
+(1 row)
+
+SELECT substring(with_nul, 4, 1) FROM regress_encoding;
+ substring 
+-----------
+ é
+(1 row)
+
+SELECT substring(with_nul, 5, 1) FROM regress_encoding;
+ substring 
+-----------
+ 
+(1 row)
+
+SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
+ convert_to 
+------------
+ \x
+(1 row)
+
+SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace 
+----------------
+ é
+(1 row)
+
+-- NUL = character
+SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
+ with_nul | reverse | reverse 
+----------+---------+---------
+ café     | abcd    | café
+(1 row)
+
+-- If a corrupted string contains NUL in the tail bytes of a multibyte
+-- character (invalid in all encodings), it is considered part of the
+-- character for length purposes.  An error will only be raised in code paths
+-- that convert or verify encodings.
+SELECT length(truncated_with_nul) FROM regress_encoding;
+ length 
+--------
+      8
+(1 row)
+
+SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
+ substring 
+-----------
+ f
+(1 row)
+
+SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
+ substring 
+-----------
+ 
+(1 row)
+
+SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3 0x00
+SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
+ substring 
+-----------
+ d
+(1 row)
+
+SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT reverse(truncated_with_nul) FROM regress_encoding;
+ reverse 
+---------
+ abcd
+(1 row)
+
+-- unbounded: sequence would overrun the string!
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ test_mblen_func 
+-----------------
+               2
+(1 row)
+
+-- condition detected when using the length/range variants
+SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+-- unbounded: sequence would overrun the string, if the terminator were really
+-- the end of it
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+ test_mblen_func 
+-----------------
+               2
+(1 row)
+
+SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
+FROM regress_encoding;
+ test_mblen_func 
+-----------------
+               2
+(1 row)
+
+-- condition detected when using the cstr variants
+SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
+DROP TABLE regress_encoding;
+-- mb<->wchar conversions
+CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
+RETURNS VOID LANGUAGE plpgsql AS
+$$
+DECLARE
+       prefix text;
+       len int;
+       wchars int[];
+       round_trip bytea;
+       result text;
+BEGIN
+       prefix := rpad(encoding || ' ' || description || ':', 28);
+
+       -- XXX could also test validation, length functions and include client
+       -- only encodings with these test cases
+
+       IF test_valid_server_encoding(encoding) THEN
+               wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
+               round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
+               if input = round_trip then
+                       result := 'OK';
+               elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
+                       result := 'truncated';
+               else
+                       result := 'failed';
+               end if;
+               RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
+       END IF;
+END;
+$$;
+-- No validation is done on the encoding itself, just the length to avoid
+-- overruns, so some of the byte sequences below are bogus.  They cover
+-- all code branches, server encodings only for now.
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+       -- LATIN1, other single-byte encodings
+       ('LATIN1', 'ASCII',    'a'),
+       ('LATIN1', 'extended', '\xe9'),
+       -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion):
+       -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
+       -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
+       -- 2 80..ff (CS1)
+       ('EUC_JP', 'ASCII',      'a'),
+       ('EUC_JP', 'CS1, short', '\x80'),
+       ('EUC_JP', 'CS1',        '\x8002'),
+       ('EUC_JP', 'CS2, short', '\x8e'),
+       ('EUC_JP', 'CS2',        '\x8e02'),
+       ('EUC_JP', 'CS3, short', '\x8f'),
+       ('EUC_JP', 'CS3, short', '\x8f02'),
+       ('EUC_JP', 'CS3',        '\x8f0203'),
+       -- EUC_CN
+       -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+       -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+       -- 2 80..ff (CS1)
+       ('EUC_CN', 'ASCII',      'a'),
+       ('EUC_CN', 'CS1, short', '\x80'),
+       ('EUC_CN', 'CS1',        '\x8002'),
+       ('EUC_CN', 'CS2, short', '\x8e'),
+       ('EUC_CN', 'CS2, short', '\x8e02'),
+       ('EUC_CN', 'CS2',        '\x8e0203'),
+       ('EUC_CN', 'CS3, short', '\x8f'),
+       ('EUC_CN', 'CS3, short', '\x8f02'),
+       ('EUC_CN', 'CS3',        '\x8f0203'),
+       -- EUC_TW:
+       -- 4 8e (CS2)
+       -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+       -- 2 80..ff (CS1)
+       ('EUC_TW', 'ASCII',      'a'),
+       ('EUC_TW', 'CS1, short', '\x80'),
+       ('EUC_TW', 'CS1',        '\x8002'),
+       ('EUC_TW', 'CS2, short', '\x8e'),
+       ('EUC_TW', 'CS2, short', '\x8e02'),
+       ('EUC_TW', 'CS2, short', '\x8e0203'),
+       ('EUC_TW', 'CS2',        '\x8e020304'),
+       ('EUC_TW', 'CS3, short', '\x8f'),
+       ('EUC_TW', 'CS3, short', '\x8f02'),
+       ('EUC_TW', 'CS3',        '\x8f0203'),
+       -- UTF8
+       -- 2 c0..df
+       -- 3 e0..ef
+       -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+       -- 5 f8..fb (not supported)
+       -- 6 fc..fd (not supported)
+       ('UTF8',   'ASCII',               'a'),
+       ('UTF8',   '2 byte, short',       '\xdf'),
+       ('UTF8',   '2 byte',              '\xdf82'),
+       ('UTF8',   '3 byte, short',       '\xef'),
+       ('UTF8',   '3 byte, short',       '\xef82'),
+       ('UTF8',   '3 byte',              '\xef8283'),
+       ('UTF8',   '4 byte, short',       '\xf7'),
+       ('UTF8',   '4 byte, short',       '\xf782'),
+       ('UTF8',   '4 byte, short',       '\xf78283'),
+       ('UTF8',   '4 byte',              '\xf7828384'),
+       ('UTF8',   '5 byte, unsupported', '\xfb'),
+       ('UTF8',   '5 byte, unsupported', '\xfb82'),
+       ('UTF8',   '5 byte, unsupported', '\xfb8283'),
+       ('UTF8',   '5 byte, unsupported', '\xfb828384'),
+       ('UTF8',   '5 byte, unsupported', '\xfb82838485'),
+       ('UTF8',   '6 byte, unsupported', '\xfd'),
+       ('UTF8',   '6 byte, unsupported', '\xfd82'),
+       ('UTF8',   '6 byte, unsupported', '\xfd8283'),
+       ('UTF8',   '6 byte, unsupported', '\xfd828384'),
+       ('UTF8',   '6 byte, unsupported', '\xfd82838485'),
+       ('UTF8',   '6 byte, unsupported', '\xfd8283848586'),
+       -- MULE_INTERNAL
+       -- 2 81..8d LC1
+       -- 3 90..99 LC2
+       ('MULE_INTERNAL', 'ASCII',         'a'),
+       ('MULE_INTERNAL', 'LC1, short',    '\x81'),
+       ('MULE_INTERNAL', 'LC1',           '\x8182'),
+       ('MULE_INTERNAL', 'LC2, short',    '\x90'),
+       ('MULE_INTERNAL', 'LC2, short',    '\x9082'),
+       ('MULE_INTERNAL', 'LC2',           '\x908283');
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+NOTICE:  LATIN1 ASCII:                \x61 -> {97} -> \x61 = OK
+NOTICE:  LATIN1 extended:             \xe9 -> {233} -> \xe9 = OK
+NOTICE:  EUC_JP ASCII:                \x61 -> {97} -> \x61 = OK
+NOTICE:  EUC_JP CS1, short:           \x80 -> {} -> \x = truncated
+NOTICE:  EUC_JP CS1:                  \x8002 -> {32770} -> \x8002 = OK
+NOTICE:  EUC_JP CS2, short:           \x8e -> {} -> \x = truncated
+NOTICE:  EUC_JP CS2:                  \x8e02 -> {36354} -> \x8e02 = OK
+NOTICE:  EUC_JP CS3, short:           \x8f -> {} -> \x = truncated
+NOTICE:  EUC_JP CS3, short:           \x8f02 -> {} -> \x = truncated
+NOTICE:  EUC_JP CS3:                  \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE:  EUC_CN ASCII:                \x61 -> {97} -> \x61 = OK
+NOTICE:  EUC_CN CS1, short:           \x80 -> {} -> \x = truncated
+NOTICE:  EUC_CN CS1:                  \x8002 -> {32770} -> \x8002 = OK
+NOTICE:  EUC_CN CS2, short:           \x8e -> {} -> \x = truncated
+NOTICE:  EUC_CN CS2, short:           \x8e02 -> {} -> \x = truncated
+NOTICE:  EUC_CN CS2:                  \x8e0203 -> {9306627} -> \x8e0203 = OK
+NOTICE:  EUC_CN CS3, short:           \x8f -> {} -> \x = truncated
+NOTICE:  EUC_CN CS3, short:           \x8f02 -> {} -> \x = truncated
+NOTICE:  EUC_CN CS3:                  \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE:  EUC_TW ASCII:                \x61 -> {97} -> \x61 = OK
+NOTICE:  EUC_TW CS1, short:           \x80 -> {} -> \x = truncated
+NOTICE:  EUC_TW CS1:                  \x8002 -> {32770} -> \x8002 = OK
+NOTICE:  EUC_TW CS2, short:           \x8e -> {} -> \x = truncated
+NOTICE:  EUC_TW CS2, short:           \x8e02 -> {} -> \x = truncated
+NOTICE:  EUC_TW CS2, short:           \x8e0203 -> {} -> \x = truncated
+NOTICE:  EUC_TW CS2:                  \x8e020304 -> {-1912470780} -> \x8e020304 = OK
+NOTICE:  EUC_TW CS3, short:           \x8f -> {} -> \x = truncated
+NOTICE:  EUC_TW CS3, short:           \x8f02 -> {} -> \x = truncated
+NOTICE:  EUC_TW CS3:                  \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE:  UTF8 ASCII:                  \x61 -> {97} -> \x61 = OK
+NOTICE:  UTF8 2 byte, short:          \xdf -> {} -> \x = truncated
+NOTICE:  UTF8 2 byte:                 \xdf82 -> {1986} -> \xdf82 = OK
+NOTICE:  UTF8 3 byte, short:          \xef -> {} -> \x = truncated
+NOTICE:  UTF8 3 byte, short:          \xef82 -> {} -> \x = truncated
+NOTICE:  UTF8 3 byte:                 \xef8283 -> {61571} -> \xef8283 = OK
+NOTICE:  UTF8 4 byte, short:          \xf7 -> {} -> \x = truncated
+NOTICE:  UTF8 4 byte, short:          \xf782 -> {} -> \x = truncated
+NOTICE:  UTF8 4 byte, short:          \xf78283 -> {} -> \x = truncated
+NOTICE:  UTF8 4 byte:                 \xf7828384 -> {1843396} -> \xf7828384 = OK
+NOTICE:  UTF8 5 byte, unsupported:    \xfb -> {251} -> \xc3bb = failed
+NOTICE:  UTF8 5 byte, unsupported:    \xfb82 -> {251,130} -> \xc3bbc282 = failed
+NOTICE:  UTF8 5 byte, unsupported:    \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed
+NOTICE:  UTF8 5 byte, unsupported:    \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed
+NOTICE:  UTF8 5 byte, unsupported:    \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd -> {253} -> \xc3bd = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd82 -> {253,130} -> \xc3bdc282 = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed
+NOTICE:  UTF8 6 byte, unsupported:    \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed
+NOTICE:  MULE_INTERNAL ASCII:         \x61 -> {97} -> \x61 = OK
+NOTICE:  MULE_INTERNAL LC1, short:    \x81 -> {} -> \x = truncated
+NOTICE:  MULE_INTERNAL LC1:           \x8182 -> {8454274} -> \x8182 = OK
+NOTICE:  MULE_INTERNAL LC2, short:    \x90 -> {} -> \x = truncated
+NOTICE:  MULE_INTERNAL LC2, short:    \x9082 -> {} -> \x = truncated
+NOTICE:  MULE_INTERNAL LC2:           \x908283 -> {9470595} -> \x908283 = OK
+ ?column? 
+----------
+ t
+(1 row)
+
+DROP TABLE encoding_tests;
+DROP FUNCTION test_encoding;
+DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_mblen_func;
+DROP FUNCTION test_bytea_to_text;
+DROP FUNCTION test_text_to_bytea;
+-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
+ substring 
+-----------
+ 
+(1 row)
+
+-- Levenshtein distance metric: exercise character length cache.
+SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+ERROR:  column "real§_name" does not exist
+LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+               ^
+HINT:  Perhaps you meant to reference the column "x.real_name".
+-- JSON errcontext: truncate long data.
+SELECT repeat(U&'\00A7', 30)::json;
+ERROR:  invalid input syntax for type json
+DETAIL:  Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid.
+CONTEXT:  JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§
diff --git a/src/test/regress/expected/encoding_1.out b/src/test/regress/expected/encoding_1.out

new file mode 100644 (file)

index 0000000..a5b0209
--- /dev/null
+++ b/src/test/regress/expected/encoding_1.out
@@ -0,0 +1,4 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out

new file mode 100644 (file)

index 0000000..7a61c89
--- /dev/null
+++ b/src/test/regress/expected/euc_kr.out
@@ -0,0 +1,16 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01.  Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
+SELECT POSITION(
+       convert_from('\xbcf6c7d0', 'EUC_KR') IN
+       convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
+ position 
+----------
+        5
+(1 row)
+
diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out

new file mode 100644 (file)

index 0000000..faaac5d
--- /dev/null
+++ b/src/test/regress/expected/euc_kr_1.out
@@ -0,0 +1,6 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01.  Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule

index 021d57f66bbd19858a2ccc2067ceec47a6be8e41..549e9b2d7be4aaabbf2c83c5e7688c84522edd72 100644 (file)
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
  # geometry depends on point, lseg, line, box, path, polygon, circle
  # horology depends on date, time, timetz, timestamp, timestamptz, interval
  # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr
  
  # ----------
  # Load huge amounts of data
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c

index ce5f5f9eb196b0fd87bd1330547647b76df643fa..bea858f03c19a26b1cb327af89d5cccf6580e70a 100644 (file)
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1115,6 +1115,145 @@ test_enc_conversion(PG_FUNCTION_ARGS)
         PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
  }
  
+/*
+ * test_bytea_to_text
+ *
+ * Hand the bytea argument back relabelled as text.  Nothing here validates
+ * the bytes, which lets SQL-level tests build deliberately corrupt strings.
+ */
+PG_FUNCTION_INFO_V1(test_bytea_to_text);
+Datum
+test_bytea_to_text(PG_FUNCTION_ARGS)
+{
+       bytea      *raw = PG_GETARG_BYTEA_PP(0);
+
+       PG_RETURN_TEXT_P(raw);
+}
+
+/*
+ * test_text_to_bytea
+ *
+ * Hand the text argument back relabelled as bytea, so that SQL tests can
+ * inspect the exact bytes of a (possibly corrupt) string.
+ */
+PG_FUNCTION_INFO_V1(test_text_to_bytea);
+Datum
+test_text_to_bytea(PG_FUNCTION_ARGS)
+{
+       text       *str = PG_GETARG_TEXT_PP(0);
+
+       PG_RETURN_BYTEA_P(str);
+}
+
+/*
+ * test_mblen_func
+ *
+ * Corruption tests in C: call one member of the pg_mblen* family, selected
+ * by name, on the bytes of a text value starting at a given byte offset, and
+ * return the character length it reports.
+ *
+ * Arguments: function name, encoding name (consulted only when the function
+ * is "pg_encoding_mblen"), the string to examine, and the byte offset.
+ *
+ * The string may have been corrupted on purpose by the caller, so the
+ * selected function may raise an error instead of returning.
+ */
+PG_FUNCTION_INFO_V1(test_mblen_func);
+Datum
+test_mblen_func(PG_FUNCTION_ARGS)
+{
+       const char *func = text_to_cstring(PG_GETARG_TEXT_PP(0));
+       const char *encoding = text_to_cstring(PG_GETARG_TEXT_PP(1));
+       text       *string = PG_GETARG_TEXT_PP(2);
+       int                     offset = PG_GETARG_INT32(3);
+       const char *data = VARDATA_ANY(string);
+       size_t          size = VARSIZE_ANY_EXHDR(string);
+       int                     result = 0;
+
+       /*
+        * The offset comes straight from SQL.  Insist that it designates a byte
+        * inside the value, so that we never form a pointer outside it and
+        * "size - offset" below cannot wrap around.
+        */
+       if (offset < 0 || (size_t) offset >= size)
+               elog(ERROR, "offset %d is out of range for a string of %zu bytes",
+                        offset, size);
+
+       if (strcmp(func, "pg_mblen_unbounded") == 0)
+               result = pg_mblen_unbounded(data + offset);
+       else if (strcmp(func, "pg_mblen_cstr") == 0)
+               result = pg_mblen_cstr(data + offset);
+       else if (strcmp(func, "pg_mblen_with_len") == 0)
+               result = pg_mblen_with_len(data + offset, size - offset);
+       else if (strcmp(func, "pg_mblen_range") == 0)
+               result = pg_mblen_range(data + offset, data + size);
+       else if (strcmp(func, "pg_encoding_mblen") == 0)
+       {
+               int                     enc = pg_char_to_encoding(encoding);
+
+               /* Match the sibling helpers: complain about unknown names. */
+               if (enc < 0)
+                       elog(ERROR, "unknown encoding name: %s", encoding);
+               result = pg_encoding_mblen(enc, data + offset);
+       }
+       else
+               elog(ERROR, "unknown function");
+
+       PG_RETURN_INT32(result);
+}
+
+/*
+ * test_text_to_wchars
+ *
+ * Convert the bytes of a text value to pg_wchar with the named encoding's
+ * multibyte-to-wchar routine, and return the wide characters as an int4
+ * array.  No validation of the input bytes is done here, so callers may
+ * pass bogus sequences.
+ *
+ * Arguments: encoding name, string to convert.  An empty string yields an
+ * empty array.
+ */
+PG_FUNCTION_INFO_V1(test_text_to_wchars);
+Datum
+test_text_to_wchars(PG_FUNCTION_ARGS)
+{
+       const char *encoding_name = text_to_cstring(PG_GETARG_TEXT_PP(0));
+       text       *string = PG_GETARG_TEXT_PP(1);
+       const char *data = VARDATA_ANY(string);
+       size_t          size = VARSIZE_ANY_EXHDR(string);
+       Datum      *datums = NULL;
+       int                     wlen = 0;
+       int                     encoding;
+
+       /*
+        * Accept server encodings only.  NOTE(review): client-only encodings are
+        * believed to have no wchar conversion routines, in which case letting
+        * one through would call a null function pointer.
+        */
+       encoding = pg_valid_server_encoding(encoding_name);
+       if (encoding < 0)
+               elog(ERROR, "not a valid server encoding name: %s", encoding_name);
+
+       if (size > 0)
+       {
+               /* One wide character per input byte at most, plus a terminator. */
+               pg_wchar   *wchars = palloc(sizeof(pg_wchar) * (size + 1));
+
+               datums = palloc(sizeof(Datum) * size);
+               wlen = pg_encoding_mb2wchar_with_len(encoding,
+                                                                                        data,
+                                                                                        wchars,
+                                                                                        size);
+               Assert(wlen >= 0);
+               Assert((size_t) wlen <= size);
+               Assert(wchars[wlen] == 0);
+
+               /* The array element type is int4, so build int4 datums. */
+               for (int i = 0; i < wlen; ++i)
+                       datums[i] = Int32GetDatum((int32) wchars[i]);
+       }
+
+       PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID));
+}
+
+/*
+ * test_wchars_to_text
+ *
+ * Inverse of test_text_to_wchars: convert an int4 array of wide characters
+ * to multibyte form with the named encoding's wchar-to-multibyte routine,
+ * and return the bytes as text.  The result is not validated.
+ *
+ * Arguments: encoding name, array of wide characters.  NULL array elements
+ * raise an error; an empty array yields an empty string.
+ */
+PG_FUNCTION_INFO_V1(test_wchars_to_text);
+Datum
+test_wchars_to_text(PG_FUNCTION_ARGS)
+{
+       const char *encoding_name = text_to_cstring(PG_GETARG_TEXT_PP(0));
+       ArrayType  *array = PG_GETARG_ARRAYTYPE_P(1);
+       Datum      *datums;
+       bool       *nulls;
+       text       *result;
+       int                     wlen;
+       int                     encoding;
+
+       /*
+        * Accept server encodings only.  NOTE(review): client-only encodings are
+        * believed to have no wchar conversion routines, in which case letting
+        * one through would call a null function pointer.
+        */
+       encoding = pg_valid_server_encoding(encoding_name);
+       if (encoding < 0)
+               elog(ERROR, "not a valid server encoding name: %s", encoding_name);
+
+       deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen);
+
+       if (wlen > 0)
+       {
+               pg_wchar   *wchars = palloc(sizeof(pg_wchar) * wlen);
+               char       *mb;
+               int                     bytes;
+
+               for (int i = 0; i < wlen; ++i)
+               {
+                       if (nulls[i])
+                               elog(ERROR, "unexpected NULL in array");
+                       wchars[i] = (pg_wchar) DatumGetInt32(datums[i]);
+               }
+
+               /*
+                * Size for the worst case of every character having the encoding's
+                * maximum length, plus one spare byte (presumably for a terminator
+                * written by the conversion routine).
+                */
+               mb = palloc(pg_encoding_max_length(encoding) * wlen + 1);
+               bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen);
+               result = cstring_to_text_with_len(mb, bytes);
+       }
+       else
+               result = cstring_to_text_with_len("", 0);
+
+       PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * test_valid_server_encoding
+ *
+ * Report whether the given name is accepted by pg_valid_server_encoding(),
+ * as a SQL boolean.
+ */
+PG_FUNCTION_INFO_V1(test_valid_server_encoding);
+Datum
+test_valid_server_encoding(PG_FUNCTION_ARGS)
+{
+       const char *encoding_name = text_to_cstring(PG_GETARG_TEXT_PP(0));
+
+       /*
+        * pg_valid_server_encoding() returns an encoding ID, or -1 on failure.
+        * Handing that int back as the Datum of a boolean would make -1 read as
+        * true and ID 0 read as false, so convert it to a real boolean.
+        */
+       PG_RETURN_BOOL(pg_valid_server_encoding(encoding_name) >= 0);
+}
+
  /* Provide SQL access to IsBinaryCoercible() */
  PG_FUNCTION_INFO_V1(binary_coercible);
  Datum
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql

index 4e96a4d6505cf36831f09448987a75d3eafd4dc5..64718245b94f10add0c174f7e3f0e641a365db1b 100644 (file)
--- a/src/test/regress/sql/copyencoding.sql
+++ b/src/test/regress/sql/copyencoding.sql
@@ -23,6 +23,13 @@ CREATE TABLE copy_encoding_tab (t text);
  COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
  -- Read UTF8 data as LATIN1: no error
  COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Non-server encodings have distinct code paths.
+\set fname :abs_builddir '/results/copyencoding_gb18030.csv'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+\set fname :abs_builddir '/results/copyencoding_gb18030.data'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030');
  
  -- Use client_encoding
  SET client_encoding TO UTF8;
diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql

new file mode 100644 (file)

index 0000000..b9543c0
--- /dev/null
+++ b/src/test/regress/sql/encoding.sql
@@ -0,0 +1,228 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+\getenv libdir PG_LIBDIR
+\getenv dlsuffix PG_DLSUFFIX
+
+\set regresslib :libdir '/regress' :dlsuffix
+
+CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
+    AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
+    AS :'regresslib' LANGUAGE C STRICT;
+
+
+CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
+INSERT INTO regress_encoding
+VALUES ('café',
+        'caf' || test_bytea_to_text('\xc3'),
+        'café' || test_bytea_to_text('\x00') || 'dcba',
+        'caf' || test_bytea_to_text('\xc300') || 'dcba');
+
+SELECT good, truncated, with_nul FROM regress_encoding;
+
+SELECT length(good) FROM regress_encoding;
+SELECT substring(good, 3, 1) FROM regress_encoding;
+SELECT substring(good, 4, 1) FROM regress_encoding;
+SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
+SELECT reverse(good) FROM regress_encoding;
+
+-- invalid short mb character = error
+SELECT length(truncated) FROM regress_encoding;
+SELECT substring(truncated, 1, 1) FROM regress_encoding;
+SELECT reverse(truncated) FROM regress_encoding;
+-- invalid short mb character = silently dropped
+SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
+
+-- PostgreSQL doesn't allow strings to contain NUL.  If a corrupted string
+-- contains NUL at a character boundary, some functions treat it as a character
+-- while others treat it as a terminator, depending on implementation details.
+
+-- NUL = terminator
+SELECT length(with_nul) FROM regress_encoding;
+SELECT substring(with_nul, 3, 1) FROM regress_encoding;
+SELECT substring(with_nul, 4, 1) FROM regress_encoding;
+SELECT substring(with_nul, 5, 1) FROM regress_encoding;
+SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
+SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
+-- NUL = character
+SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
+
+-- If a corrupted string contains NUL in the tail bytes of a multibyte
+-- character (invalid in all encodings), it is considered part of the
+-- character for length purposes.  An error will only be raised in code paths
+-- that convert or verify encodings.
+
+SELECT length(truncated_with_nul) FROM regress_encoding;
+SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
+SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
+SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
+SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
+SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
+SELECT reverse(truncated_with_nul) FROM regress_encoding;
+
+-- unbounded: sequence would overrun the string!
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
+FROM regress_encoding;
+
+-- condition detected when using the length/range variants
+SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
+FROM regress_encoding;
+SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
+FROM regress_encoding;
+
+-- unbounded: sequence would overrun the string, if the terminator were really
+-- the end of it
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
+FROM regress_encoding;
+
+-- condition detected when using the cstr variants
+SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+
+DROP TABLE regress_encoding;
+
+-- mb<->wchar conversions
+CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
+RETURNS VOID LANGUAGE plpgsql AS
+$$
+DECLARE
+       prefix text;
+       len int;
+       wchars int[];
+       round_trip bytea;
+       result text;
+BEGIN
+       prefix := rpad(encoding || ' ' || description || ':', 28);
+
+       -- XXX could also test validation, length functions and include client
+       -- only encodings with these test cases
+
+       IF test_valid_server_encoding(encoding) THEN
+               wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
+               round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
+               if input = round_trip then
+                       result := 'OK';
+               elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
+                       result := 'truncated';
+               else
+                       result := 'failed';
+               end if;
+               RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
+       END IF;
+END;
+$$;
+-- No validation is done on the encoding itself, just the length to avoid
+-- overruns, so some of the byte sequences below are bogus.  They cover
+-- all code branches, server encodings only for now.
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+       -- LATIN1, other single-byte encodings
+       ('LATIN1', 'ASCII',    'a'),
+       ('LATIN1', 'extended', '\xe9'),
+       -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
+       -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+       -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+       -- 2 80..ff (CS1)
+       ('EUC_JP', 'ASCII',      'a'),
+       ('EUC_JP', 'CS1, short', '\x80'),
+       ('EUC_JP', 'CS1',        '\x8002'),
+       ('EUC_JP', 'CS2, short', '\x8e'),
+       ('EUC_JP', 'CS2',        '\x8e02'),
+       ('EUC_JP', 'CS3, short', '\x8f'),
+       ('EUC_JP', 'CS3, short', '\x8f02'),
+       ('EUC_JP', 'CS3',        '\x8f0203'),
+       -- EUC_CN
+       -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+       -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+       -- 2 80..ff (CS1)
+       ('EUC_CN', 'ASCII',      'a'),
+       ('EUC_CN', 'CS1, short', '\x80'),
+       ('EUC_CN', 'CS1',        '\x8002'),
+       ('EUC_CN', 'CS2, short', '\x8e'),
+       ('EUC_CN', 'CS2, short', '\x8e02'),
+       ('EUC_CN', 'CS2',        '\x8e0203'),
+       ('EUC_CN', 'CS3, short', '\x8f'),
+       ('EUC_CN', 'CS3, short', '\x8f02'),
+       ('EUC_CN', 'CS3',        '\x8f0203'),
+       -- EUC_TW:
+       -- 4 8e (CS2)
+       -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+       -- 2 80..ff (CS1)
+       ('EUC_TW', 'ASCII',      'a'),
+       ('EUC_TW', 'CS1, short', '\x80'),
+       ('EUC_TW', 'CS1',        '\x8002'),
+       ('EUC_TW', 'CS2, short', '\x8e'),
+       ('EUC_TW', 'CS2, short', '\x8e02'),
+       ('EUC_TW', 'CS2, short', '\x8e0203'),
+       ('EUC_TW', 'CS2',        '\x8e020304'),
+       ('EUC_TW', 'CS3, short', '\x8f'),
+       ('EUC_TW', 'CS3, short', '\x8f02'),
+       ('EUC_TW', 'CS3',        '\x8f0203'),
+       -- UTF8
+       -- 2 c0..df
+       -- 3 e0..ef
+       -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+       -- 5 f8..fb (not supported)
+       -- 6 fc..fd (not supported)
+       ('UTF8',   'ASCII',               'a'),
+       ('UTF8',   '2 byte, short',       '\xdf'),
+       ('UTF8',   '2 byte',              '\xdf82'),
+       ('UTF8',   '3 byte, short',       '\xef'),
+       ('UTF8',   '3 byte, short',       '\xef82'),
+       ('UTF8',   '3 byte',              '\xef8283'),
+       ('UTF8',   '4 byte, short',       '\xf7'),
+       ('UTF8',   '4 byte, short',       '\xf782'),
+       ('UTF8',   '4 byte, short',       '\xf78283'),
+       ('UTF8',   '4 byte',              '\xf7828384'),
+       ('UTF8',   '5 byte, unsupported', '\xfb'),
+       ('UTF8',   '5 byte, unsupported', '\xfb82'),
+       ('UTF8',   '5 byte, unsupported', '\xfb8283'),
+       ('UTF8',   '5 byte, unsupported', '\xfb828384'),
+       ('UTF8',   '5 byte, unsupported', '\xfb82838485'),
+       ('UTF8',   '6 byte, unsupported', '\xfd'),
+       ('UTF8',   '6 byte, unsupported', '\xfd82'),
+       ('UTF8',   '6 byte, unsupported', '\xfd8283'),
+       ('UTF8',   '6 byte, unsupported', '\xfd828384'),
+       ('UTF8',   '6 byte, unsupported', '\xfd82838485'),
+       ('UTF8',   '6 byte, unsupported', '\xfd8283848586'),
+       -- MULE_INTERNAL
+       -- 2 81..8d LC1
+       -- 3 90..99 LC2
+       ('MULE_INTERNAL', 'ASCII',         'a'),
+       ('MULE_INTERNAL', 'LC1, short',    '\x81'),
+       ('MULE_INTERNAL', 'LC1',           '\x8182'),
+       ('MULE_INTERNAL', 'LC2, short',    '\x90'),
+       ('MULE_INTERNAL', 'LC2, short',    '\x9082'),
+       ('MULE_INTERNAL', 'LC2',           '\x908283');
+
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+
+-- Clean up the test table and every helper function created by this script.
+DROP TABLE encoding_tests;
+DROP FUNCTION test_encoding;
+DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_wchars_to_text;
+DROP FUNCTION test_valid_server_encoding;
+DROP FUNCTION test_mblen_func;
+DROP FUNCTION test_bytea_to_text;
+DROP FUNCTION test_text_to_bytea;
+
+
+-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
+-- Levenshtein distance metric: exercise character length cache.
+SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+-- JSON errcontext: truncate long data.
+SELECT repeat(U&'\00A7', 30)::json;
diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql

new file mode 100644 (file)

index 0000000..1851b2a
--- /dev/null
+++ b/src/test/regress/sql/euc_kr.sql
@@ -0,0 +1,12 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01.  Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
+SELECT POSITION(
+       convert_from('\xbcf6c7d0', 'EUC_KR') IN
+       convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
author	Thomas Munro <tmunro@postgresql.org>
	Sun, 11 Jan 2026 21:20:06 +0000 (10:20 +1300)
committer	Thomas Munro <tmunro@postgresql.org>
	Sun, 8 Feb 2026 23:44:12 +0000 (12:44 +1300)
contrib/pg_trgm/Makefile		patch \| blob \| blame \| history
contrib/pg_trgm/data/trgm_utf8.data	[new file with mode: 0644]	patch \| blob
contrib/pg_trgm/expected/pg_utf8_trgm.out	[new file with mode: 0644]	patch \| blob
contrib/pg_trgm/expected/pg_utf8_trgm_1.out	[new file with mode: 0644]	patch \| blob
contrib/pg_trgm/meson.build		patch \| blob \| blame \| history
contrib/pg_trgm/sql/pg_utf8_trgm.sql	[new file with mode: 0644]	patch \| blob
src/backend/utils/adt/arrayfuncs.c		patch \| blob \| blame \| history
src/test/regress/expected/copyencoding.out		patch \| blob \| blame \| history
src/test/regress/expected/encoding.out	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/encoding_1.out	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/euc_kr.out	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/euc_kr_1.out	[new file with mode: 0644]	patch \| blob
src/test/regress/parallel_schedule		patch \| blob \| blame \| history
src/test/regress/regress.c		patch \| blob \| blame \| history
src/test/regress/sql/copyencoding.sql		patch \| blob \| blame \| history
src/test/regress/sql/encoding.sql	[new file with mode: 0644]	patch \| blob
src/test/regress/sql/euc_kr.sql	[new file with mode: 0644]	patch \| blob