Fix SUBSTRING() for toasted multibyte characters.

author Noah Misch <noah@leadboat.com>

Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)

committer Noah Misch <noah@leadboat.com>

Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)
author Noah Misch <noah@leadboat.com>
Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)
committer Noah Misch <noah@leadboat.com>
Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index dbecd7160d6f4ba99591da24b111ae5dbbbf5db1..d8ea7f45bcc6ef8d11b9e21dc7775c897947fdc1 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -133,6 +133,7 @@ static text *text_substring(Datum str,
                                                         int32 start,
                                                         int32 length,
                                                         bool length_not_specified);
+static int     pg_mbcharcliplen_chars(const char *mbstr, int len, int limit);
  static text *text_overlay(text *t1, text *t2, int sp, int sl);
  static int     text_position(text *t1, text *t2, Oid collid);
  static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
@@ -586,7 +587,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
         int32           S = start;              /* start position */
         int32           S1;                             /* adjusted start position */
         int32           L1;                             /* adjusted substring length */
-       int32           E;                              /* end position */
+       int32           E;                              /* end position, exclusive */
  
         /*
          * SQL99 says S can be zero or negative (which we don't document), but we
@@ -684,11 +685,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                 else
                 {
                         /*
-                        * A zero or negative value for the end position can happen if the
-                        * start was negative or one. SQL99 says to return a zero-length
-                        * string.
+                        * Ending at position 1, exclusive, obviously yields an empty
+                        * string.  A zero or negative value can happen if the start was
+                        * negative or one. SQL99 says to return a zero-length string.
                          */
-                       if (E < 1)
+                       if (E <= 1)
                                 return cstring_to_text("");
  
                         /*
@@ -698,11 +699,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                         L1 = E - S1;
  
                         /*
-                        * Total slice size in bytes can't be any longer than the start
-                        * position plus substring length times the encoding max length.
-                        * If that overflows, we can just use -1.
+                        * Total slice size in bytes can't be any longer than the
+                        * inclusive end position times the encoding max length.  If that
+                        * overflows, we can just use -1.
                          */
-                       if (pg_mul_s32_overflow(E, eml, &slice_size))
+                       if (pg_mul_s32_overflow(E - 1, eml, &slice_size))
                                 slice_size = -1;
                 }
  
@@ -725,9 +726,17 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                         return cstring_to_text("");
                 }
  
-               /* Now we can get the actual length of the slice in MB characters */
-               slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
-                                                                                       slice_len);
+               /*
+                * Now we can get the actual length of the slice in MB characters,
+                * stopping at the end of the substring.  Continuing beyond the
+                * substring end could find an incomplete character attributable
+                * solely to DatumGetTextPSlice() chopping in the middle of a
+                * character, and it would be superfluous work at best.
+                */
+               slice_strlen =
+                       (slice_size == -1 ?
+                        pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) :
+                        pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1));
  
                 /*
                  * Check that the start position wasn't > slice_strlen. If so, SQL99
@@ -782,6 +791,35 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
         return NULL;
  }
  
+/*
+ * pg_mbcharcliplen_chars -
+ *     Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes.
+ *
+ *     This mirrors all the dubious historical behavior, so it's static to
+ *     discourage proliferation.  The assertions are specific to the one caller.
+ */
+static int
+pg_mbcharcliplen_chars(const char *mbstr, int len, int limit)
+{
+       int                     nch = 0;
+       int                     l;
+
+       Assert(len > 0);
+       Assert(limit > 0);
+       Assert(pg_database_encoding_max_length() > 1);
+
+       while (len > 0 && *mbstr)
+       {
+               l = pg_mblen_with_len(mbstr, len);
+               nch++;
+               if (nch == limit)
+                       break;
+               len -= l;
+               mbstr += l;
+       }
+       return nch;
+}
+
  /*
   * textoverlay
   *     Replace specified substring of first string with second
diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out

index ea1f38cff41d87619c7250ab3cee6d4a7ce5f830..cac1cb74782661c3dc14654aaf3d7a7d92119025 100644 (file)
--- a/src/test/regress/expected/encoding.out
+++ b/src/test/regress/expected/encoding.out
@@ -63,7 +63,13 @@ SELECT reverse(good) FROM regress_encoding;
  -- invalid short mb character = error
  SELECT length(truncated) FROM regress_encoding;
  ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
-SELECT substring(truncated, 1, 1) FROM regress_encoding;
+SELECT substring(truncated, 1, 3) FROM regress_encoding;
+ substring 
+-----------
+ caf
+(1 row)
+
+SELECT substring(truncated, 1, 4) FROM regress_encoding;
  ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
  SELECT reverse(truncated) FROM regress_encoding;
  ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
@@ -375,7 +381,43 @@ NOTICE:  MULE_INTERNAL LC2:           \x908283 -> {9470595} -> \x908283 = OK
   t
  (1 row)
  
+-- substring fetches a slice of a toasted value; unused tail of that slice is
+-- an incomplete char (bug #19406)
+CREATE TABLE toast_3b_utf8 (c text);
+INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000));
+SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8;
+ substring 
+-----------
+ …
+(1 row)
+
+SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
+ substring 
+-----------
+ 
+(1 row)
+
+-- diagnose incomplete char iff within the substring
+UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280');
+SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8;
+ substring 
+-----------
+ …
+(1 row)
+
+SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
+ERROR:  invalid byte sequence for encoding "UTF8": 0xe2 0x80
+-- substring needing last byte of its slice_size
+ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8;
+UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000);
+SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8;
+ substring 
+-----------
+ 🚀
+(1 row)
+
  DROP TABLE encoding_tests;
+DROP TABLE toast_4b_utf8;
  DROP FUNCTION test_encoding;
  DROP FUNCTION test_text_to_wchars;
  DROP FUNCTION test_mblen_func;
diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql

index b9543c0cb32622d15ff1f9cd669d02ed1055289f..782f90f0d629280d14fac4641a552c7324a8b1b3 100644 (file)
--- a/src/test/regress/sql/encoding.sql
+++ b/src/test/regress/sql/encoding.sql
@@ -40,7 +40,8 @@ SELECT reverse(good) FROM regress_encoding;
  
  -- invalid short mb character = error
  SELECT length(truncated) FROM regress_encoding;
-SELECT substring(truncated, 1, 1) FROM regress_encoding;
+SELECT substring(truncated, 1, 3) FROM regress_encoding;
+SELECT substring(truncated, 1, 4) FROM regress_encoding;
  SELECT reverse(truncated) FROM regress_encoding;
  -- invalid short mb character = silently dropped
  SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
@@ -212,7 +213,23 @@ INSERT INTO encoding_tests VALUES
  SELECT COUNT(test_encoding(encoding, description, input)) > 0
  FROM encoding_tests;
  
+-- substring fetches a slice of a toasted value; unused tail of that slice is
+-- an incomplete char (bug #19406)
+CREATE TABLE toast_3b_utf8 (c text);
+INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000));
+SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8;
+SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
+-- diagnose incomplete char iff within the substring
+UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280');
+SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8;
+SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
+-- substring needing last byte of its slice_size
+ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8;
+UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000);
+SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8;
+
  DROP TABLE encoding_tests;
+DROP TABLE toast_4b_utf8;
  DROP FUNCTION test_encoding;
  DROP FUNCTION test_text_to_wchars;
  DROP FUNCTION test_mblen_func;
author	Noah Misch <noah@leadboat.com>
	Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)
committer	Noah Misch <noah@leadboat.com>
	Sat, 14 Feb 2026 20:16:16 +0000 (12:16 -0800)
src/backend/utils/adt/varlena.c		patch \| blob \| blame \| history
src/test/regress/expected/encoding.out		patch \| blob \| blame \| history
src/test/regress/sql/encoding.sql		patch \| blob \| blame \| history