From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 20 Jan 2026 09:53:28 +0000 (+0200)
Subject: Fix buffer overflows in pg_trgm due to lower-casing
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=00896ddaf41fa7b725991120678d544c18c6af70;p=thirdparty%2Fpostgresql.git

Fix buffer overflows in pg_trgm due to lower-casing

The code made a subtle assumption that the lower-cased version of a
string never has more characters than the original. That is not always
true. For example, in a database with the latin9 encoding:

    latin9db=# select lower(U&'\00CC' COLLATE "lt-x-icu");
       lower
    -----------
     i\x1A\x1A
    (1 row)

In this example, lower-casing expands the single input character into
three characters.

The generate_trgm_only() function relied on that assumption in two
ways:

- It used "slen * pg_database_encoding_max_length() + 4" to allocate
  the buffer to hold the lowercased and blank-padded string. That
  formula accounts for expansion if the lower-case characters are
  longer (in bytes) than the originals, but it's still not enough if
  the lower-cased string contains more *characters* than the original.

- Its callers sized the output array to hold the trigrams extracted
  from the input string with the formula "(slen / 2 + 1) * 3", where
  'slen' is the input string length in bytes. (The formula was
  generous to account for the possibility that RPADDING was set to 2.)
  That's also not enough if one input byte can turn into multiple
  characters.

To fix, introduce a growable trigram array and give up on trying to
choose the correct max buffer sizes ahead of time.

Backpatch to v18, but no further. In previous versions lower-casing was
done character by character, and thus the assumption that lower-casing
doesn't change the character length was valid. That was changed in v18,
commit fb1a18810f.

Security: CVE-2026-2007
Reviewed-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Jeff Davis <pgsql@j-davis.com>
---

diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 581ca48dd59..4bb5506647c 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -66,6 +66,78 @@ typedef uint8 TrgmBound;
 #define WORD_SIMILARITY_STRICT		0x02	/* force bounds of extent to match
 											 * word bounds */
 
+/*
+ * A growable array of trigrams
+ *
+ * The actual array of trigrams is in 'datum'.  Note that the other fields in
+ * 'datum', i.e. datum->flags and the varlena length, are not kept up to date
+ * when items are added to the growable array.  We merely reserve the space
+ * for them here.  You must fill those other fields before using 'datum' as a
+ * proper TRGM datum.
+ */
+typedef struct
+{
+	TRGM	   *datum;			/* trigram array */
+	int			length;			/* number of trigrams in the array */
+	int			allocated;		/* allocated size of 'datum' (# of trigrams) */
+} growable_trgm_array;
+
+/*
+ * Allocate a new growable array.
+ *
+ * 'slen' is the size of the source string that we're extracting the trigrams
+ * from.  It is used to choose the initial size of the array.
+ */
+static void
+init_trgm_array(growable_trgm_array *arr, int slen)
+{
+	size_t		init_size;
+
+	/*
+	 * In the extreme case, the input string consists entirely of one
+	 * character words, like "a b c", where each word is expanded to two
+	 * trigrams.  This is not a strict upper bound though, because when
+	 * IGNORECASE is defined, we convert the input string to lowercase before
+	 * extracting the trigrams, which in rare cases can expand one input
+	 * character into multiple characters.
+	 */
+	init_size = (size_t) slen + 1;
+
+	/*
+	 * Guard against possible overflow in the palloc request.  (We don't worry
+	 * about the additive constants, since palloc can detect requests that are
+	 * a little above MaxAllocSize --- we just need to prevent integer
+	 * overflow in the multiplications.)
+	 */
+	if (init_size > MaxAllocSize / sizeof(trgm))
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("out of memory")));
+
+	arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size));
+	arr->allocated = init_size;
+	arr->length = 0;
+}
+
+/* Make sure the array can hold at least 'needed' more trigrams */
+static void
+enlarge_trgm_array(growable_trgm_array *arr, int needed)
+{
+	size_t		new_needed = (size_t) arr->length + needed;
+
+	if (new_needed > arr->allocated)
+	{
+		/* Guard against possible overflow, like in init_trgm_array */
+		if (new_needed > MaxAllocSize / sizeof(trgm))
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("out of memory")));
+
+		arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed));
+		arr->allocated = new_needed;
+	}
+}
+
 /*
  * Module load callback
  */
@@ -267,13 +339,18 @@ compact_trigram(trgm *tptr, char *str, int bytelen)
 /*
  * Adds trigrams from the word in 'str' (already padded if necessary).
  */
-static trgm *
-make_trigrams(trgm *tptr, char *str, int bytelen)
+static void
+make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
 {
+	trgm	   *tptr;
 	char	   *ptr = str;
 
 	if (bytelen < 3)
-		return tptr;
+		return;
+
+	/* max number of trigrams = bytelen - 2 */
+	enlarge_trgm_array(dst, bytelen - 2);
+	tptr = GETARR(dst->datum) + dst->length;
 
 	if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
 	{
@@ -303,7 +380,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
 				tptr++;
 
 				if (ptr == str + bytelen - 2)
-					return tptr;
+					goto done;
 			}
 
 			lenfirst = 1;
@@ -314,10 +391,10 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
 		{
 			lenfirst = pg_mblen(ptr);
 			if (ptr + lenfirst >= str + bytelen)
-				return tptr;
+				goto done;
 			lenmiddle = pg_mblen(ptr + lenfirst);
 			if (ptr + lenfirst + lenmiddle >= str + bytelen)
-				return tptr;
+				goto done;
 			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
 		}
 
@@ -344,35 +421,54 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
 		}
 	}
 
-	return tptr;
+done:
+	dst->length = tptr - GETARR(dst->datum);
+	Assert(dst->length <= dst->allocated);
 }
 
 /*
  * Make array of trigrams without sorting and removing duplicate items.
  *
- * trg: where to return the array of trigrams.
+ * dst: where to return the array of trigrams.
  * str: source string, of length slen bytes.
- * bounds: where to return bounds of trigrams (if needed).
- *
- * Returns length of the generated array.
+ * bounds_p: where to return bounds of trigrams (if needed).
  */
-static int
-generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
+static void
+generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
 {
-	trgm	   *tptr;
+	size_t		buflen;
 	char	   *buf;
 	int			bytelen;
 	char	   *bword,
 			   *eword;
+	TrgmBound  *bounds = NULL;
+	int			bounds_allocated = 0;
 
-	if (slen + LPADDING + RPADDING < 3 || slen == 0)
-		return 0;
+	init_trgm_array(dst, slen);
 
-	tptr = trg;
+	/*
+	 * If requested, allocate an array for the bounds, with the same size as
+	 * the trigram array.
+	 */
+	if (bounds_p)
+	{
+		bounds_allocated = dst->allocated;
+		bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated);
+	}
 
-	/* Allocate a buffer for case-folded, blank-padded words */
-	buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
+	if (slen + LPADDING + RPADDING < 3 || slen == 0)
+		return;
 
+	/*
+	 * Allocate a buffer for case-folded, blank-padded words.
+	 *
+	 * As an initial guess, allocate a buffer large enough to hold the
+	 * original string with padding, which is always enough when compiled with
+	 * !IGNORECASE.  If the case-folding produces a string longer than the
+	 * original, we'll grow the buffer.
+	 */
+	buflen = (size_t) slen + 4;
+	buf = (char *) palloc(buflen);
 	if (LPADDING > 0)
 	{
 		*buf = ' ';
@@ -383,49 +479,57 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 	eword = str;
 	while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
 	{
+		int			oldlen;
+
+		/* Convert word to lower case before extracting trigrams from it */
 #ifdef IGNORECASE
-		bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
-		bytelen = strlen(bword);
+		{
+			char	   *lowered;
+
+			lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
+			bytelen = strlen(lowered);
+
+			/* grow the buffer if necessary */
+			if (bytelen > buflen - 4)
+			{
+				pfree(buf);
+				buflen = (size_t) bytelen + 4;
+				buf = (char *) palloc(buflen);
+				if (LPADDING > 0)
+				{
+					*buf = ' ';
+					if (LPADDING > 1)
+						*(buf + 1) = ' ';
+				}
+			}
+			memcpy(buf + LPADDING, lowered, bytelen);
+			pfree(lowered);
+		}
 #else
 		bytelen = eword - bword;
-#endif
-
 		memcpy(buf + LPADDING, bword, bytelen);
-
-#ifdef IGNORECASE
-		pfree(bword);
 #endif
 
 		buf[LPADDING + bytelen] = ' ';
 		buf[LPADDING + bytelen + 1] = ' ';
 
 		/* Calculate trigrams marking their bounds if needed */
+		oldlen = dst->length;
+		make_trigrams(dst, buf, bytelen + LPADDING + RPADDING);
 		if (bounds)
-			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
-		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
-		if (bounds)
-			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
+		{
+			if (bounds_allocated < dst->length)
+			{
+				/* repalloc may move the array; keep the caller's pointer valid */
+				bounds = *bounds_p = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated);
+				bounds_allocated = dst->allocated;
+			}
+			bounds[oldlen] |= TRGM_BOUND_LEFT;
+			bounds[dst->length - 1] |= TRGM_BOUND_RIGHT;
+		}
 	}
 
 	pfree(buf);
-
-	return tptr - trg;
-}
-
-/*
- * Guard against possible overflow in the palloc requests below.  (We
- * don't worry about the additive constants, since palloc can detect
- * requests that are a little above MaxAllocSize --- we just need to
- * prevent integer overflow in the multiplications.)
- */
-static void
-protect_out_of_mem(int slen)
-{
-	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
-		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("out of memory")));
 }
 
 
 /*
@@ -439,19 +543,14 @@ TRGM *
 generate_trgm(char *str, int slen)
 {
 	TRGM	   *trg;
+	growable_trgm_array arr;
 	int			len;
 
-	protect_out_of_mem(slen);
-
-	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+	generate_trgm_only(&arr, str, slen, NULL);
+	len = arr.length;
+	trg = arr.datum;
 	trg->flag = ARRKEY;
 
-	len = generate_trgm_only(GETARR(trg), str, slen, NULL);
-	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
-
-	if (len == 0)
-		return trg;
-
 	/*
 	 * Make trigrams unique.
 	 */
@@ -706,8 +805,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 {
 	bool	   *found;
 	pos_trgm   *ptrg;
-	trgm	   *trg1;
-	trgm	   *trg2;
+	growable_trgm_array trg1;
+	growable_trgm_array trg2;
 	int			len1,
 				len2,
 				len,
@@ -716,27 +815,21 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 				ulen1;
 	int		   *trg2indexes;
 	float4		result;
-	TrgmBound  *bounds;
-
-	protect_out_of_mem(slen1 + slen2);
+	TrgmBound  *bounds = NULL;
 
 	/* Make positional trigrams */
-	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
-	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
-	if (flags & WORD_SIMILARITY_STRICT)
-		bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
-	else
-		bounds = NULL;
 
-	len1 = generate_trgm_only(trg1, str1, slen1, NULL);
-	len2 = generate_trgm_only(trg2, str2, slen2, bounds);
+	generate_trgm_only(&trg1, str1, slen1, NULL);
+	len1 = trg1.length;
+	generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
+	len2 = trg2.length;
 
-	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
+	ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
 	len = len1 + len2;
 	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
 
-	pfree(trg1);
-	pfree(trg2);
+	pfree(trg1.datum);
+	pfree(trg2.datum);
 
 	/*
 	 * Merge positional trigrams array: enumerate each trigram and find its
@@ -937,23 +1030,21 @@ TRGM *
 generate_wildcard_trgm(const char *str, int slen)
 {
 	TRGM	   *trg;
-	char	   *buf,
-			   *buf2;
-	trgm	   *tptr;
+	growable_trgm_array arr;
+	char	   *buf;
 	int			len,
 				bytelen;
 	const char *eword;
 
-	protect_out_of_mem(slen);
-
-	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
-	trg->flag = ARRKEY;
-	SET_VARSIZE(trg, TRGMHDRSIZE);
-
 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
+	{
+		trg = (TRGM *) palloc(TRGMHDRSIZE);
+		trg->flag = ARRKEY;
+		SET_VARSIZE(trg, TRGMHDRSIZE);
 		return trg;
+	}
 
-	tptr = GETARR(trg);
+	init_trgm_array(&arr, slen);
 
 	/* Allocate a buffer for blank-padded, but not yet case-folded, words */
 	buf = palloc_array(char, slen + 4);
@@ -965,37 +1056,39 @@ generate_wildcard_trgm(const char *str, int slen)
 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
 									  buf, &bytelen)) != NULL)
 	{
+		char	   *word;
+
 #ifdef IGNORECASE
-		buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
-		bytelen = strlen(buf2);
+		word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
+		bytelen = strlen(word);
 #else
-		buf2 = buf;
+		word = buf;
 #endif
 
 		/*
 		 * count trigrams
 		 */
-		tptr = make_trigrams(tptr, buf2, bytelen);
+		make_trigrams(&arr, word, bytelen);
 
 #ifdef IGNORECASE
-		pfree(buf2);
+		pfree(word);
 #endif
 	}
 
 	pfree(buf);
 
-	if ((len = tptr - GETARR(trg)) == 0)
-		return trg;
-
 	/*
 	 * Make trigrams unique.
 	 */
+	trg = arr.datum;
+	len = arr.length;
 	if (len > 1)
 	{
 		qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
 		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
+	trg->flag = ARRKEY;
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
 
 	return trg;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9f5ee8fd482..77518489412 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3737,6 +3737,7 @@ gistxlogPageReuse
 gistxlogPageSplit
 gistxlogPageUpdate
 grouping_sets_data
+growable_trgm_array
 gseg_picksplit_item
 gss_OID_set
 gss_buffer_desc