]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix buffer overflows in pg_trgm due to lower-casing
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 20 Jan 2026 09:53:28 +0000 (11:53 +0200)
committerThomas Munro <tmunro@postgresql.org>
Sun, 8 Feb 2026 23:08:58 +0000 (12:08 +1300)
The code made a subtle assumption that the lower-cased version of a
string never has more characters than the original. That is not always
true. For example, in a database with the latin9 encoding:

    latin9db=# select lower(U&'\00CC' COLLATE "lt-x-icu");
       lower
    -----------
     i\x1A\x1A
    (1 row)

In this example, lower-casing expands the single input character into
three characters.

The generate_trgm_only() function relied on that assumption in two
ways:

- It used "slen * pg_database_encoding_max_length() + 4" to allocate
  the buffer to hold the lowercased and blank-padded string. That
  formula accounts for expansion if the lower-case characters are
  longer (in bytes) than the originals, but it's still not enough if
  the lower-cased string contains more *characters* than the original.

- Its callers sized the output array to hold the trigrams extracted
  from the input string with the formula "(slen / 2 + 1) * 3", where
  'slen' is the input string length in bytes. (The formula was
  generous to account for the possibility that RPADDING was set to 2.)
  That's also not enough if one input byte can turn into multiple
  characters.

To fix, introduce a growable trigram array and give up on trying to
choose the correct max buffer sizes ahead of time.

Backpatch to v18, but no further. In previous versions lower-casing was
done character by character, and thus the assumption that lower-casing
doesn't change the character length was valid. That was changed in v18,
commit fb1a18810f.

Security: CVE-2026-2007
Reviewed-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Jeff Davis <pgsql@j-davis.com>
contrib/pg_trgm/trgm_op.c
src/tools/pgindent/typedefs.list

index 581ca48dd59c061f76aaa8b4f71438050eb71807..4bb5506647c5da3fde21549932300a7088b0a693 100644 (file)
@@ -66,6 +66,78 @@ typedef uint8 TrgmBound;
 #define WORD_SIMILARITY_STRICT         0x02    /* force bounds of extent to match
                                                                                         * word bounds */
 
+/*
+ * A growable array of trigrams
+ *
+ * The actual array of trigrams is in 'datum'.  Note that the other fields in
+ * 'datum', i.e. datum->flags and the varlena length, are not kept up to date
+ * when items are added to the growable array.  We merely reserve the space
+ * for them here.  You must fill those other fields before using 'datum' as a
+ * proper TRGM datum.
+ */
+typedef struct
+{
+       TRGM       *datum;                      /* trigram array */
+       int                     length;                 /* number of trigrams in the array */
+       int                     allocated;              /* allocated size of 'datum' (# of trigrams) */
+} growable_trgm_array;
+
+/*
+ * Allocate a new growable array.
+ *
+ * 'slen' is the size of the source string that we're extracting the trigrams
+ * from.  It is used to choose the initial size of the array.
+ */
+static void
+init_trgm_array(growable_trgm_array *arr, int slen)
+{
+       size_t          init_size;
+
+       /*
+        * In the extreme case, the input string consists entirely of one
+        * character words, like "a b c", where each word is expanded to two
+        * trigrams.  This is not a strict upper bound though, because when
+        * IGNORECASE is defined, we convert the input string to lowercase before
+        * extracting the trigrams, which in rare cases can expand one input
+        * character into multiple characters.
+        */
+       init_size = (size_t) slen + 1;
+
+       /*
+        * Guard against possible overflow in the palloc request.  (We don't worry
+        * about the additive constants, since palloc can detect requests that are
+        * a little above MaxAllocSize --- we just need to prevent integer
+        * overflow in the multiplications.)
+        */
+       if (init_size > MaxAllocSize / sizeof(trgm))
+               ereport(ERROR,
+                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                errmsg("out of memory")));
+
+       arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size));
+       arr->allocated = init_size;
+       arr->length = 0;
+}
+
+/* Make sure the array can hold at least 'needed' more trigrams */
+static void
+enlarge_trgm_array(growable_trgm_array *arr, int needed)
+{
+       size_t          new_needed = (size_t) arr->length + needed;
+
+       if (new_needed > arr->allocated)
+       {
+               /* Guard against possible overflow, like in init_trgm_array */
+               if (new_needed > MaxAllocSize / sizeof(trgm))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                        errmsg("out of memory")));
+
+               arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed));
+               arr->allocated = new_needed;
+       }
+}
+
 /*
  * Module load callback
  */
@@ -267,13 +339,18 @@ compact_trigram(trgm *tptr, char *str, int bytelen)
 /*
  * Adds trigrams from the word in 'str' (already padded if necessary).
  */
-static trgm *
-make_trigrams(trgm *tptr, char *str, int bytelen)
+static void
+make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
 {
+       trgm       *tptr;
        char       *ptr = str;
 
        if (bytelen < 3)
-               return tptr;
+               return;
+
+       /* max number of trigrams = strlen - 2 */
+       enlarge_trgm_array(dst, bytelen - 2);
+       tptr = GETARR(dst->datum) + dst->length;
 
        if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
        {
@@ -303,7 +380,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
                                tptr++;
 
                                if (ptr == str + bytelen - 2)
-                                       return tptr;
+                                       goto done;
                        }
 
                        lenfirst = 1;
@@ -314,10 +391,10 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
                {
                        lenfirst = pg_mblen(ptr);
                        if (ptr + lenfirst >= str + bytelen)
-                               return tptr;
+                               goto done;
                        lenmiddle = pg_mblen(ptr + lenfirst);
                        if (ptr + lenfirst + lenmiddle >= str + bytelen)
-                               return tptr;
+                               goto done;
                        lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
                }
 
@@ -344,35 +421,54 @@ make_trigrams(trgm *tptr, char *str, int bytelen)
                }
        }
 
-       return tptr;
+done:
+       dst->length = tptr - GETARR(dst->datum);
+       Assert(dst->length <= dst->allocated);
 }
 
 /*
  * Make array of trigrams without sorting and removing duplicate items.
  *
- * trg: where to return the array of trigrams.
+ * dst: where to return the array of trigrams.
  * str: source string, of length slen bytes.
- * bounds: where to return bounds of trigrams (if needed).
- *
- * Returns length of the generated array.
+ * bounds_p: where to return bounds of trigrams (if needed).
  */
-static int
-generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
+static void
+generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
 {
-       trgm       *tptr;
+       size_t          buflen;
        char       *buf;
        int                     bytelen;
        char       *bword,
                           *eword;
+       TrgmBound  *bounds = NULL;
+       int                     bounds_allocated = 0;
 
-       if (slen + LPADDING + RPADDING < 3 || slen == 0)
-               return 0;
+       init_trgm_array(dst, slen);
 
-       tptr = trg;
+       /*
+        * If requested, allocate an array for the bounds, with the same size as
+        * the trigram array.
+        */
+       if (bounds_p)
+       {
+               bounds_allocated = dst->allocated;
+               bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated);
+       }
 
-       /* Allocate a buffer for case-folded, blank-padded words */
-       buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
+       if (slen + LPADDING + RPADDING < 3 || slen == 0)
+               return;
 
+       /*
+        * Allocate a buffer for case-folded, blank-padded words.
+        *
+        * As an initial guess, allocate a buffer large enough to hold the
+        * original string with padding, which is always enough when compiled with
+        * !IGNORECASE.  If the case-folding produces a string longer than the
+        * original, we'll grow the buffer.
+        */
+       buflen = (size_t) slen + 4;
+       buf = (char *) palloc(buflen);
        if (LPADDING > 0)
        {
                *buf = ' ';
@@ -383,49 +479,57 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
        eword = str;
        while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
        {
+               int                     oldlen;
+
+               /* Convert word to lower case before extracting trigrams from it */
 #ifdef IGNORECASE
-               bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
-               bytelen = strlen(bword);
+               {
+                       char       *lowered;
+
+                       lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
+                       bytelen = strlen(lowered);
+
+                       /* grow the buffer if necessary */
+                       if (bytelen > buflen - 4)
+                       {
+                               pfree(buf);
+                               buflen = (size_t) bytelen + 4;
+                               buf = (char *) palloc(buflen);
+                               if (LPADDING > 0)
+                               {
+                                       *buf = ' ';
+                                       if (LPADDING > 1)
+                                               *(buf + 1) = ' ';
+                               }
+                       }
+                       memcpy(buf + LPADDING, lowered, bytelen);
+                       pfree(lowered);
+               }
 #else
                bytelen = eword - bword;
-#endif
-
                memcpy(buf + LPADDING, bword, bytelen);
-
-#ifdef IGNORECASE
-               pfree(bword);
 #endif
 
                buf[LPADDING + bytelen] = ' ';
                buf[LPADDING + bytelen + 1] = ' ';
 
                /* Calculate trigrams marking their bounds if needed */
+               oldlen = dst->length;
+               make_trigrams(dst, buf, bytelen + LPADDING + RPADDING);
                if (bounds)
-                       bounds[tptr - trg] |= TRGM_BOUND_LEFT;
-               tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
-               if (bounds)
-                       bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
+               {
+                       if (bounds_allocated < dst->length)
+                       {
+                               bounds = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated);
+                               bounds_allocated = dst->allocated;
+                       }
+
+                       bounds[oldlen] |= TRGM_BOUND_LEFT;
+                       bounds[dst->length - 1] |= TRGM_BOUND_RIGHT;
+               }
        }
 
        pfree(buf);
-
-       return tptr - trg;
-}
-
-/*
- * Guard against possible overflow in the palloc requests below.  (We
- * don't worry about the additive constants, since palloc can detect
- * requests that are a little above MaxAllocSize --- we just need to
- * prevent integer overflow in the multiplications.)
- */
-static void
-protect_out_of_mem(int slen)
-{
-       if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
-               (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
-               ereport(ERROR,
-                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-                                errmsg("out of memory")));
 }
 
 /*
@@ -439,19 +543,14 @@ TRGM *
 generate_trgm(char *str, int slen)
 {
        TRGM       *trg;
+       growable_trgm_array arr;
        int                     len;
 
-       protect_out_of_mem(slen);
-
-       trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+       generate_trgm_only(&arr, str, slen, NULL);
+       len = arr.length;
+       trg = arr.datum;
        trg->flag = ARRKEY;
 
-       len = generate_trgm_only(GETARR(trg), str, slen, NULL);
-       SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
-
-       if (len == 0)
-               return trg;
-
        /*
         * Make trigrams unique.
         */
@@ -706,8 +805,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 {
        bool       *found;
        pos_trgm   *ptrg;
-       trgm       *trg1;
-       trgm       *trg2;
+       growable_trgm_array trg1;
+       growable_trgm_array trg2;
        int                     len1,
                                len2,
                                len,
@@ -716,27 +815,21 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
                                ulen1;
        int                *trg2indexes;
        float4          result;
-       TrgmBound  *bounds;
-
-       protect_out_of_mem(slen1 + slen2);
+       TrgmBound  *bounds = NULL;
 
        /* Make positional trigrams */
-       trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
-       trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
-       if (flags & WORD_SIMILARITY_STRICT)
-               bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
-       else
-               bounds = NULL;
 
-       len1 = generate_trgm_only(trg1, str1, slen1, NULL);
-       len2 = generate_trgm_only(trg2, str2, slen2, bounds);
+       generate_trgm_only(&trg1, str1, slen1, NULL);
+       len1 = trg1.length;
+       generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
+       len2 = trg2.length;
 
-       ptrg = make_positional_trgm(trg1, len1, trg2, len2);
+       ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
        len = len1 + len2;
        qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
 
-       pfree(trg1);
-       pfree(trg2);
+       pfree(trg1.datum);
+       pfree(trg2.datum);
 
        /*
         * Merge positional trigrams array: enumerate each trigram and find its
@@ -937,23 +1030,21 @@ TRGM *
 generate_wildcard_trgm(const char *str, int slen)
 {
        TRGM       *trg;
-       char       *buf,
-                          *buf2;
-       trgm       *tptr;
+       growable_trgm_array arr;
+       char       *buf;
        int                     len,
                                bytelen;
        const char *eword;
 
-       protect_out_of_mem(slen);
-
-       trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
-       trg->flag = ARRKEY;
-       SET_VARSIZE(trg, TRGMHDRSIZE);
-
        if (slen + LPADDING + RPADDING < 3 || slen == 0)
+       {
+               trg = (TRGM *) palloc(TRGMHDRSIZE);
+               trg->flag = ARRKEY;
+               SET_VARSIZE(trg, TRGMHDRSIZE);
                return trg;
+       }
 
-       tptr = GETARR(trg);
+       init_trgm_array(&arr, slen);
 
        /* Allocate a buffer for blank-padded, but not yet case-folded, words */
        buf = palloc_array(char, slen + 4);
@@ -965,37 +1056,39 @@ generate_wildcard_trgm(const char *str, int slen)
        while ((eword = get_wildcard_part(eword, slen - (eword - str),
                                                                          buf, &bytelen)) != NULL)
        {
+               char       *word;
+
 #ifdef IGNORECASE
-               buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
-               bytelen = strlen(buf2);
+               word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
+               bytelen = strlen(word);
 #else
-               buf2 = buf;
+               word = buf;
 #endif
 
                /*
                 * count trigrams
                 */
-               tptr = make_trigrams(tptr, buf2, bytelen);
+               make_trigrams(&arr, word, bytelen);
 
 #ifdef IGNORECASE
-               pfree(buf2);
+               pfree(word);
 #endif
        }
 
        pfree(buf);
 
-       if ((len = tptr - GETARR(trg)) == 0)
-               return trg;
-
        /*
         * Make trigrams unique.
         */
+       trg = arr.datum;
+       len = arr.length;
        if (len > 1)
        {
                qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
                len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
        }
 
+       trg->flag = ARRKEY;
        SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
 
        return trg;
index 9f5ee8fd4829550ea01fe12aa0e987493ff0af1b..775184894122bc2d808202eec9c3ec579cc7ae34 100644 (file)
@@ -3737,6 +3737,7 @@ gistxlogPageReuse
 gistxlogPageSplit
 gistxlogPageUpdate
 grouping_sets_data
+growable_trgm_array
 gseg_picksplit_item
 gss_OID_set
 gss_buffer_desc