git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
ltree: fix case-insensitive matching.
author Jeff Davis <jdavis@postgresql.org>
Tue, 16 Dec 2025 19:13:17 +0000 (11:13 -0800)
committer Jeff Davis <jdavis@postgresql.org>
Tue, 16 Dec 2025 20:57:12 +0000 (12:57 -0800)
Previously, ltree_prefix_eq_ci() used lowercasing with the default
collation, while ltree_crc32_sz() used tolower() directly. These were
equivalent only if the default collation provider was libc and the
encoding was single-byte.

Change both to use casefolding with the default collation.

Backpatch through 18, where the casefolding APIs were introduced. The
bug exists in earlier versions, but fixing it there would require some
adaptation.

A REINDEX is required for ltree indexes where the database default
collation provider is not libc.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Backpatch-through: 18
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
Discussion: https://postgr.es/m/01fc00fd66f641b9693d4f9f1af0ccf44cbdfbdf.camel@j-davis.com

contrib/ltree/crc32.c
contrib/ltree/lquery_op.c
src/include/utils/pg_locale.h

index 134f46a805e569a0ed991aad4460a9577429375c..ce1b0f28e21b8cc67f4e194a561193aa8f5d7f5b 100644 (file)
 #include "postgres.h"
 #include "ltree.h"
 
+#include "crc32.h"
+#include "utils/pg_crc.h"
 #ifdef LOWER_NODE
-#include <ctype.h>
-#define TOLOWER(x)     tolower((unsigned char) (x))
-#else
-#define TOLOWER(x)     (x)
+#include "catalog/pg_collation.h"
+#include "utils/pg_locale.h"
 #endif
 
-#include "crc32.h"
-#include "utils/pg_crc.h"
+#ifdef LOWER_NODE
 
 unsigned int
 ltree_crc32_sz(const char *buf, int size)
 {
        pg_crc32        crc;
        const char *p = buf;
+       static pg_locale_t locale = NULL;
+
+       if (!locale)
+               locale = pg_newlocale_from_collation(DEFAULT_COLLATION_OID);
 
        INIT_TRADITIONAL_CRC32(crc);
        while (size > 0)
        {
-               char            c = (char) TOLOWER(*p);
+               char            foldstr[UNICODE_CASEMAP_BUFSZ];
+               int                     srclen = pg_mblen(p);
+               size_t          foldlen;
+
+               /* fold one codepoint at a time */
+               foldlen = pg_strfold(foldstr, UNICODE_CASEMAP_BUFSZ, p, srclen,
+                                                        locale);
+
+               COMP_TRADITIONAL_CRC32(crc, foldstr, foldlen);
+
+               size -= srclen;
+               p += srclen;
+       }
+       FIN_TRADITIONAL_CRC32(crc);
+       return (unsigned int) crc;
+}
+
+#else
 
-               COMP_TRADITIONAL_CRC32(crc, &c, 1);
+unsigned int
+ltree_crc32_sz(const char *buf, int size)
+{
+       pg_crc32        crc;
+       const char *p = buf;
+
+       INIT_TRADITIONAL_CRC32(crc);
+       while (size > 0)
+       {
+               COMP_TRADITIONAL_CRC32(crc, p, 1);
                size--;
                p++;
        }
        FIN_TRADITIONAL_CRC32(crc);
        return (unsigned int) crc;
 }
+
+#endif                                                 /* !LOWER_NODE */
index 0b39d64a83973f00aa88ef6be1c7e27311f6a73e..9b1de10121367023e6db4818a174d8ec88b1b7c8 100644 (file)
@@ -93,11 +93,44 @@ ltree_prefix_eq(const char *a, size_t a_sz, const char *b, size_t b_sz)
 bool
 ltree_prefix_eq_ci(const char *a, size_t a_sz, const char *b, size_t b_sz)
 {
-       char       *al = str_tolower(a, a_sz, DEFAULT_COLLATION_OID);
-       char       *bl = str_tolower(b, b_sz, DEFAULT_COLLATION_OID);
+       static pg_locale_t locale = NULL;
+       size_t          al_sz = a_sz + 1;
+       size_t          al_len;
+       char       *al = palloc(al_sz);
+       size_t          bl_sz = b_sz + 1;
+       size_t          bl_len;
+       char       *bl = palloc(bl_sz);
        bool            res;
 
-       res = (strncmp(al, bl, a_sz) == 0);
+       if (!locale)
+               locale = pg_newlocale_from_collation(DEFAULT_COLLATION_OID);
+
+       /* casefold both a and b */
+
+       al_len = pg_strfold(al, al_sz, a, a_sz, locale);
+       if (al_len + 1 > al_sz)
+       {
+               /* grow buffer if needed and retry */
+               al_sz = al_len + 1;
+               al = repalloc(al, al_sz);
+               al_len = pg_strfold(al, al_sz, a, a_sz, locale);
+               Assert(al_len + 1 <= al_sz);
+       }
+
+       bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale);
+       if (bl_len + 1 > bl_sz)
+       {
+               /* grow buffer if needed and retry */
+               bl_sz = bl_len + 1;
+               bl = repalloc(bl, bl_sz);
+               bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale);
+               Assert(bl_len + 1 <= bl_sz);
+       }
+
+       if (al_len > bl_len)
+               res = false;
+       else
+               res = (strncmp(al, bl, al_len) == 0);
 
        pfree(al);
        pfree(bl);
index 953e185f92dfa2e633ad78bab8b7515283806fc5..3a7582565910f1108968b5ccde6d7a6ff9219008 100644 (file)
 /* use for libc locale names */
 #define LOCALE_NAME_BUFLEN 128
 
+/*
+ * Maximum number of bytes needed to map a single codepoint. Useful for
+ * mapping and processing a single input codepoint at a time with a
+ * statically-allocated buffer.
+ *
+ * With full case mapping, an input codepoint may be mapped to as many as
+ * three output codepoints. See Unicode 16.0.0, section 5.18.2, "Change in
+ * Length":
+ *
+ * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G29675
+ */
+#define UNICODE_CASEMAP_LEN            3
+#define UNICODE_CASEMAP_BUFSZ  (UNICODE_CASEMAP_LEN * MAX_MULTIBYTE_CHAR_LEN)
+
 /* GUC settings */
 extern PGDLLIMPORT char *locale_messages;
 extern PGDLLIMPORT char *locale_monetary;