git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
ICU: use UTF8-optimized case conversion API
author: Jeff Davis <jdavis@postgresql.org>
Tue, 6 Jan 2026 22:09:07 +0000 (14:09 -0800)
committer: Jeff Davis <jdavis@postgresql.org>
Tue, 6 Jan 2026 22:09:07 +0000 (14:09 -0800)
Initializes a UCaseMap object once for use across calls, and uses
UTF8-optimized APIs.

Author: Andreas Karlsson <andreas@proxel.se>
Reviewed-by: zengman <zengman@halodbtech.com>
Discussion: https://postgr.es/m/5a010b27-8ed9-4739-86fe-1562b07ba564@proxel.se

src/backend/utils/adt/pg_locale_icu.c
src/common/unicode/case_test.c
src/include/utils/pg_locale.h
src/tools/pgindent/typedefs.list

index de80642f9dce27580f0efbf5880c3cc48779e44c..68491666738e9b6a645912b93cd6c52a80a74ac5 100644 (file)
@@ -52,6 +52,7 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
 #ifdef USE_ICU
 
 extern UCollator *pg_ucol_open(const char *loc_str);
+static UCaseMap *pg_ucasemap_open(const char *loc_str);
 
 static size_t strlower_icu(char *dest, size_t destsize, const char *src,
                                                   ssize_t srclen, pg_locale_t locale);
@@ -61,6 +62,14 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
                                                   ssize_t srclen, pg_locale_t locale);
 static size_t strfold_icu(char *dest, size_t destsize, const char *src,
                                                  ssize_t srclen, pg_locale_t locale);
+static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
+                                                               ssize_t srclen, pg_locale_t locale);
+static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
+                                                               ssize_t srclen, pg_locale_t locale);
+static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
+                                                               ssize_t srclen, pg_locale_t locale);
+static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
+                                                          ssize_t srclen, pg_locale_t locale);
 static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
                                                                 ssize_t srclen, pg_locale_t locale);
 static int     strncoll_icu(const char *arg1, ssize_t len1,
@@ -111,9 +120,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
                                                         const UChar *buff_uchar, int32_t len_uchar);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc,
                                                                                 UErrorCode *status);
-static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
-                                                               UChar **buff_dest, UChar *buff_source,
-                                                               int32_t len_source);
+static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
+                                                               size_t destsize, const char *src,
+                                                               ssize_t srclen, pg_locale_t locale);
 static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
                                                                           const UChar *src, int32_t srcLength,
                                                                           const char *locale,
@@ -122,6 +131,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
                                                                         const UChar *src, int32_t srcLength,
                                                                         const char *locale,
                                                                         UErrorCode *pErrorCode);
+static int32_t foldcase_options(const char *locale);
 
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
@@ -245,6 +255,28 @@ static const struct ctype_methods ctype_methods_icu = {
        .wc_tolower = tolower_icu,
 };
 
+static const struct ctype_methods ctype_methods_icu_utf8 = {
+       .strlower = strlower_icu_utf8,
+       .strtitle = strtitle_icu_utf8,
+       .strupper = strupper_icu_utf8,
+       .strfold = strfold_icu_utf8,
+       /* uses plain ASCII semantics for historical reasons */
+       .downcase_ident = NULL,
+       .wc_isdigit = wc_isdigit_icu,
+       .wc_isalpha = wc_isalpha_icu,
+       .wc_isalnum = wc_isalnum_icu,
+       .wc_isupper = wc_isupper_icu,
+       .wc_islower = wc_islower_icu,
+       .wc_isgraph = wc_isgraph_icu,
+       .wc_isprint = wc_isprint_icu,
+       .wc_ispunct = wc_ispunct_icu,
+       .wc_isspace = wc_isspace_icu,
+       .wc_isxdigit = wc_isxdigit_icu,
+       .wc_iscased = wc_iscased_icu,
+       .wc_toupper = toupper_icu,
+       .wc_tolower = tolower_icu,
+};
+
 /*
  * ICU still depends on libc for compatibility with certain historical
  * behavior for single-byte encodings.  See downcase_ident_icu().
@@ -347,10 +379,16 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
        result->collate_is_c = false;
        result->ctype_is_c = false;
        if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               result->icu.ucasemap = pg_ucasemap_open(iculocstr);
                result->collate = &collate_methods_icu_utf8;
+               result->ctype = &ctype_methods_icu_utf8;
+       }
        else
+       {
                result->collate = &collate_methods_icu;
-       result->ctype = &ctype_methods_icu;
+               result->ctype = &ctype_methods_icu;
+       }
 
        return result;
 #else
@@ -366,19 +404,15 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
 #ifdef USE_ICU
 
 /*
- * Wrapper around ucol_open() to handle API differences for older ICU
- * versions.
+ * Check locale string and fix it if necessary. Returns a new palloc'd string.
  *
- * Ensure that no path leaks a UCollator.
+ * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
+ * root locale. If the first component of the locale is "und", replace with
+ * "root" before opening.
  */
-UCollator *
-pg_ucol_open(const char *loc_str)
+static char *
+fix_icu_locale_str(const char *loc_str)
 {
-       UCollator  *collator;
-       UErrorCode      status;
-       const char *orig_str = loc_str;
-       char       *fixed_str = NULL;
-
        /*
         * Must never open default collator, because it depends on the environment
         * and may change at any time. Should not happen, but check here to catch
@@ -391,16 +425,11 @@ pg_ucol_open(const char *loc_str)
        if (loc_str == NULL)
                elog(ERROR, "opening default collator is not supported");
 
-       /*
-        * In ICU versions 54 and earlier, "und" is not a recognized spelling of
-        * the root locale. If the first component of the locale is "und", replace
-        * with "root" before opening.
-        */
        if (U_ICU_VERSION_MAJOR_NUM < 55)
        {
                char            lang[ULOC_LANG_CAPACITY];
+               UErrorCode      status = U_ZERO_ERROR;
 
-               status = U_ZERO_ERROR;
                uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
                if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
                {
@@ -413,28 +442,47 @@ pg_ucol_open(const char *loc_str)
                if (strcmp(lang, "und") == 0)
                {
                        const char *remainder = loc_str + strlen("und");
+                       char       *fixed_str;
 
                        fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
                        strcpy(fixed_str, "root");
                        strcat(fixed_str, remainder);
 
-                       loc_str = fixed_str;
+                       return fixed_str;
                }
        }
 
+       return pstrdup(loc_str);
+}
+
+/*
+ * Wrapper around ucol_open() to handle API differences for older ICU
+ * versions.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
+UCollator *
+pg_ucol_open(const char *loc_str)
+{
+       UCollator  *collator;
+       UErrorCode      status;
+       char       *fixed_str;
+
+       fixed_str = fix_icu_locale_str(loc_str);
+
        status = U_ZERO_ERROR;
-       collator = ucol_open(loc_str, &status);
+       collator = ucol_open(fixed_str, &status);
        if (U_FAILURE(status))
                ereport(ERROR,
                /* use original string for error report */
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                 errmsg("could not open collator for locale \"%s\": %s",
-                                               orig_str, u_errorName(status))));
+                                               loc_str, u_errorName(status))));
 
        if (U_ICU_VERSION_MAJOR_NUM < 54)
        {
                status = U_ZERO_ERROR;
-               icu_set_collation_attributes(collator, loc_str, &status);
+               icu_set_collation_attributes(collator, fixed_str, &status);
 
                /*
                 * Pretend the error came from ucol_open(), for consistent error
@@ -446,16 +494,43 @@ pg_ucol_open(const char *loc_str)
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("could not open collator for locale \"%s\": %s",
-                                                       orig_str, u_errorName(status))));
+                                                       loc_str, u_errorName(status))));
                }
        }
 
-       if (fixed_str != NULL)
-               pfree(fixed_str);
+       pfree(fixed_str);
 
        return collator;
 }
 
+/*
+ * Wrapper around ucasemap_open() to handle API differences for older ICU
+ * versions.
+ *
+ * Additionally makes sure we get the right options for case folding.
+ */
+static UCaseMap *
+pg_ucasemap_open(const char *loc_str)
+{
+       UErrorCode      status = U_ZERO_ERROR;
+       UCaseMap   *casemap;
+       char       *fixed_str;
+
+       fixed_str = fix_icu_locale_str(loc_str);
+
+       casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status);
+       if (U_FAILURE(status))
+               /* use original string for error report */
+               ereport(ERROR,
+                               errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                               errmsg("could not open casemap for locale \"%s\": %s",
+                                          loc_str, u_errorName(status)));
+
+       pfree(fixed_str);
+
+       return casemap;
+}
+
 /*
  * Create a UCollator with the given locale string and rules.
  *
@@ -528,80 +603,84 @@ static size_t
 strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
                         pg_locale_t locale)
 {
-       int32_t         len_uchar;
-       int32_t         len_conv;
-       UChar      *buff_uchar;
-       UChar      *buff_conv;
-       size_t          result_len;
-
-       len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-       len_conv = icu_convert_case(u_strToLower, locale,
-                                                               &buff_conv, buff_uchar, len_uchar);
-       result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-       pfree(buff_uchar);
-       pfree(buff_conv);
-
-       return result_len;
+       return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
 }
 
 static size_t
 strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
                         pg_locale_t locale)
 {
-       int32_t         len_uchar;
-       int32_t         len_conv;
-       UChar      *buff_uchar;
-       UChar      *buff_conv;
-       size_t          result_len;
-
-       len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-       len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
-                                                               &buff_conv, buff_uchar, len_uchar);
-       result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-       pfree(buff_uchar);
-       pfree(buff_conv);
-
-       return result_len;
+       return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
 }
 
 static size_t
 strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
                         pg_locale_t locale)
 {
-       int32_t         len_uchar;
-       int32_t         len_conv;
-       UChar      *buff_uchar;
-       UChar      *buff_conv;
-       size_t          result_len;
-
-       len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-       len_conv = icu_convert_case(u_strToUpper, locale,
-                                                               &buff_conv, buff_uchar, len_uchar);
-       result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-       pfree(buff_uchar);
-       pfree(buff_conv);
-
-       return result_len;
+       return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
 }
 
 static size_t
 strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
                        pg_locale_t locale)
 {
-       int32_t         len_uchar;
-       int32_t         len_conv;
-       UChar      *buff_uchar;
-       UChar      *buff_conv;
-       size_t          result_len;
+       return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
+}
 
-       len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
-       len_conv = icu_convert_case(u_strFoldCase_default, locale,
-                                                               &buff_conv, buff_uchar, len_uchar);
-       result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
-       pfree(buff_uchar);
-       pfree(buff_conv);
+static size_t
+strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                                 pg_locale_t locale)
+{
+       UErrorCode      status = U_ZERO_ERROR;
+       int32_t         needed;
 
-       return result_len;
+       needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
+       if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+               ereport(ERROR,
+                               errmsg("case conversion failed: %s", u_errorName(status)));
+       return needed;
+}
+
+static size_t
+strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                                 pg_locale_t locale)
+{
+       UErrorCode      status = U_ZERO_ERROR;
+       int32_t         needed;
+
+       needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
+       if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+               ereport(ERROR,
+                               errmsg("case conversion failed: %s", u_errorName(status)));
+       return needed;
+}
+
+static size_t
+strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                                 pg_locale_t locale)
+{
+       UErrorCode      status = U_ZERO_ERROR;
+       int32_t         needed;
+
+       needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
+       if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+               ereport(ERROR,
+                               errmsg("case conversion failed: %s", u_errorName(status)));
+       return needed;
+}
+
+static size_t
+strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                                pg_locale_t locale)
+{
+       UErrorCode      status = U_ZERO_ERROR;
+       int32_t         needed;
+
+       needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
+       if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+               ereport(ERROR,
+                               errmsg("case conversion failed: %s", u_errorName(status)));
+       return needed;
 }
 
 /*
@@ -829,8 +908,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
 }
 
 static int32_t
-icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
-                                UChar **buff_dest, UChar *buff_source, int32_t len_source)
+convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
+                                  UChar **buff_dest, UChar *buff_source, int32_t len_source)
 {
        UErrorCode      status;
        int32_t         len_dest;
@@ -855,6 +934,26 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
        return len_dest;
 }
 
+static int32_t
+icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
+                                const char *src, ssize_t srclen, pg_locale_t locale)
+{
+       int32_t         len_uchar;
+       int32_t         len_conv;
+       UChar      *buff_uchar;
+       UChar      *buff_conv;
+       size_t          result_len;
+
+       len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+       len_conv = convert_case_uchar(func, locale, &buff_conv,
+                                                                 buff_uchar, len_uchar);
+       result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+       pfree(buff_uchar);
+       pfree(buff_conv);
+
+       return result_len;
+}
+
 static int32_t
 u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
                                                const UChar *src, int32_t srcLength,
@@ -870,18 +969,25 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
                                          const UChar *src, int32_t srcLength,
                                          const char *locale,
                                          UErrorCode *pErrorCode)
+{
+       return u_strFoldCase(dest, destCapacity, src, srcLength,
+                                                foldcase_options(locale), pErrorCode);
+}
+
+/*
+ * Return the correct u_strFoldCase() options for the given locale.
+ *
+ * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
+ * folding does not accept a locale. Instead it just supports a single option
+ * relevant to Turkic languages 'az' and 'tr'; check for those languages.
+ */
+static int32_t
+foldcase_options(const char *locale)
 {
        uint32          options = U_FOLD_CASE_DEFAULT;
        char            lang[3];
-       UErrorCode      status;
+       UErrorCode      status = U_ZERO_ERROR;
 
-       /*
-        * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
-        * folding does not accept a locale. Instead it just supports a single
-        * option relevant to Turkic languages 'az' and 'tr'; check for those
-        * languages to enable the option.
-        */
-       status = U_ZERO_ERROR;
        uloc_getLanguage(locale, lang, 3, &status);
        if (U_SUCCESS(status))
        {
@@ -893,8 +999,7 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
                        options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
        }
 
-       return u_strFoldCase(dest, destCapacity, src, srcLength,
-                                                options, pErrorCode);
+       return options;
 }
 
 /*
index fd114b19b2985d60dfb73c09c4708622224d31e8..2144219e178fe09e991a1680f7b886bcb21684db 100644 (file)
@@ -30,7 +30,7 @@
 #define BUFSZ 256
 
 #ifdef USE_ICU
-static UCaseMap * casemap = NULL;
+static UCaseMap *casemap = NULL;
 #endif
 
 typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
index b1ee5fb0ef5daaef7734a0d3b85eb1e84a63afca..465f170ba792109d7b3185a668843c9c3c0c7462 100644 (file)
@@ -21,6 +21,7 @@
 #undef U_SHOW_CPLUSPLUS_HEADER_API
 #define U_SHOW_CPLUSPLUS_HEADER_API 0
 #include <unicode/ucol.h>
+#include <unicode/ucasemap.h>
 #endif
 
 /* use for libc locale names */
@@ -168,6 +169,7 @@ struct pg_locale_struct
                        const char *locale;
                        UCollator  *ucol;
                        locale_t        lt;
+                       UCaseMap   *ucasemap;
                }                       icu;
 #endif
        };
index b9e671fcda85a9c813add5d90ea1fd14a9590882..09e7f1d420ed3229e602e839456aaa216afd0b41 100644 (file)
@@ -3190,6 +3190,7 @@ TypeName
 TzAbbrevCache
 U32
 U8
+UCaseMap
 UChar
 UCharIterator
 UColAttributeValue