From: Jeff Davis Date: Tue, 6 Jan 2026 22:09:07 +0000 (-0800) Subject: ICU: use UTF8-optimized case conversion API X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c4ff35f10441de7dbed4e87737bca205dcca698e;p=thirdparty%2Fpostgresql.git ICU: use UTF8-optimized case conversion API Initializes a UCaseMap object once for use across calls, and uses UTF8-optimized APIs. Author: Andreas Karlsson Reviewed-by: zengman Discussion: https://postgr.es/m/5a010b27-8ed9-4739-86fe-1562b07ba564@proxel.se --- diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index de80642f9dc..68491666738 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -52,6 +52,7 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static UCaseMap *pg_ucasemap_open(const char *loc_str); static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -61,6 +62,14 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); static int strncoll_icu(const char *arg1, ssize_t len1, @@ -111,9 +120,9 @@ static size_t icu_from_uchar(char 
*dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar); static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); -static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, - int32_t len_source); +static int32_t icu_convert_case(ICU_Convert_Func func, char *dest, + size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, @@ -122,6 +131,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); +static int32_t foldcase_options(const char *locale); /* * XXX: many of the functions below rely on casts directly from pg_wchar to @@ -245,6 +255,28 @@ static const struct ctype_methods ctype_methods_icu = { .wc_tolower = tolower_icu, }; +static const struct ctype_methods ctype_methods_icu_utf8 = { + .strlower = strlower_icu_utf8, + .strtitle = strtitle_icu_utf8, + .strupper = strupper_icu_utf8, + .strfold = strfold_icu_utf8, + /* uses plain ASCII semantics for historical reasons */ + .downcase_ident = NULL, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .wc_isxdigit = wc_isxdigit_icu, + .wc_iscased = wc_iscased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; + /* * ICU still depends on libc for compatibility with certain historical * behavior for single-byte encodings. See downcase_ident_icu(). 
@@ -347,10 +379,16 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate_is_c = false; result->ctype_is_c = false; if (GetDatabaseEncoding() == PG_UTF8) + { + result->icu.ucasemap = pg_ucasemap_open(iculocstr); result->collate = &collate_methods_icu_utf8; + result->ctype = &ctype_methods_icu_utf8; + } else + { result->collate = &collate_methods_icu; - result->ctype = &ctype_methods_icu; + result->ctype = &ctype_methods_icu; + } return result; #else @@ -366,19 +404,15 @@ create_pg_locale_icu(Oid collid, MemoryContext context) #ifdef USE_ICU /* - * Wrapper around ucol_open() to handle API differences for older ICU - * versions. + * Check locale string and fix it if necessary. Returns a new palloc'd string. * - * Ensure that no path leaks a UCollator. + * In ICU versions 54 and earlier, "und" is not a recognized spelling of the + * root locale. If the first component of the locale is "und", replace with + * "root" before opening. */ -UCollator * -pg_ucol_open(const char *loc_str) +static char * +fix_icu_locale_str(const char *loc_str) { - UCollator *collator; - UErrorCode status; - const char *orig_str = loc_str; - char *fixed_str = NULL; - /* * Must never open default collator, because it depends on the environment * and may change at any time. Should not happen, but check here to catch @@ -391,16 +425,11 @@ pg_ucol_open(const char *loc_str) if (loc_str == NULL) elog(ERROR, "opening default collator is not supported"); - /* - * In ICU versions 54 and earlier, "und" is not a recognized spelling of - * the root locale. If the first component of the locale is "und", replace - * with "root" before opening. 
- */ if (U_ICU_VERSION_MAJOR_NUM < 55) { char lang[ULOC_LANG_CAPACITY]; + UErrorCode status = U_ZERO_ERROR; - status = U_ZERO_ERROR; uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) { @@ -413,28 +442,47 @@ pg_ucol_open(const char *loc_str) if (strcmp(lang, "und") == 0) { const char *remainder = loc_str + strlen("und"); + char *fixed_str; fixed_str = palloc(strlen("root") + strlen(remainder) + 1); strcpy(fixed_str, "root"); strcat(fixed_str, remainder); - loc_str = fixed_str; + return fixed_str; } } + return pstrdup(loc_str); +} + +/* + * Wrapper around ucol_open() to handle API differences for older ICU + * versions. + * + * Ensure that no path leaks a UCollator. + */ +UCollator * +pg_ucol_open(const char *loc_str) +{ + UCollator *collator; + UErrorCode status; + char *fixed_str; + + fixed_str = fix_icu_locale_str(loc_str); + status = U_ZERO_ERROR; - collator = ucol_open(loc_str, &status); + collator = ucol_open(fixed_str, &status); if (U_FAILURE(status)) ereport(ERROR, /* use original string for error report */ (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); + loc_str, u_errorName(status)))); if (U_ICU_VERSION_MAJOR_NUM < 54) { status = U_ZERO_ERROR; - icu_set_collation_attributes(collator, loc_str, &status); + icu_set_collation_attributes(collator, fixed_str, &status); /* * Pretend the error came from ucol_open(), for consistent error @@ -446,16 +494,43 @@ pg_ucol_open(const char *loc_str) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); + loc_str, u_errorName(status)))); } } - if (fixed_str != NULL) - pfree(fixed_str); + pfree(fixed_str); return collator; } +/* + * Wrapper around ucasemap_open() to handle API differences for older ICU + * versions. 
+ * + * Additionally makes sure we get the right options for case folding. + */ +static UCaseMap * +pg_ucasemap_open(const char *loc_str) +{ + UErrorCode status = U_ZERO_ERROR; + UCaseMap *casemap; + char *fixed_str; + + fixed_str = fix_icu_locale_str(loc_str); + + casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status); + if (U_FAILURE(status)) + /* use original string for error report */ + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open casemap for locale \"%s\": %s", + loc_str, u_errorName(status))); + + pfree(fixed_str); + + return casemap; +} + /* * Create a UCollator with the given locale string and rules. * @@ -528,80 +603,84 @@ static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToLower, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; + return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale); } static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToTitle_default_BI, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; + return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale); } static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - 
int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToUpper, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; + return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale); } static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; + return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale); +} - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strFoldCase_default, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); +static size_t +strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t needed; - return result_len; + needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + errmsg("case conversion failed: %s", u_errorName(status))); + return needed; +} + +static size_t +strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t needed; + + needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + errmsg("case conversion failed: %s", u_errorName(status))); + return needed; +} + +static size_t +strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t 
locale) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t needed; + + needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + errmsg("case conversion failed: %s", u_errorName(status))); + return needed; +} + +static size_t +strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t needed; + + needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + errmsg("case conversion failed: %s", u_errorName(status))); + return needed; } /* @@ -829,8 +908,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len } static int32_t -icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, int32_t len_source) +convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, + UChar **buff_dest, UChar *buff_source, int32_t len_source) { UErrorCode status; int32_t len_dest; @@ -855,6 +934,26 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, return len_dest; } +static int32_t +icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize, + const char *src, ssize_t srclen, pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = convert_case_uchar(func, locale, &buff_conv, + buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, @@ -870,18 +969,25 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity, const UChar *src, 
int32_t srcLength, const char *locale, UErrorCode *pErrorCode) +{ + return u_strFoldCase(dest, destCapacity, src, srcLength, + foldcase_options(locale), pErrorCode); +} + +/* + * Return the correct u_strFoldCase() options for the given locale. + * + * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case + * folding does not accept a locale. Instead it just supports a single option + * relevant to Turkic languages 'az' and 'tr'; check for those languages. + */ +static int32_t +foldcase_options(const char *locale) { uint32 options = U_FOLD_CASE_DEFAULT; char lang[3]; - UErrorCode status; + UErrorCode status = U_ZERO_ERROR; - /* - * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case - * folding does not accept a locale. Instead it just supports a single - * option relevant to Turkic languages 'az' and 'tr'; check for those - * languages to enable the option. - */ - status = U_ZERO_ERROR; uloc_getLanguage(locale, lang, 3, &status); if (U_SUCCESS(status)) { @@ -893,8 +999,7 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity, options = U_FOLD_CASE_EXCLUDE_SPECIAL_I; } - return u_strFoldCase(dest, destCapacity, src, srcLength, - options, pErrorCode); + return options; } /* diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index fd114b19b29..2144219e178 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -30,7 +30,7 @@ #define BUFSZ 256 #ifdef USE_ICU -static UCaseMap * casemap = NULL; +static UCaseMap *casemap = NULL; #endif typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src, diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index b1ee5fb0ef5..465f170ba79 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -21,6 +21,7 @@ #undef U_SHOW_CPLUSPLUS_HEADER_API #define U_SHOW_CPLUSPLUS_HEADER_API 0 #include <unicode/ucol.h> +#include <unicode/ucasemap.h> #endif /* use for libc locale names */ @@ -168,6 +169,7 @@ struct pg_locale_struct 
const char *locale; UCollator *ucol; locale_t lt; + UCaseMap *ucasemap; } icu; #endif }; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b9e671fcda8..09e7f1d420e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3190,6 +3190,7 @@ TypeName TzAbbrevCache U32 U8 +UCaseMap UChar UCharIterator UColAttributeValue