Add unicode_strtitle() for Unicode Default Case Conversion.

author Jeff Davis <jdavis@postgresql.org>

Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)

committer Jeff Davis <jdavis@postgresql.org>

Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
author Jeff Davis <jdavis@postgresql.org>
Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
committer Jeff Davis <jdavis@postgresql.org>
Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c

index 79df80704d751e4f0ef31ff56077ffd27789306a..8736ada4be296116862e3aa699e8a1b325ad0be8 100644 (file)
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
         return result;
  }
  
+struct WordBoundaryState
+{
+       const char *str;
+       size_t          len;
+       size_t          offset;
+       bool            init;
+       bool            prev_alnum;
+};
+
+/*
+ * Simple word boundary iterator that draws boundaries each time the result of
+ * pg_u_isalnum() changes.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+       struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+       while (wbstate->offset < wbstate->len &&
+                  wbstate->str[wbstate->offset] != '\0')
+       {
+               pg_wchar        u = utf8_to_unicode((unsigned char *) wbstate->str +
+                                                                               wbstate->offset);
+               bool            curr_alnum = pg_u_isalnum(u, true);
+
+               if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+               {
+                       size_t          prev_offset = wbstate->offset;
+
+                       wbstate->init = true;
+                       wbstate->offset += unicode_utf8len(u);
+                       wbstate->prev_alnum = curr_alnum;
+                       return prev_offset;
+               }
+
+               wbstate->offset += unicode_utf8len(u);
+       }
+
+       return wbstate->len;
+}
+
  /*
   * collation-aware, wide-character-aware initcap function
   *
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
  #endif
                 if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
                 {
-                       const unsigned char *src = (unsigned char *) buff;
+                       const char *src = buff;
                         size_t          srclen = nbytes;
-                       unsigned char *dst;
                         size_t          dstsize;
-                       int                     srcoff = 0;
-                       int                     dstoff = 0;
+                       char       *dst;
+                       size_t          needed;
+                       struct WordBoundaryState wbstate = {
+                               .str = src,
+                               .len = srclen,
+                               .offset = 0,
+                               .init = false,
+                               .prev_alnum = false,
+                       };
  
                         Assert(GetDatabaseEncoding() == PG_UTF8);
  
-                       /* overflow paranoia */
-                       if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                errmsg("out of memory")));
-
-                       /* result is at most srclen codepoints plus terminating NUL */
-                       dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
-                       dst = (unsigned char *) palloc(dstsize);
+                       /* first try buffer of equal size plus terminating NUL */
+                       dstsize = srclen + 1;
+                       dst = palloc(dstsize);
  
-                       while (srcoff < nbytes)
+                       needed = unicode_strtitle(dst, dstsize, src, srclen,
+                                                                         initcap_wbnext, &wbstate);
+                       if (needed + 1 > dstsize)
                         {
-                               pg_wchar        u1 = utf8_to_unicode(src + srcoff);
-                               pg_wchar        u2;
-                               int                     u1len = unicode_utf8len(u1);
-                               int                     u2len;
-
-                               if (wasalnum)
-                                       u2 = unicode_lowercase_simple(u1);
-                               else
-                                       u2 = unicode_uppercase_simple(u1);
+                               /* reset iterator */
+                               wbstate.offset = 0;
+                               wbstate.init = false;
  
-                               u2len = unicode_utf8len(u2);
-
-                               Assert(dstoff + u2len + 1 <= dstsize);
-
-                               wasalnum = pg_u_isalnum(u2, true);
-
-                               unicode_to_utf8(u2, dst + dstoff);
-                               srcoff += u1len;
-                               dstoff += u2len;
+                               /* buffer was too small; grow to the exact size needed and retry */
+                               dstsize = needed + 1;
+                               dst = repalloc(dst, dstsize);
+                               needed = unicode_strtitle(dst, dstsize, src, srclen,
+                                                                                 initcap_wbnext, &wbstate);
+                               Assert(needed + 1 == dstsize);
                         }
  
-                       Assert(dstoff + 1 <= dstsize);
-                       *(dst + dstoff) = '\0';
-                       dstoff++;
-
-                       /* allocate result buffer of the right size and free workspace */
-                       result = palloc(dstoff);
-                       memcpy(result, dst, dstoff);
-                       pfree(dst);
+                       result = dst;
                 }
                 else
                 {
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c

index 5e77490006fc60cdf86549f55431e8d50b53ad1c..bc423b0890c4d7d6a3a9954878139a3da6d2ca87 100644 (file)
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -21,8 +21,9 @@
  #include "mb/pg_wchar.h"
  
  static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-                                                  ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+                                                  CaseKind str_casekind, WordBoundaryNext wbnext,
+                                                  void *wbstate);
  
  pg_wchar
  unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
  size_t
  unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
  {
-       return convert_case(dst, dstsize, src, srclen, CaseLower);
+       return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase, and return the result length (not including
+ * terminating NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state
+ * wbstate. The callback should first return offset 0 for the first boundary;
+ * then the offset of each subsequent word boundary; then the total length of
+ * the string to indicate the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+                                WordBoundaryNext wbnext, void *wbstate)
+{
+       return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+                                               wbstate);
  }
  
  /*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
  size_t
  unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
  {
-       return convert_case(dst, dstsize, src, srclen, CaseUpper);
+       return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
  }
  
  /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
   *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, map characters found on a word boundary to
+ * uppercase and other characters to lowercase.
   */
  static size_t
  convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-                        CaseKind casekind)
+                        CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
  {
+       /* character CaseKind varies while titlecasing */
+       CaseKind        chr_casekind = str_casekind;
         size_t          srcoff = 0;
         size_t          result_len = 0;
+       size_t          boundary = 0;
+
+       Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+                  (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+       if (str_casekind == CaseTitle)
+       {
+               boundary = wbnext(wbstate);
+               Assert(boundary == 0);  /* start of text is always a boundary */
+       }
  
         while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
         {
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                 int                     u1len = unicode_utf8len(u1);
                 const           pg_case_map *casemap = find_case_map(u1);
  
+               if (str_casekind == CaseTitle)
+               {
+                       if (srcoff == boundary)
+                       {
+                               chr_casekind = CaseUpper;
+                               boundary = wbnext(wbstate);
+                       }
+                       else
+                               chr_casekind = CaseLower;
+               }
+
+               /* perform mapping, update result_len, and write to dst */
                 if (casemap)
                 {
-                       pg_wchar        u2 = casemap->simplemap[casekind];
+                       pg_wchar        u2 = casemap->simplemap[chr_casekind];
                         pg_wchar        u2len = unicode_utf8len(u2);
  
                         if (result_len + u2len <= dstsize)
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h

index df36d8db2135b94e8a5aceb875edd352d3fe518a..c0c3382e79ec0cf34928182ca1cff968aa25e029 100644 (file)
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -16,11 +16,16 @@
  
  #include "mb/pg_wchar.h"
  
+typedef size_t (*WordBoundaryNext) (void *wbstate);
+
  pg_wchar       unicode_lowercase_simple(pg_wchar ucs);
  pg_wchar       unicode_titlecase_simple(pg_wchar ucs);
  pg_wchar       unicode_uppercase_simple(pg_wchar ucs);
  size_t         unicode_strlower(char *dst, size_t dstsize, const char *src,
                                                          ssize_t srclen);
+size_t         unicode_strtitle(char *dst, size_t dstsize, const char *src,
+                                                        ssize_t srclen, WordBoundaryNext wbnext,
+                                                        void *wbstate);
  size_t         unicode_strupper(char *dst, size_t dstsize, const char *src,
                                                          ssize_t srclen);
author	Jeff Davis <jdavis@postgresql.org>
	Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
committer	Jeff Davis <jdavis@postgresql.org>
	Sat, 30 Mar 2024 00:35:07 +0000 (17:35 -0700)
src/backend/utils/adt/formatting.c		patch \| blob \| blame \| history
src/common/unicode_case.c		patch \| blob \| blame \| history
src/include/common/unicode_case.h		patch \| blob \| blame \| history