Refactor convert_case() to prepare for optimizations.

author Jeff Davis <jdavis@postgresql.org>

Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)

committer Jeff Davis <jdavis@postgresql.org>

Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
author Jeff Davis <jdavis@postgresql.org>
Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
committer Jeff Davis <jdavis@postgresql.org>
Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c

index 7afff1b172b3ea2d05d1627c306f4c75bc5823d1..ccc485bf98fe243883f2474e0a246e2c841e9808 100644 (file)
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -20,12 +20,20 @@
  #include "common/unicode_category.h"
  #include "mb/pg_wchar.h"
  
+enum CaseMapResult
+{
+       CASEMAP_SELF,           /* no mapping; caller copies the character as-is */
+       CASEMAP_SIMPLE,         /* maps to a single character */
+       CASEMAP_SPECIAL,        /* maps to up to MAX_CASE_EXPANSION characters */
+};
+
  static const pg_case_map *find_case_map(pg_wchar ucs);
  static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                                                    CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
                                                    void *wbstate);
-static bool check_special_conditions(int conditions, const char *str,
-                                                                        size_t len, size_t offset);
+static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+                                                                 const char *src, size_t srclen, size_t srcoff,
+                                                                 pg_wchar *u2, const pg_wchar **special);
  
  pg_wchar
  unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
         {
                 pg_wchar        u1 = utf8_to_unicode((unsigned char *) src + srcoff);
                 int                     u1len = unicode_utf8len(u1);
-               const pg_case_map *casemap = find_case_map(u1);
-               const pg_special_case *special = NULL;
+               pg_wchar        simple = 0;
+               const pg_wchar *special = NULL;
+               enum CaseMapResult casemap_result;
  
                 if (str_casekind == CaseTitle)
                 {
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                                 chr_casekind = CaseLower;
                 }
  
-               /*
-                * Find special case that matches the conditions, if any.
-                *
-                * Note: only a single special mapping per codepoint is currently
-                * supported, though Unicode allows for multiple special mappings for
-                * a single codepoint.
-                */
-               if (full && casemap && casemap->special_case)
-               {
-                       int16           conditions = casemap->special_case->conditions;
-
-                       Assert(casemap->special_case->codepoint == u1);
-                       if (check_special_conditions(conditions, src, srclen, srcoff))
-                               special = casemap->special_case;
-               }
+               casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
+                                                                &simple, &special);
  
-               /* perform mapping, update result_len, and write to dst */
-               if (special)
+               switch (casemap_result)
                 {
-                       for (int i = 0; i < MAX_CASE_EXPANSION; i++)
-                       {
-                               pg_wchar        u2 = special->map[chr_casekind][i];
-                               size_t          u2len = unicode_utf8len(u2);
-
-                               if (u2 == '\0')
-                                       break;
-
-                               if (result_len + u2len <= dstsize)
-                                       unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-                               result_len += u2len;
-                       }
-               }
-               else if (casemap)
-               {
-                       pg_wchar        u2 = casemap->simplemap[chr_casekind];
-                       pg_wchar        u2len = unicode_utf8len(u2);
-
-                       if (result_len + u2len <= dstsize)
-                               unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-                       result_len += u2len;
-               }
-               else
-               {
-                       /* no mapping; copy bytes from src */
-                       if (result_len + u1len <= dstsize)
-                               memcpy(dst + result_len, src + srcoff, u1len);
-
-                       result_len += u1len;
+                       case CASEMAP_SELF:
+                               /* no mapping; copy bytes from src */
+                               Assert(simple == 0);
+                               Assert(special == NULL);
+                               if (result_len + u1len <= dstsize)
+                                       memcpy(dst + result_len, src + srcoff, u1len);
+
+                               result_len += u1len;
+                               break;
+                       case CASEMAP_SIMPLE:
+                               {
+                                       /* replace with single character */
+                                       pg_wchar        u2 = simple;
+                                       pg_wchar        u2len = unicode_utf8len(u2);
+
+                                       Assert(special == NULL);
+                                       if (result_len + u2len <= dstsize)
+                                               unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+                                       result_len += u2len;
+                               }
+                               break;
+                       case CASEMAP_SPECIAL:
+                               /* replace with up to MAX_CASE_EXPANSION characters */
+                               Assert(simple == 0);
+                               for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
+                               {
+                                       pg_wchar        u2 = special[i];
+                                       size_t          u2len = unicode_utf8len(u2);
+
+                                       if (result_len + u2len <= dstsize)
+                                               unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+                                       result_len += u2len;
+                               }
+                               break;
                 }
  
                 srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
         return true;
  }
  
+/*
+ * Unicode allows for special casing to be applied only under certain
+ * circumstances. The only currently-supported condition is Final_Sigma.
+ */
  static bool
  check_special_conditions(int conditions, const char *str, size_t len,
                                                  size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
         return false;
  }
  
+/*
+ * Map the given character to the requested case.
+ *
+ * Code points below 0x80 are looked up in case_map[] directly by index and
+ * always return CASEMAP_SIMPLE with 'simple' set; 'full' is not consulted.
+ *
+ * Otherwise, if full is true and a special case mapping is found whose
+ * conditions are met, 'special' is set to the mapping result (an array of up
+ * to MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned. Failing
+ * that, 'simple' is set from the map entry and CASEMAP_SIMPLE is returned.
+ *
+ * If no map entry is found, return CASEMAP_SELF without touching 'simple' or
+ * 'special'; the caller should copy the character without modification.
+ */
+static enum CaseMapResult
+casemap(pg_wchar u1, CaseKind casekind, bool full,
+               const char *src, size_t srclen, size_t srcoff,
+               pg_wchar *simple, const pg_wchar **special)
+{
+       const pg_case_map *map;
+
+       if (u1 < 0x80)
+       {
+               *simple = case_map[u1].simplemap[casekind];
+
+               return CASEMAP_SIMPLE;
+       }
+
+       map = find_case_map(u1);
+
+       if (map == NULL)
+               return CASEMAP_SELF;
+
+       if (full && map->special_case != NULL &&
+               check_special_conditions(map->special_case->conditions,
+                                                                src, srclen, srcoff))
+       {
+               *special = map->special_case->map[casekind];
+               return CASEMAP_SPECIAL;
+       }
+
+       *simple = map->simplemap[casekind];
+
+       return CASEMAP_SIMPLE;
+}
+
  /* find entry in simple case map, if any */
  static const pg_case_map *
  find_case_map(pg_wchar ucs)
author	Jeff Davis <jdavis@postgresql.org>
	Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
committer	Jeff Davis <jdavis@postgresql.org>
	Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)