Replace pg_mblen() with bounds-checked versions.

author Thomas Munro <tmunro@postgresql.org>
Wed, 7 Jan 2026 09:14:31 +0000 (22:14 +1300)
committer Thomas Munro <tmunro@postgresql.org>
Sun, 8 Feb 2026 23:34:24 +0000 (12:34 +1300)
diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c

index 2886c08b85e405629524cd9a46a51bf7c42a4f57..9d93b3c775e567215a0abe2baa4aba8b9aae137e 100644 (file)
--- a/contrib/btree_gist/btree_utils_var.c
+++ b/contrib/btree_gist/btree_utils_var.c
@@ -116,36 +116,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo)
  
  /*
   * returns the common prefix length of a node key
+ *
+ * If the underlying type is character data, the returned length may fall in
+ * the middle of a multibyte character.
  */
  static int32
  gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
  {
         GBT_VARKEY_R r = gbt_var_key_readable(node);
         int32           i = 0;
-       int32           l = 0;
+       int32           l_left_to_match = 0;
+       int32           l_total = 0;
         int32           t1len = VARSIZE(r.lower) - VARHDRSZ;
         int32           t2len = VARSIZE(r.upper) - VARHDRSZ;
         int32           ml = Min(t1len, t2len);
         char       *p1 = VARDATA(r.lower);
         char       *p2 = VARDATA(r.upper);
+       const char *end1 = p1 + t1len;
+       const char *end2 = p2 + t2len;
  
         if (ml == 0)
                 return 0;
  
         while (i < ml)
         {
-               if (tinfo->eml > 1 && l == 0)
+               if (tinfo->eml > 1 && l_left_to_match == 0)
                 {
-                       if ((l = pg_mblen(p1)) != pg_mblen(p2))
+                       l_total = pg_mblen_range(p1, end1);
+                       if (l_total != pg_mblen_range(p2, end2))
                         {
                                 return i;
                         }
+                       l_left_to_match = l_total;
                 }
                 if (*p1 != *p2)
                 {
                         if (tinfo->eml > 1)
                         {
-                               return (i - l + 1);
+                               int32           l_matched_subset = l_total - l_left_to_match;
+
+                               /* end common prefix at final byte of last matching char */
+                               return i - l_matched_subset;
                         }
                         else
                         {
@@ -155,7 +166,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
  
                 p1++;
                 p2++;
-               l--;
+               l_left_to_match--;
                 i++;
         }
         return ml;                                      /* lower == upper */
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c

index 584fe44753dbca1e18cad936a92e85d478d9a20c..a4ced84dd8d9ec50c2058f211ec2ad7c1dbb3d36 100644 (file)
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -48,15 +48,15 @@ find_word(char *in, char **end)
         char       *start;
  
         *end = NULL;
-       while (*in && t_isspace(in))
-               in += pg_mblen(in);
+       while (*in && t_isspace_cstr(in))
+               in += pg_mblen_cstr(in);
  
         if (!*in || *in == '#')
                 return NULL;
         start = in;
  
-       while (*in && !t_isspace(in))
-               in += pg_mblen(in);
+       while (*in && !t_isspace_cstr(in))
+               in += pg_mblen_cstr(in);
  
         *end = in;
  
diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c

index 03057f085d1db527ffaef60335e9384f91c5e9bf..0b1e0581e84d04b987910266afe00bc51ef786fe 100644 (file)
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -82,7 +82,7 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped)
                         else if (*(state->ptr) == '=' && !ignoreeq)
                         {
                                 elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-                                        pg_mblen(state->ptr), state->ptr,
+                                        pg_mblen_cstr(state->ptr), state->ptr,
                                          (int32) (state->ptr - state->begin));
                         }
                         else if (*(state->ptr) == '\\')
@@ -223,7 +223,7 @@ parse_hstore(HSParser *state)
                         else if (!scanner_isspace((unsigned char) *(state->ptr)))
                         {
                                 elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-                                        pg_mblen(state->ptr), state->ptr,
+                                        pg_mblen_cstr(state->ptr), state->ptr,
                                          (int32) (state->ptr - state->begin));
                         }
                 }
@@ -240,7 +240,7 @@ parse_hstore(HSParser *state)
                         else
                         {
                                 elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-                                        pg_mblen(state->ptr), state->ptr,
+                                        pg_mblen_cstr(state->ptr), state->ptr,
                                          (int32) (state->ptr - state->begin));
                         }
                 }
@@ -275,7 +275,7 @@ parse_hstore(HSParser *state)
                         else if (!scanner_isspace((unsigned char) *(state->ptr)))
                         {
                                 elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-                                        pg_mblen(state->ptr), state->ptr,
+                                        pg_mblen_cstr(state->ptr), state->ptr,
                                          (int32) (state->ptr - state->begin));
                         }
                 }
diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c

index d89af20f6cfb00ddb98dddc9e01a5dde929f50de..46019a0e83ab72276b3716c7649ccbaa69809b31 100644 (file)
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -26,14 +26,14 @@ getlexeme(char *start, char *end, int *len)
         char       *ptr;
         int                     charlen;
  
-       while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_'))
+       while (start < end && (charlen = pg_mblen_range(start, end)) == 1 && t_iseq(start, '_'))
                 start += charlen;
  
         ptr = start;
         if (ptr >= end)
                 return NULL;
  
-       while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_')))
+       while (ptr < end && !((charlen = pg_mblen_range(ptr, end)) == 1 && t_iseq(ptr, '_')))
                 ptr += charlen;
  
         *len = ptr - start;
diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h

index 4b47ec8a86f16d97b5ec98a7aea9e31f4b39bb5f..a37ee6b2c679a43869c7ec29c87352b4a2814414 100644 (file)
--- a/contrib/ltree/ltree.h
+++ b/contrib/ltree/ltree.h
@@ -126,7 +126,8 @@ typedef struct
  
  #define LQUERY_HASNOT          0x01
  
-#define ISALNUM(x)     ( t_isalpha(x) || t_isdigit(x)  || ( pg_mblen(x) == 1 && t_iseq((x), '_') ) )
+/* Caller has already called mblen, so we can use _unbounded variants safely. */
+#define ISALNUM(x)     ( t_isalpha_unbounded(x) || t_isdigit_unbounded(x) || ( pg_mblen_unbounded(x) == 1 && t_iseq((x), '_') ) )
  
  /* full text query */
  
diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c

index 15115cb29f3c3b5ab5f4f0a0416db66ce5863a49..0a44a8c46915ab38ef8f52fba13f445276856c02 100644 (file)
--- a/contrib/ltree/ltree_io.c
+++ b/contrib/ltree/ltree_io.c
@@ -54,7 +54,7 @@ parse_ltree(const char *buf)
         ptr = buf;
         while (*ptr)
         {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
                 if (t_iseq(ptr, '.'))
                         num++;
                 ptr += charlen;
@@ -69,7 +69,7 @@ parse_ltree(const char *buf)
         ptr = buf;
         while (*ptr)
         {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
  
                 switch (state)
                 {
@@ -285,7 +285,7 @@ parse_lquery(const char *buf)
         ptr = buf;
         while (*ptr)
         {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
  
                 if (t_iseq(ptr, '.'))
                         num++;
@@ -305,7 +305,7 @@ parse_lquery(const char *buf)
         ptr = buf;
         while (*ptr)
         {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
  
                 switch (state)
                 {
@@ -402,7 +402,7 @@ parse_lquery(const char *buf)
                         case LQPRS_WAITFNUM:
                                 if (t_iseq(ptr, ','))
                                         state = LQPRS_WAITSNUM;
-                               else if (t_isdigit(ptr))
+                               else if (t_isdigit_cstr(ptr))
                                 {
                                         int                     low = atoi(ptr);
  
@@ -420,7 +420,7 @@ parse_lquery(const char *buf)
                                         UNCHAR;
                                 break;
                         case LQPRS_WAITSNUM:
-                               if (t_isdigit(ptr))
+                               if (t_isdigit_cstr(ptr))
                                 {
                                         int                     high = atoi(ptr);
  
@@ -451,7 +451,7 @@ parse_lquery(const char *buf)
                         case LQPRS_WAITCLOSE:
                                 if (t_iseq(ptr, '}'))
                                         state = LQPRS_WAITEND;
-                               else if (!t_isdigit(ptr))
+                               else if (!t_isdigit_cstr(ptr))
                                         UNCHAR;
                                 break;
                         case LQPRS_WAITND:
@@ -462,7 +462,7 @@ parse_lquery(const char *buf)
                                 }
                                 else if (t_iseq(ptr, ','))
                                         state = LQPRS_WAITSNUM;
-                               else if (!t_isdigit(ptr))
+                               else if (!t_isdigit_cstr(ptr))
                                         UNCHAR;
                                 break;
                         case LQPRS_WAITEND:
diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c

index 3eca5cb8ff30359a0cdca12e460657849f8725a8..bda2d971021798bd3c115b5a28a851c95fe9f9e3 100644 (file)
--- a/contrib/ltree/ltxtquery_io.c
+++ b/contrib/ltree/ltxtquery_io.c
@@ -59,7 +59,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
  
         for (;;)
         {
-               charlen = pg_mblen(state->buf);
+               charlen = pg_mblen_cstr(state->buf);
  
                 switch (state->state)
                 {
@@ -83,7 +83,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
                                         *lenval = charlen;
                                         *flag = 0;
                                 }
-                               else if (!t_isspace(state->buf))
+                               else if (!t_isspace_unbounded(state->buf))
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_SYNTAX_ERROR),
                                                          errmsg("operand syntax error")));
diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c

index b4991756670ab6141ced9e6b23493b0e43921e7c..94ea38212f147a4cba30cf6716c2493ce0ec7f2c 100644 (file)
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -101,7 +101,7 @@ text_to_bits(char *str, int len)
                         ereport(ERROR,
                                         (errcode(ERRCODE_DATA_CORRUPTED),
                                          errmsg("invalid character \"%.*s\" in t_bits string",
-                                                       pg_mblen(str + off), str + off)));
+                                                       pg_mblen_cstr(str + off), str + off)));
  
                 if (off % 8 == 7)
                         bits[off / 8] = byte;
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h

index 405a1d95528da0a80796810839ffa7176097f827..06d3994e692771014b2dc3cbb9e4b068dcbb6375 100644 (file)
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -52,10 +52,10 @@ typedef char trgm[3];
  } while(0)
  
  #ifdef KEEPONLYALNUM
-#define ISWORDCHR(c)   (t_isalpha(c) || t_isdigit(c))
+#define ISWORDCHR(c, len)      (t_isalpha_with_len(c, len) || t_isdigit_with_len(c, len))
  #define ISPRINTABLECHAR(a)     ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
  #else
-#define ISWORDCHR(c)   (!t_isspace(c))
+#define ISWORDCHR(c, len)      (!t_isspace_with_len(c, len))
  #define ISPRINTABLECHAR(a)     ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
  #endif
  #define ISPRINTABLETRGM(t)     ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c

index e9b7981619fb630e7df636b29f4ea5f3700d47d1..fe7f1ca4412d14a6ab4b5b77009e255ac8e7b16c 100644 (file)
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -173,18 +173,29 @@ static char *
  find_word(char *str, int lenstr, char **endword, int *charlen)
  {
         char       *beginword = str;
+       const char *endstr = str + lenstr;
  
-       while (beginword - str < lenstr && !ISWORDCHR(beginword))
-               beginword += pg_mblen(beginword);
+       while (beginword < endstr)
+       {
+               int                     clen = pg_mblen_range(beginword, endstr);
  
-       if (beginword - str >= lenstr)
+               if (ISWORDCHR(beginword, clen))
+                       break;
+               beginword += clen;
+       }
+
+       if (beginword >= endstr)
                 return NULL;
  
         *endword = beginword;
         *charlen = 0;
-       while (*endword - str < lenstr && ISWORDCHR(*endword))
+       while (*endword < endstr)
         {
-               *endword += pg_mblen(*endword);
+               int                     clen = pg_mblen_range(*endword, endstr);
+
+               if (!ISWORDCHR(*endword, clen))
+                       break;
+               *endword += clen;
                 (*charlen)++;
         }
  
@@ -232,9 +243,9 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
         if (bytelen > charlen)
         {
                 /* Find multibyte character boundaries and apply compact_trigram */
-               int                     lenfirst = pg_mblen(str),
-                                       lenmiddle = pg_mblen(str + lenfirst),
-                                       lenlast = pg_mblen(str + lenfirst + lenmiddle);
+               int                     lenfirst = pg_mblen_unbounded(str),
+                                       lenmiddle = pg_mblen_unbounded(str + lenfirst),
+                                       lenlast = pg_mblen_unbounded(str + lenfirst + lenmiddle);
  
                 while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
                 {
@@ -245,7 +256,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
  
                         lenfirst = lenmiddle;
                         lenmiddle = lenlast;
-                       lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+                       lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
                 }
         }
         else
@@ -725,6 +736,7 @@ get_wildcard_part(const char *str, int lenstr,
  {
         const char *beginword = str;
         const char *endword;
+       const char *endstr = str + lenstr;
         char       *s = buf;
         bool            in_leading_wildcard_meta = false;
         bool            in_trailing_wildcard_meta = false;
@@ -737,11 +749,13 @@ get_wildcard_part(const char *str, int lenstr,
          * from this loop to the next one, since we may exit at a word character
          * that is in_escape.
          */
-       while (beginword - str < lenstr)
+       while (beginword < endstr)
         {
+               clen = pg_mblen_range(beginword, endstr);
+
                 if (in_escape)
                 {
-                       if (ISWORDCHR(beginword))
+                       if (ISWORDCHR(beginword, clen))
                                 break;
                         in_escape = false;
                         in_leading_wildcard_meta = false;
@@ -752,12 +766,12 @@ get_wildcard_part(const char *str, int lenstr,
                                 in_escape = true;
                         else if (ISWILDCARDCHAR(beginword))
                                 in_leading_wildcard_meta = true;
-                       else if (ISWORDCHR(beginword))
+                       else if (ISWORDCHR(beginword, clen))
                                 break;
                         else
                                 in_leading_wildcard_meta = false;
                 }
-               beginword += pg_mblen(beginword);
+               beginword += clen;
         }
  
         /*
@@ -790,12 +804,12 @@ get_wildcard_part(const char *str, int lenstr,
          * string boundary.  Strip escapes during copy.
          */
         endword = beginword;
-       while (endword - str < lenstr)
+       while (endword < endstr)
         {
-               clen = pg_mblen(endword);
+               clen = pg_mblen_range(endword, endstr);
                 if (in_escape)
                 {
-                       if (ISWORDCHR(endword))
+                       if (ISWORDCHR(endword, clen))
                         {
                                 memcpy(s, endword, clen);
                                 (*charlen)++;
@@ -823,7 +837,7 @@ get_wildcard_part(const char *str, int lenstr,
                                 in_trailing_wildcard_meta = true;
                                 break;
                         }
-                       else if (ISWORDCHR(endword))
+                       else if (ISWORDCHR(endword, clen))
                         {
                                 memcpy(s, endword, clen);
                                 (*charlen)++;
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c

index 3fc8a9ec6f0d696e52ae739172f3cb8d2eb85c0a..f675bd064374329a7816da608af30a6900d05fcd 100644 (file)
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -480,7 +480,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
  static void RE_compile(regex_t *regex, text *text_re,
                                            int cflags, Oid collation);
  static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static int     convertPgWchar(pg_wchar c, trgm_mb_char *result);
  static void transformGraph(TrgmNFA *trgmNFA);
  static void processState(TrgmNFA *trgmNFA, TrgmState *state);
  static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -818,10 +818,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
                 for (j = 0; j < charsCount; j++)
                 {
                         trgm_mb_char c;
+                       int                     clen = convertPgWchar(chars[j], &c);
  
-                       if (!convertPgWchar(chars[j], &c))
+                       if (!clen)
                                 continue;               /* ok to ignore it altogether */
-                       if (ISWORDCHR(c.bytes))
+                       if (ISWORDCHR(c.bytes, clen))
                                 colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
                         else
                                 colorInfo->containsNonWord = true;
@@ -833,13 +834,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
  
  /*
   * Convert pg_wchar to multibyte format.
- * Returns false if the character should be ignored completely.
+ * Returns 0 if the character should be ignored completely, else returns its
+ * byte length.
   */
-static bool
+static int
  convertPgWchar(pg_wchar c, trgm_mb_char *result)
  {
         /* "s" has enough space for a multibyte character and a trailing NUL */
         char            s[MAX_MULTIBYTE_CHAR_LEN + 1];
+       int                     clen;
  
         /*
          * We can ignore the NUL character, since it can never appear in a PG text
@@ -847,11 +850,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
          * reconstructing trigrams.
          */
         if (c == 0)
-               return false;
+               return 0;
  
         /* Do the conversion, making sure the result is NUL-terminated */
         memset(s, 0, sizeof(s));
-       pg_wchar2mb_with_len(&c, s, 1);
+       clen = pg_wchar2mb_with_len(&c, s, 1);
  
         /*
          * In IGNORECASE mode, we can ignore uppercase characters.  We assume that
@@ -873,7 +876,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
                 if (strcmp(lowerCased, s) != 0)
                 {
                         pfree(lowerCased);
-                       return false;
+                       return 0;
                 }
                 pfree(lowerCased);
         }
@@ -881,7 +884,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
  
         /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
         memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
-       return true;
+       return clen;
  }
  
  
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c

index 77ecd765282895f5ea4e193244f48daaa456f71e..de20ac1ae8b474cc94209e4a1e7b9095ae568d62 100644 (file)
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -149,9 +149,9 @@ initTrie(const char *filename)
                                 state = 0;
                                 for (ptr = line; *ptr; ptr += ptrlen)
                                 {
-                                       ptrlen = pg_mblen(ptr);
+                                       ptrlen = pg_mblen_cstr(ptr);
                                         /* ignore whitespace, but end src or trg */
-                                       if (t_isspace(ptr))
+                                       if (t_isspace_cstr(ptr))
                                         {
                                                 if (state == 1)
                                                         state = 2;
@@ -315,6 +315,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
         char       *srcchar = (char *) PG_GETARG_POINTER(1);
         int32           len = PG_GETARG_INT32(2);
         char       *srcstart = srcchar;
+       const char *srcend = srcstart + len;
         TSLexeme   *res;
         StringInfoData buf;
  
@@ -342,7 +343,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
                 }
                 else
                 {
-                       matchlen = pg_mblen(srcchar);
+                       matchlen = pg_mblen_range(srcchar, srcend);
                         if (buf.data != NULL)
                                 appendBinaryStringInfo(&buf, srcchar, matchlen);
                 }
diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c

index 5fd8a385a570b27c5b397016dfdf24da1212b15c..1acd28f7b8b849aec49b7520c8a96feeb1aa1bfc 100644 (file)
--- a/src/backend/catalog/pg_proc.c
+++ b/src/backend/catalog/pg_proc.c
@@ -1175,7 +1175,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal,
                         if (cursorpos > 0)
                                 newcp++;
                 }
-               chlen = pg_mblen(prosrc);
+               chlen = pg_mblen_cstr(prosrc);
                 if (strncmp(prosrc, literal, chlen) != 0)
                         goto fail;
                 prosrc += chlen;
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c

index 65e34e91c97b59ebb4a798c99208aa7cf0adde14..b7a8109143b21a4d69554f887870798e97318098 100644 (file)
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -47,8 +47,8 @@ findwrd(char *in, char **end, uint16 *flags)
         char       *lastchar;
  
         /* Skip leading spaces */
-       while (*in && t_isspace(in))
-               in += pg_mblen(in);
+       while (*in && t_isspace_cstr(in))
+               in += pg_mblen_cstr(in);
  
         /* Return NULL on empty lines */
         if (*in == '\0')
@@ -60,10 +60,10 @@ findwrd(char *in, char **end, uint16 *flags)
         lastchar = start = in;
  
         /* Find end of word */
-       while (*in && !t_isspace(in))
+       while (*in && !t_isspace_cstr(in))
         {
                 lastchar = in;
-               in += pg_mblen(in);
+               in += pg_mblen_cstr(in);
         }
  
         if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c

index b8c08bcf7ba500926d86c0a81cc92ba48c71885d..cc6b3829c9032c77d6526131e612d862d7c1f78e 100644 (file)
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -190,8 +190,8 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                 ptr = line;
  
                 /* is it a comment? */
-               while (*ptr && t_isspace(ptr))
-                       ptr += pg_mblen(ptr);
+               while (*ptr && t_isspace_cstr(ptr))
+                       ptr += pg_mblen_cstr(ptr);
  
                 if (t_iseq(ptr, '#') || *ptr == '\0' ||
                         t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
@@ -212,7 +212,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                                                                  errmsg("unexpected delimiter")));
                                         state = TR_WAITSUBS;
                                 }
-                               else if (!t_isspace(ptr))
+                               else if (!t_isspace_cstr(ptr))
                                 {
                                         beginwrd = ptr;
                                         state = TR_INLEX;
@@ -225,7 +225,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
                                         state = TR_WAITSUBS;
                                 }
-                               else if (t_isspace(ptr))
+                               else if (t_isspace_cstr(ptr))
                                 {
                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
                                         state = TR_WAITLEX;
@@ -237,15 +237,15 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                                 {
                                         useasis = true;
                                         state = TR_INSUBS;
-                                       beginwrd = ptr + pg_mblen(ptr);
+                                       beginwrd = ptr + pg_mblen_cstr(ptr);
                                 }
                                 else if (t_iseq(ptr, '\\'))
                                 {
                                         useasis = false;
                                         state = TR_INSUBS;
-                                       beginwrd = ptr + pg_mblen(ptr);
+                                       beginwrd = ptr + pg_mblen_cstr(ptr);
                                 }
-                               else if (!t_isspace(ptr))
+                               else if (!t_isspace_cstr(ptr))
                                 {
                                         useasis = false;
                                         beginwrd = ptr;
@@ -254,7 +254,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                         }
                         else if (state == TR_INSUBS)
                         {
-                               if (t_isspace(ptr))
+                               if (t_isspace_cstr(ptr))
                                 {
                                         if (ptr == beginwrd)
                                                 ereport(ERROR,
@@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                         else
                                 elog(ERROR, "unrecognized thesaurus state: %d", state);
  
-                       ptr += pg_mblen(ptr);
+                       ptr += pg_mblen_cstr(ptr);
                 }
  
                 if (state == TR_INSUBS)
diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c

index 43cab72f472ec28a0d3e1dc485d048d00139ff61..1db03ab0b82608fced0f3f326ffebefd9208ce60 100644 (file)
--- a/src/backend/tsearch/regis.c
+++ b/src/backend/tsearch/regis.c
@@ -37,7 +37,7 @@ RS_isRegis(const char *str)
         {
                 if (state == RS_IN_WAIT)
                 {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                                  /* okay */ ;
                         else if (t_iseq(c, '['))
                                 state = RS_IN_ONEOF;
@@ -48,14 +48,14 @@ RS_isRegis(const char *str)
                 {
                         if (t_iseq(c, '^'))
                                 state = RS_IN_NONEOF;
-                       else if (t_isalpha(c))
+                       else if (t_isalpha_cstr(c))
                                 state = RS_IN_ONEOF_IN;
                         else
                                 return false;
                 }
                 else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
                 {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                                  /* okay */ ;
                         else if (t_iseq(c, ']'))
                                 state = RS_IN_WAIT;
@@ -64,7 +64,7 @@ RS_isRegis(const char *str)
                 }
                 else
                         elog(ERROR, "internal error in RS_isRegis: state %d", state);
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
         }
  
         return (state == RS_IN_WAIT);
@@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str)
         {
                 if (state == RS_IN_WAIT)
                 {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                         {
                                 if (ptr)
                                         ptr = newRegisNode(ptr, len);
                                 else
                                         ptr = r->node = newRegisNode(NULL, len);
-                               COPYCHAR(ptr->data, c);
                                 ptr->type = RSF_ONEOF;
-                               ptr->len = pg_mblen(c);
+                               ptr->len = ts_copychar_cstr(ptr->data, c);
                         }
                         else if (t_iseq(c, '['))
                         {
@@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                                 ptr->type = RSF_NONEOF;
                                 state = RS_IN_NONEOF;
                         }
-                       else if (t_isalpha(c))
+                       else if (t_isalpha_cstr(c))
                         {
-                               COPYCHAR(ptr->data, c);
-                               ptr->len = pg_mblen(c);
+                               ptr->len = ts_copychar_cstr(ptr->data, c);
                                 state = RS_IN_ONEOF_IN;
                         }
                         else                            /* shouldn't get here */
@@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                 }
                 else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
                 {
-                       if (t_isalpha(c))
-                       {
-                               COPYCHAR(ptr->data + ptr->len, c);
-                               ptr->len += pg_mblen(c);
-                       }
+                       if (t_isalpha_cstr(c))
+                               ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c);
                         else if (t_iseq(c, ']'))
                                 state = RS_IN_WAIT;
                         else                            /* shouldn't get here */
@@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                 }
                 else
                         elog(ERROR, "internal error in RS_compile: state %d", state);
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
         }
  
         if (state != RS_IN_WAIT)        /* shouldn't get here */
@@ -187,10 +182,10 @@ mb_strchr(char *str, char *c)
         char       *ptr = str;
         bool            res = false;
  
-       clen = pg_mblen(c);
+       clen = pg_mblen_cstr(c);
         while (*ptr && !res)
         {
-               plen = pg_mblen(ptr);
+               plen = pg_mblen_cstr(ptr);
                 if (plen == clen)
                 {
                         i = plen;
@@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str)
         while (*c)
         {
                 len++;
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
         }
  
         if (len < r->nchar)
@@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str)
         {
                 len -= r->nchar;
                 while (len-- > 0)
-                       c += pg_mblen(c);
+                       c += pg_mblen_cstr(c);
         }
  
  
@@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str)
                                 elog(ERROR, "unrecognized regis node type: %d", ptr->type);
                 }
                 ptr = ptr->next;
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
         }
  
         return true;
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c

index c20247cf2aeb267007f46f6153f9e359fb283864..047bae6fe6ccb6196a2c9991acf34cfa6fcd2812 100644 (file)
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -232,7 +232,7 @@ findchar(char *str, int c)
         {
                 if (t_iseq(str, c))
                         return str;
-               str += pg_mblen(str);
+               str += pg_mblen_cstr(str);
         }
  
         return NULL;
@@ -245,7 +245,7 @@ findchar2(char *str, int c1, int c2)
         {
                 if (t_iseq(str, c1) || t_iseq(str, c2))
                         return str;
-               str += pg_mblen(str);
+               str += pg_mblen_cstr(str);
         }
  
         return NULL;
@@ -352,6 +352,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
         char       *next,
                            *sbuf = *sflagset;
         int                     maxstep;
+       int                     clen;
         bool            stop = false;
         bool            met_comma = false;
  
@@ -363,11 +364,11 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
                 {
                         case FM_LONG:
                         case FM_CHAR:
-                               COPYCHAR(sflag, *sflagset);
-                               sflag += pg_mblen(*sflagset);
+                               clen = ts_copychar_cstr(sflag, *sflagset);
+                               sflag += clen;
  
                                 /* Go to start of the next flag */
-                               *sflagset += pg_mblen(*sflagset);
+                               *sflagset += clen;
  
                                 /* Check if we get all characters of flag */
                                 maxstep--;
@@ -391,7 +392,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
                                 *sflagset = next;
                                 while (**sflagset)
                                 {
-                                       if (t_isdigit(*sflagset))
+                                       if (t_isdigit_cstr(*sflagset))
                                         {
                                                 if (!met_comma)
                                                         ereport(ERROR,
@@ -409,7 +410,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
                                                                                         *sflagset)));
                                                 met_comma = true;
                                         }
-                                       else if (!t_isspace(*sflagset))
+                                       else if (!t_isspace_cstr(*sflagset))
                                         {
                                                 ereport(ERROR,
                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -417,7 +418,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
                                                                                 *sflagset)));
                                         }
  
-                                       *sflagset += pg_mblen(*sflagset);
+                                       *sflagset += pg_mblen_cstr(*sflagset);
                                 }
                                 stop = true;
                                 break;
@@ -543,7 +544,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
                         while (*s)
                         {
                                 /* we allow only single encoded flags for faster works */
-                               if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
+                               if (pg_mblen_cstr(s) == 1 && t_isprint_unbounded(s) && !t_isspace_unbounded(s))
                                         s++;
                                 else
                                 {
@@ -559,12 +560,12 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
                 s = line;
                 while (*s)
                 {
-                       if (t_isspace(s))
+                       if (t_isspace_cstr(s))
                         {
                                 *s = '\0';
                                 break;
                         }
-                       s += pg_mblen(s);
+                       s += pg_mblen_cstr(s);
                 }
                 pstr = lowerstr_ctx(Conf, line);
  
@@ -816,17 +817,17 @@ get_nextfield(char **str, char *next)
  
         while (**str)
         {
+               int                     clen = pg_mblen_cstr(*str);
+
                 if (state == PAE_WAIT_MASK)
                 {
                         if (t_iseq(*str, '#'))
                                 return false;
-                       else if (!t_isspace(*str))
+                       else if (!t_isspace_cstr(*str))
                         {
-                               int                     clen = pg_mblen(*str);
-
                                 if (clen < avail)
                                 {
-                                       COPYCHAR(next, *str);
+                                       ts_copychar_with_len(next, *str, clen);
                                         next += clen;
                                         avail -= clen;
                                 }
@@ -835,24 +836,22 @@ get_nextfield(char **str, char *next)
                 }
                 else                                    /* state == PAE_INMASK */
                 {
-                       if (t_isspace(*str))
+                       if (t_isspace_cstr(*str))
                         {
                                 *next = '\0';
                                 return true;
                         }
                         else
                         {
-                               int                     clen = pg_mblen(*str);
-
                                 if (clen < avail)
                                 {
-                                       COPYCHAR(next, *str);
+                                       ts_copychar_with_len(next, *str, clen);
                                         next += clen;
                                         avail -= clen;
                                 }
                         }
                 }
-               *str += pg_mblen(*str);
+               *str += clen;
         }
  
         *next = '\0';
@@ -942,14 +941,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
  
         while (*str)
         {
+               int                     clen = pg_mblen_cstr(str);
+
                 if (state == PAE_WAIT_MASK)
                 {
                         if (t_iseq(str, '#'))
                                 return false;
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                         {
-                               COPYCHAR(pmask, str);
-                               pmask += pg_mblen(str);
+                               pmask += ts_copychar_with_len(pmask, str, clen);
                                 state = PAE_INMASK;
                         }
                 }
@@ -960,10 +960,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                                 *pmask = '\0';
                                 state = PAE_WAIT_FIND;
                         }
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                         {
-                               COPYCHAR(pmask, str);
-                               pmask += pg_mblen(str);
+                               pmask += ts_copychar_with_len(pmask, str, clen);
                         }
                 }
                 else if (state == PAE_WAIT_FIND)
@@ -972,13 +971,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                         {
                                 state = PAE_INFIND;
                         }
-                       else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+                       else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
                         {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                                 state = PAE_INREPL;
                         }
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error")));
@@ -990,12 +988,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                                 *pfind = '\0';
                                 state = PAE_WAIT_REPL;
                         }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                         {
-                               COPYCHAR(pfind, str);
-                               pfind += pg_mblen(str);
+                               pfind += ts_copychar_with_len(pfind, str, clen);
                         }
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error")));
@@ -1006,13 +1003,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                         {
                                 break;                  /* void repl */
                         }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                         {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                                 state = PAE_INREPL;
                         }
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error")));
@@ -1024,12 +1020,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                                 *prepl = '\0';
                                 break;
                         }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                         {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                         }
-                       else if (!t_isspace(str))
+                       else if (!t_isspace_cstr(str))
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error")));
@@ -1037,7 +1032,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                 else
                         elog(ERROR, "unrecognized state in parse_affentry: %d", state);
  
-               str += pg_mblen(str);
+               str += clen;
         }
  
         *pmask = *pfind = *prepl = '\0';
@@ -1090,10 +1085,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
         CompoundAffixFlag *newValue;
         char            sbuf[BUFSIZ];
         char       *sflag;
-       int                     clen;
  
-       while (*s && t_isspace(s))
-               s += pg_mblen(s);
+       while (*s && t_isspace_cstr(s))
+               s += pg_mblen_cstr(s);
  
         if (!*s)
                 ereport(ERROR,
@@ -1102,10 +1096,10 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
  
         /* Get flag without \n */
         sflag = sbuf;
-       while (*s && !t_isspace(s) && *s != '\n')
+       while (*s && !t_isspace_cstr(s) && *s != '\n')
         {
-               clen = pg_mblen(s);
-               COPYCHAR(sflag, s);
+               int                     clen = ts_copychar_cstr(sflag, s);
+
                 sflag += clen;
                 s += clen;
         }
@@ -1248,7 +1242,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
  
         while ((recoded = tsearch_readline(&trst)) != NULL)
         {
-               if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+               if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#'))
                 {
                         pfree(recoded);
                         continue;
@@ -1285,8 +1279,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
                 {
                         char       *s = recoded + strlen("FLAG");
  
-                       while (*s && t_isspace(s))
-                               s += pg_mblen(s);
+                       while (*s && t_isspace_cstr(s))
+                               s += pg_mblen_cstr(s);
  
                         if (*s)
                         {
@@ -1321,7 +1315,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
         {
                 int                     fields_read;
  
-               if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+               if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#'))
                         goto nextline;
  
                 fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
@@ -1484,12 +1478,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                         s = findchar2(recoded, 'l', 'L');
                         if (s)
                         {
-                               while (*s && !t_isspace(s))
-                                       s += pg_mblen(s);
-                               while (*s && t_isspace(s))
-                                       s += pg_mblen(s);
+                               while (*s && !t_isspace_cstr(s))
+                                       s += pg_mblen_cstr(s);
+                               while (*s && t_isspace_cstr(s))
+                                       s += pg_mblen_cstr(s);
  
-                               if (*s && pg_mblen(s) == 1)
+                               if (*s && pg_mblen_cstr(s) == 1)
                                 {
                                         addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
                                         Conf->usecompound = true;
@@ -1517,8 +1511,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                         s = recoded + 4;        /* we need non-lowercased string */
                         flagflags = 0;
  
-                       while (*s && t_isspace(s))
-                               s += pg_mblen(s);
+                       while (*s && t_isspace_cstr(s))
+                               s += pg_mblen_cstr(s);
  
                         if (*s == '*')
                         {
@@ -1539,14 +1533,13 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                          * be followed by EOL, whitespace, or ':'.  Otherwise this is a
                          * new-format flag command.
                          */
-                       if (*s && pg_mblen(s) == 1)
+                       if (*s && pg_mblen_cstr(s) == 1)
                         {
-                               COPYCHAR(flag, s);
+                               flag[0] = *s++;
                                 flag[1] = '\0';
  
-                               s++;
                                 if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
-                                       t_isspace(s))
+                                       t_isspace_cstr(s))
                                 {
                                         oldformat = true;
                                         goto nextline;
@@ -1770,7 +1763,7 @@ NISortDictionary(IspellDict *Conf)
                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                          errmsg("invalid affix alias \"%s\"",
                                                                         Conf->Spell[i]->p.flag)));
-                               if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
+                               if (*end != '\0' && !t_isdigit_cstr(end) && !t_isspace_cstr(end))
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                          errmsg("invalid affix alias \"%s\"",
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c

index 3a475a0f5fcb738c3e4ff88b80dbf26d70e71614..96e35e04c51f86cb7cdeb1b1093ca10bb5391903 100644 (file)
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -33,66 +33,43 @@ static void tsearch_readline_callback(void *arg);
   */
  #define WC_BUF_LEN  3
  
-int
-t_isdigit(const char *ptr)
-{
-       int                     clen = pg_mblen(ptr);
-       wchar_t         character[WC_BUF_LEN];
-       pg_locale_t mylocale = 0;       /* TODO */
-
-       if (clen == 1 || database_ctype_is_c)
-               return isdigit(TOUCHAR(ptr));
-
-       char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-       return iswdigit((wint_t) character[0]);
-}
-
-int
-t_isspace(const char *ptr)
-{
-       int                     clen = pg_mblen(ptr);
-       wchar_t         character[WC_BUF_LEN];
-       pg_locale_t mylocale = 0;       /* TODO */
-
-       if (clen == 1 || database_ctype_is_c)
-               return isspace(TOUCHAR(ptr));
-
-       char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-       return iswspace((wint_t) character[0]);
-}
-
-int
-t_isalpha(const char *ptr)
-{
-       int                     clen = pg_mblen(ptr);
-       wchar_t         character[WC_BUF_LEN];
-       pg_locale_t mylocale = 0;       /* TODO */
-
-       if (clen == 1 || database_ctype_is_c)
-               return isalpha(TOUCHAR(ptr));
-
-       char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-       return iswalpha((wint_t) character[0]);
-}
-
-int
-t_isprint(const char *ptr)
-{
-       int                     clen = pg_mblen(ptr);
-       wchar_t         character[WC_BUF_LEN];
-       pg_locale_t mylocale = 0;       /* TODO */
-
-       if (clen == 1 || database_ctype_is_c)
-               return isprint(TOUCHAR(ptr));
-
-       char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-       return iswprint((wint_t) character[0]);
+#define GENERATE_T_ISCLASS_DEF(character_class) \
+/* mblen shall be that of the first character */ \
+int \
+t_is##character_class##_with_len(const char *ptr, int mblen) \
+{ \
+       int                     clen = pg_mblen_with_len(ptr, mblen); \
+       wchar_t         character[WC_BUF_LEN]; \
+       pg_locale_t mylocale = 0;       /* TODO */ \
+       if (clen == 1 || database_ctype_is_c) \
+               return is##character_class(TOUCHAR(ptr)); \
+       char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); \
+       return isw##character_class((wint_t) character[0]); \
+} \
+\
+/* ptr shall point to a NUL-terminated string */ \
+int \
+t_is##character_class##_cstr(const char *ptr) \
+{ \
+       return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+} \
+/* ptr shall point to a string with pre-validated encoding */ \
+int \
+t_is##character_class##_unbounded(const char *ptr) \
+{ \
+       return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+} \
+/* historical name for _unbounded */ \
+int \
+t_is##character_class(const char *ptr) \
+{ \
+       return t_is##character_class##_unbounded(ptr); \
  }
  
+GENERATE_T_ISCLASS_DEF(alpha)
+GENERATE_T_ISCLASS_DEF(digit)
+GENERATE_T_ISCLASS_DEF(print)
+GENERATE_T_ISCLASS_DEF(space)
  
  /*
   * Set up to read a file using tsearch_readline().  This facility is
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c

index 7743bdfd592c18e09183d7fbcfce2f5c2f71fbb4..9685e54eb2b6b787ae6959124df84daedac428ec 100644 (file)
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -88,8 +88,8 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
                         char       *pbuf = line;
  
                         /* Trim trailing space */
-                       while (*pbuf && !t_isspace(pbuf))
-                               pbuf += pg_mblen(pbuf);
+                       while (*pbuf && !t_isspace_cstr(pbuf))
+                               pbuf += pg_mblen_cstr(pbuf);
                         *pbuf = '\0';
  
                         /* Skip empty lines */
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c

index 916db5a4746d95d699216bd46cb282813c78ef10..10bd650671962c10f0af7de9cb68c02343c3bdfd 100644 (file)
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1727,7 +1727,8 @@ TParserGet(TParser *prs)
                         prs->state->charlen = 0;
                 else
                         prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
-                               pg_mblen(prs->str + prs->state->posbyte);
+                               pg_mblen_range(prs->str + prs->state->posbyte,
+                                                          prs->str + prs->lenstr);
  
                 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
                 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c

index feb3e830e4fd83011302403d7f453b963daacdfb..10a7ad89a9e8430bacfc007c4d767627b59b049e 100644 (file)
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -172,7 +172,7 @@ hex_encode(const char *src, size_t len, char *dst)
  }
  
  static inline char
-get_hex(const char *cp)
+get_hex(const char *cp, const char *end)
  {
         unsigned char c = (unsigned char) *cp;
         int                     res = -1;
@@ -184,7 +184,7 @@ get_hex(const char *cp)
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                  errmsg("invalid hexadecimal digit: \"%.*s\"",
-                                               pg_mblen(cp), cp)));
+                                               pg_mblen_range(cp, end), cp)));
  
         return (char) res;
  }
@@ -208,14 +208,14 @@ hex_decode(const char *src, size_t len, char *dst)
                         s++;
                         continue;
                 }
-               v1 = get_hex(s) << 4;
+               v1 = get_hex(s, srcend) << 4;
                 s++;
                 if (s >= srcend)
                         ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("invalid hexadecimal data: odd number of digits")));
  
-               v2 = get_hex(s);
+               v2 = get_hex(s, srcend);
                 s++;
                 *p++ = v1 | v2;
         }
@@ -344,7 +344,7 @@ pg_base64_decode(const char *src, size_t len, char *dst)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                  errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
-                                                               pg_mblen(s - 1), s - 1)));
+                                                               pg_mblen_range(s - 1, srcend), s - 1)));
                 }
                 /* add it to buffer */
                 buf = (buf << 6) + b;
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c

index e301cf70f16ded07deba7290ee50e97ab66a40e6..06bd8ca67c7901d92925f8de0896385aa1b5cf43 100644 (file)
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1427,7 +1427,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
                                                          errmsg("invalid datetime format separator: \"%s\"",
-                                                                       pnstrdup(str, pg_mblen(str)))));
+                                                                       pnstrdup(str, pg_mblen_cstr(str)))));
  
                                 if (*str == ' ')
                                         n->type = NODE_TYPE_SPACE;
@@ -1457,7 +1457,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                         /* backslash quotes the next character, if any */
                                         if (*str == '\\' && *(str + 1))
                                                 str++;
-                                       chlen = pg_mblen(str);
+                                       chlen = pg_mblen_cstr(str);
                                         n->type = NODE_TYPE_CHAR;
                                         memcpy(n->character, str, chlen);
                                         n->character[chlen] = '\0';
@@ -1475,7 +1475,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                  */
                                 if (*str == '\\' && *(str + 1) == '"')
                                         str++;
-                               chlen = pg_mblen(str);
+                               chlen = pg_mblen_cstr(str);
  
                                 if ((flags & DCH_FLAG) && is_separator_char(str))
                                         n->type = NODE_TYPE_SEPARATOR;
@@ -2180,8 +2180,8 @@ asc_toupper_z(const char *buff)
         do { \
                 if (S_THth(_suf)) \
                 { \
-                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
-                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
+                       if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
+                       if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
                 } \
         } while (0)
  
@@ -3396,7 +3396,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                                  * insist that the consumed character match the format's
                                  * character.
                                  */
-                               s += pg_mblen(s);
+                               s += pg_mblen_cstr(s);
                         }
                         continue;
                 }
@@ -3418,11 +3418,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                                 if (extra_skip > 0)
                                         extra_skip--;
                                 else
-                                       s += pg_mblen(s);
+                                       s += pg_mblen_cstr(s);
                         }
                         else
                         {
-                               int                     chlen = pg_mblen(s);
+                               int                     chlen = pg_mblen_cstr(s);
  
                                 /*
                                  * Standard mode requires strict match of format characters.
@@ -5610,13 +5610,15 @@ NUM_numpart_to_char(NUMProc *Np, int id)
  static void
  NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len)
  {
+       const char *end = Np->inout + input_len;
+
         while (n-- > 0)
         {
                 if (OVERLOAD_TEST)
                         break;                          /* end of input */
                 if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
                         break;                          /* it's a data character */
-               Np->inout_p += pg_mblen(Np->inout_p);
+               Np->inout_p += pg_mblen_range(Np->inout_p, end);
         }
  }
  
@@ -6073,7 +6075,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                         }
                         else
                         {
-                               Np->inout_p += pg_mblen(Np->inout_p);
+                               Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len);
                         }
                         continue;
                 }
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c

index d4736250bb61864d618be2385448d0bb49b281eb..6649e7b7febef986d2055ac166e8f8d2a6d900ff 100644 (file)
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -663,7 +663,7 @@ report_json_context(JsonLexContext *lex)
         {
                 /* Advance to next multibyte character */
                 if (IS_HIGHBIT_SET(*context_start))
-                       context_start += pg_mblen(context_start);
+                       context_start += pg_mblen_range(context_start, context_end);
                 else
                         context_start++;
         }
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y

index 91e4308d6559f7b68a1f4e41c596b85a7f12e85c..b79165fc608ea358d9bf0470a63d7906198edafa 100644 (file)
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -528,7 +528,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern,
                                                 (errcode(ERRCODE_SYNTAX_ERROR),
                                                  errmsg("invalid input syntax for type %s", "jsonpath"),
                                                  errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.",
-                                                                  pg_mblen(flags->val + i), flags->val + i)));
+                                                                  pg_mblen_range(flags->val + i, flags->val + flags->len),
+                                                                  flags->val + i)));
                                 break;
                 }
         }
diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c

index 3026cc2431117706592e627cb62670696dd18417..c6445cdcbf7a123f221732ad69a6d0f7b48858d6 100644 (file)
--- a/src/backend/utils/adt/levenshtein.c
+++ b/src/backend/utils/adt/levenshtein.c
@@ -84,6 +84,8 @@ varstr_levenshtein(const char *source, int slen,
         int                     i,
                                 j;
         const char *y;
+       const char *send = source + slen;
+       const char *tend = target + tlen;
  
         /*
          * For varstr_levenshtein_less_equal, we have real variables called
@@ -184,10 +186,10 @@ varstr_levenshtein(const char *source, int slen,
  #endif
  
         /*
-        * In order to avoid calling pg_mblen() repeatedly on each character in s,
-        * we cache all the lengths before starting the main loop -- but if all
-        * the characters in both strings are single byte, then we skip this and
-        * use a fast-path in the main loop.  If only one string contains
+        * In order to avoid calling pg_mblen_range() repeatedly on each character
+        * in s, we cache all the lengths before starting the main loop -- but if
+        * all the characters in both strings are single byte, then we skip this
+        * and use a fast-path in the main loop.  If only one string contains
          * multi-byte characters, we still build the array, so that the fast-path
          * needn't deal with the case where the array hasn't been initialized.
          */
@@ -199,7 +201,7 @@ varstr_levenshtein(const char *source, int slen,
                 s_char_len = (int *) palloc((m + 1) * sizeof(int));
                 for (i = 0; i < m; ++i)
                 {
-                       s_char_len[i] = pg_mblen(cp);
+                       s_char_len[i] = pg_mblen_range(cp, send);
                         cp += s_char_len[i];
                 }
                 s_char_len[i] = 0;
@@ -225,7 +227,7 @@ varstr_levenshtein(const char *source, int slen,
         {
                 int                *temp;
                 const char *x = source;
-               int                     y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
+               int                     y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
  
  #ifdef LEVENSHTEIN_LESS_EQUAL
  
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c

index e02fc3725ad8c17a73c0be7ce39a36b72b4b5e31..b37bc0ceb55fac62368ece6627508cf241ff3e55 100644 (file)
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -54,20 +54,20 @@ static int  Generic_Text_IC_like(text *str, text *pat, Oid collation);
   *--------------------
   */
  static inline int
-wchareq(const char *p1, const char *p2)
+wchareq(const char *p1, int p1len, const char *p2, int p2len)
  {
-       int                     p1_len;
+       int                     p1clen;
  
         /* Optimization:  quickly compare the first byte. */
         if (*p1 != *p2)
                 return 0;
  
-       p1_len = pg_mblen(p1);
-       if (pg_mblen(p2) != p1_len)
+       p1clen = pg_mblen_with_len(p1, p1len);
+       if (pg_mblen_with_len(p2, p2len) != p1clen)
                 return 0;
  
         /* They are the same length */
-       while (p1_len--)
+       while (p1clen--)
         {
                 if (*p1++ != *p2++)
                         return 0;
@@ -106,11 +106,11 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c)
  #define NextByte(p, plen)      ((p)++, (plen)--)
  
  /* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq((p1), (p2))
+#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len))
  #define NextChar(p, plen) \
-       do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+       do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
-       do { int __l = pg_mblen(src); \
+       do { int __l = pg_mblen_with_len((src), (srclen)); \
                  (srclen) -= __l; \
                  while (__l-- > 0) \
                          *(dst)++ = *(src)++; \
@@ -122,7 +122,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c)
  #include "like_match.c"
  
  /* Set up to compile like_match.c for single-byte characters */
-#define CHAREQ(p1, p2) (*(p1) == *(p2))
+#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2))
  #define NextChar(p, plen) NextByte((p), (plen))
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c

index e876560991332e00700700b0fbd8bf4255d22707..e2e8c28ccfba78831ab951e25d2e46bce8aadb6b 100644 (file)
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -294,6 +294,7 @@ do_like_escape(text *pat, text *esc)
                                          errhint("Escape string must be empty or one character.")));
  
                 e = VARDATA_ANY(esc);
+               elen = VARSIZE_ANY_EXHDR(esc);
  
                 /*
                  * If specified escape is '\', just copy the pattern as-is.
@@ -312,7 +313,7 @@ do_like_escape(text *pat, text *esc)
                 afterescape = false;
                 while (plen > 0)
                 {
-                       if (CHAREQ(p, e) && !afterescape)
+                       if (CHAREQ(p, plen, e, elen) && !afterescape)
                         {
                                 *r++ = '\\';
                                 NextChar(p, plen);
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c

index 6a5ce1cc1024155faeec570bde2897a82fb1a759..102500192c08921d94aba1d25fc62f016e5cc44f 100644 (file)
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -152,8 +152,8 @@ lpad(PG_FUNCTION_ARGS)
         char       *ptr1,
                            *ptr2,
                            *ptr2start,
-                          *ptr2end,
                            *ptr_ret;
+       const char *ptr2end;
         int                     m,
                                 s1len,
                                 s2len;
@@ -198,7 +198,7 @@ lpad(PG_FUNCTION_ARGS)
  
         while (m--)
         {
-               int                     mlen = pg_mblen(ptr2);
+               int                     mlen = pg_mblen_range(ptr2, ptr2end);
  
                 memcpy(ptr_ret, ptr2, mlen);
                 ptr_ret += mlen;
@@ -211,7 +211,7 @@ lpad(PG_FUNCTION_ARGS)
  
         while (s1len--)
         {
-               int                     mlen = pg_mblen(ptr1);
+               int                     mlen = pg_mblen_unbounded(ptr1);
  
                 memcpy(ptr_ret, ptr1, mlen);
                 ptr_ret += mlen;
@@ -250,8 +250,8 @@ rpad(PG_FUNCTION_ARGS)
         char       *ptr1,
                            *ptr2,
                            *ptr2start,
-                          *ptr2end,
                            *ptr_ret;
+       const char *ptr2end;
         int                     m,
                                 s1len,
                                 s2len;
@@ -291,11 +291,12 @@ rpad(PG_FUNCTION_ARGS)
         m = len - s1len;
  
         ptr1 = VARDATA_ANY(string1);
+
         ptr_ret = VARDATA(ret);
  
         while (s1len--)
         {
-               int                     mlen = pg_mblen(ptr1);
+               int                     mlen = pg_mblen_unbounded(ptr1);
  
                 memcpy(ptr_ret, ptr1, mlen);
                 ptr_ret += mlen;
@@ -307,7 +308,7 @@ rpad(PG_FUNCTION_ARGS)
  
         while (m--)
         {
-               int                     mlen = pg_mblen(ptr2);
+               int                     mlen = pg_mblen_range(ptr2, ptr2end);
  
                 memcpy(ptr_ret, ptr2, mlen);
                 ptr_ret += mlen;
@@ -392,6 +393,7 @@ dotrim(const char *string, int stringlen,
                          */
                         const char **stringchars;
                         const char **setchars;
+                       const char *setend;
                         int                *stringmblen;
                         int                *setmblen;
                         int                     stringnchars;
@@ -399,6 +401,7 @@ dotrim(const char *string, int stringlen,
                         int                     resultndx;
                         int                     resultnchars;
                         const char *p;
+                       const char *pend;
                         int                     len;
                         int                     mblen;
                         const char *str_pos;
@@ -409,10 +412,11 @@ dotrim(const char *string, int stringlen,
                         stringnchars = 0;
                         p = string;
                         len = stringlen;
+                       pend = p + len;
                         while (len > 0)
                         {
                                 stringchars[stringnchars] = p;
-                               stringmblen[stringnchars] = mblen = pg_mblen(p);
+                               stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend);
                                 stringnchars++;
                                 p += mblen;
                                 len -= mblen;
@@ -423,10 +427,11 @@ dotrim(const char *string, int stringlen,
                         setnchars = 0;
                         p = set;
                         len = setlen;
+                       setend = set + setlen;
                         while (len > 0)
                         {
                                 setchars[setnchars] = p;
-                               setmblen[setnchars] = mblen = pg_mblen(p);
+                               setmblen[setnchars] = mblen = pg_mblen_range(p, setend);
                                 setnchars++;
                                 p += mblen;
                                 len -= mblen;
@@ -804,6 +809,8 @@ translate(PG_FUNCTION_ARGS)
                            *to_end;
         char       *source,
                            *target;
+       const char *source_end;
+       const char *from_end;
         int                     m,
                                 fromlen,
                                 tolen,
@@ -818,9 +825,11 @@ translate(PG_FUNCTION_ARGS)
         if (m <= 0)
                 PG_RETURN_TEXT_P(string);
         source = VARDATA_ANY(string);
+       source_end = source + m;
  
         fromlen = VARSIZE_ANY_EXHDR(from);
         from_ptr = VARDATA_ANY(from);
+       from_end = from_ptr + fromlen;
         tolen = VARSIZE_ANY_EXHDR(to);
         to_ptr = VARDATA_ANY(to);
         to_end = to_ptr + tolen;
@@ -844,12 +853,12 @@ translate(PG_FUNCTION_ARGS)
  
         while (m > 0)
         {
-               source_len = pg_mblen(source);
+               source_len = pg_mblen_range(source, source_end);
                 from_index = 0;
  
                 for (i = 0; i < fromlen; i += len)
                 {
-                       len = pg_mblen(&from_ptr[i]);
+                       len = pg_mblen_range(&from_ptr[i], from_end);
                         if (len == source_len &&
                                 memcmp(source, &from_ptr[i], len) == 0)
                                 break;
@@ -865,11 +874,11 @@ translate(PG_FUNCTION_ARGS)
                         {
                                 if (p >= to_end)
                                         break;
-                               p += pg_mblen(p);
+                               p += pg_mblen_range(p, to_end);
                         }
                         if (p < to_end)
                         {
-                               len = pg_mblen(p);
+                               len = pg_mblen_range(p, to_end);
                                 memcpy(target, p, len);
                                 target += len;
                                 retlen += len;
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 35aea25ebb8c3b6aa2a806bb8a537c0e8b411df2..eeba9b6c15f57cb36a460f9ce65063c2fed789c1 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -429,7 +429,7 @@ parse_re_flags(pg_re_flags *flags, text *opts)
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                          errmsg("invalid regular expression option: \"%.*s\"",
-                                                                       pg_mblen(opt_p + i), opt_p + i)));
+                                                                       pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
                                         break;
                         }
                 }
@@ -659,12 +659,13 @@ textregexreplace(PG_FUNCTION_ARGS)
         if (VARSIZE_ANY_EXHDR(opt) > 0)
         {
                 char       *opt_p = VARDATA_ANY(opt);
+               const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt);
  
                 if (*opt_p >= '0' && *opt_p <= '9')
                         ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("invalid regular expression option: \"%.*s\"",
-                                                       pg_mblen(opt_p), opt_p),
+                                                       pg_mblen_range(opt_p, end_p), opt_p),
                                          errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
         }
  
@@ -758,6 +759,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
                            *r;
         int                     plen,
                                 elen;
+       const char *pend;
         bool            afterescape = false;
         int                     nquotes = 0;
         int                     bracket_depth = 0;      /* square bracket nesting level */
@@ -765,6 +767,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
  
         p = VARDATA_ANY(pat_text);
         plen = VARSIZE_ANY_EXHDR(pat_text);
+       pend = p + plen;
         if (esc_text == NULL)
         {
                 /* No ESCAPE clause provided; default to backslash as escape */
@@ -864,7 +867,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
  
                 if (elen > 1)
                 {
-                       int                     mblen = pg_mblen(p);
+                       int                     mblen = pg_mblen_range(p, pend);
  
                         if (mblen > 1)
                         {
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c

index 1458b12f316107db58001c4d9f971ff13eefdd29..ee86122bf802429da30d68824c0cbe43c357be82 100644 (file)
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -109,7 +109,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
                 return buf;
  
         buf++;
-       while (*buf && pg_mblen(buf) == 1)
+       while (*buf && pg_mblen_cstr(buf) == 1)
         {
                 switch (*buf)
                 {
@@ -186,7 +186,7 @@ parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
                                         continue;
                                 }
  
-                               if (!t_isdigit(ptr))
+                               if (!t_isdigit_cstr(ptr))
                                         return false;
  
                                 errno = 0;
@@ -248,12 +248,12 @@ parse_or_operator(TSQueryParserState pstate)
                 return false;
  
         /* it shouldn't be a part of any word */
-       if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
+       if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha_cstr(ptr) || t_isdigit_cstr(ptr))
                 return false;
  
         for (;;)
         {
-               ptr += pg_mblen(ptr);
+               ptr += pg_mblen_cstr(ptr);
  
                 if (*ptr == '\0')               /* got end of string without operand */
                         return false;
@@ -263,7 +263,7 @@ parse_or_operator(TSQueryParserState pstate)
                  * So we still treat OR literal as operation with possibly incorrect
                  * operand and will not search it as lexeme
                  */
-               if (!t_isspace(ptr))
+               if (!t_isspace_cstr(ptr))
                         break;
         }
  
@@ -306,7 +306,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
                                                          errmsg("syntax error in tsquery: \"%s\"",
                                                                         state->buffer)));
                                 }
-                               else if (!t_isspace(state->buf))
+                               else if (!t_isspace_cstr(state->buf))
                                 {
                                         /*
                                          * We rely on the tsvector parser to parse the value for
@@ -364,14 +364,14 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
                                 {
                                         return (state->count) ? PT_ERR : PT_END;
                                 }
-                               else if (!t_isspace(state->buf))
+                               else if (!t_isspace_cstr(state->buf))
                                 {
                                         return PT_ERR;
                                 }
                                 break;
                 }
  
-               state->buf += pg_mblen(state->buf);
+               state->buf += pg_mblen_cstr(state->buf);
         }
  }
  
@@ -425,7 +425,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                                         state->state = WAITOPERAND;
                                         continue;
                                 }
-                               else if (!t_isspace(state->buf))
+                               else if (!t_isspace_cstr(state->buf))
                                 {
                                         /*
                                          * We rely on the tsvector parser to parse the value for
@@ -468,7 +468,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                                         state->buf++;
                                         continue;
                                 }
-                               else if (!t_isspace(state->buf))
+                               else if (!t_isspace_cstr(state->buf))
                                 {
                                         /* insert implicit AND between operands */
                                         state->state = WAITOPERAND;
@@ -478,7 +478,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                                 break;
                 }
  
-               state->buf += pg_mblen(state->buf);
+               state->buf += pg_mblen_cstr(state->buf);
         }
  }
  
@@ -961,9 +961,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp)
                                 *(in->cur) = '\\';
                                 in->cur++;
                         }
-                       COPYCHAR(in->cur, op);
  
-                       clen = pg_mblen(op);
+                       clen = ts_copychar_cstr(in->cur, op);
                         op += clen;
                         in->cur += clen;
                 }
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c

index 4c489d72fe24861951f0c5053cfdbfff8f8afdba..1b28911c43d83a500421aff314e6b61564d3fcdb 100644 (file)
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -313,9 +313,9 @@ tsvectorout(PG_FUNCTION_ARGS)
                                 lenbuf = 0,
                                 pp;
         WordEntry  *ptr = ARRPTR(out);
-       char       *curbegin,
-                          *curin,
+       char       *curin,
                            *curout;
+       const char *curend;
  
         lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
         for (i = 0; i < out->size; i++)
@@ -328,13 +328,14 @@ tsvectorout(PG_FUNCTION_ARGS)
         curout = outbuf = (char *) palloc(lenbuf);
         for (i = 0; i < out->size; i++)
         {
-               curbegin = curin = STRPTR(out) + ptr->pos;
+               curin = STRPTR(out) + ptr->pos;
+               curend = curin + ptr->len;
                 if (i != 0)
                         *curout++ = ' ';
                 *curout++ = '\'';
-               while (curin - curbegin < ptr->len)
+               while (curin < curend)
                 {
-                       int                     len = pg_mblen(curin);
+                       int                     len = pg_mblen_range(curin, curend);
  
                         if (t_iseq(curin, '\''))
                                 *curout++ = '\'';
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c

index 2ccd3bdbb0ea384ffb4cc38c6a51521c99490a19..6343db87f5794fd9419e4054af528af20daf8c5e 100644 (file)
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -2439,11 +2439,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
         if (ws)
         {
                 char       *buf;
+               const char *end;
  
                 buf = VARDATA_ANY(ws);
-               while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+               end = buf + VARSIZE_ANY_EXHDR(ws);
+               while (buf < end)
                 {
-                       if (pg_mblen(buf) == 1)
+                       int                     len = pg_mblen_range(buf, end);
+
+                       if (len == 1)
                         {
                                 switch (*buf)
                                 {
@@ -2467,7 +2471,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
                                                 stat->weight |= 0;
                                 }
                         }
-                       buf += pg_mblen(buf);
+                       buf += len;
                 }
         }
  
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c

index e2460d393ab112a35269451b702a536ee61992b7..f7152029b0e8743b5847d9041d24f713ef1da8d5 100644 (file)
--- a/src/backend/utils/adt/tsvector_parser.c
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -185,10 +185,9 @@ gettoken_tsvector(TSVectorParseState state,
                         else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
                                          (state->is_web && t_iseq(state->prsbuf, '"')))
                                 PRSSYNTAXERROR;
-                       else if (!t_isspace(state->prsbuf))
+                       else if (!t_isspace_cstr(state->prsbuf))
                         {
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                 statecode = WAITENDWORD;
                         }
                 }
@@ -202,8 +201,7 @@ gettoken_tsvector(TSVectorParseState state,
                         else
                         {
                                 RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                 Assert(oldstate != 0);
                                 statecode = oldstate;
                         }
@@ -215,7 +213,7 @@ gettoken_tsvector(TSVectorParseState state,
                                 statecode = WAITNEXTCHAR;
                                 oldstate = WAITENDWORD;
                         }
-                       else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+                       else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0' ||
                                          (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
                                          (state->is_web && t_iseq(state->prsbuf, '"')))
                         {
@@ -238,8 +236,7 @@ gettoken_tsvector(TSVectorParseState state,
                         else
                         {
                                 RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                         }
                 }
                 else if (statecode == WAITENDCMPLX)
@@ -258,8 +255,7 @@ gettoken_tsvector(TSVectorParseState state,
                         else
                         {
                                 RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                         }
                 }
                 else if (statecode == WAITCHARCMPLX)
@@ -267,8 +263,7 @@ gettoken_tsvector(TSVectorParseState state,
                         if (!state->is_web && t_iseq(state->prsbuf, '\''))
                         {
                                 RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                 statecode = WAITENDCMPLX;
                         }
                         else
@@ -279,7 +274,7 @@ gettoken_tsvector(TSVectorParseState state,
                                         PRSSYNTAXERROR;
                                 if (state->oprisdelim)
                                 {
-                                       /* state->prsbuf+=pg_mblen(state->prsbuf); */
+                                       /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
                                         RETURN_TOKEN;
                                 }
                                 else
@@ -296,7 +291,7 @@ gettoken_tsvector(TSVectorParseState state,
                 }
                 else if (statecode == INPOSINFO)
                 {
-                       if (t_isdigit(state->prsbuf))
+                       if (t_isdigit_cstr(state->prsbuf))
                         {
                                 if (posalen == 0)
                                 {
@@ -351,10 +346,10 @@ gettoken_tsvector(TSVectorParseState state,
                                         PRSSYNTAXERROR;
                                 WEP_SETWEIGHT(pos[npos - 1], 0);
                         }
-                       else if (t_isspace(state->prsbuf) ||
+                       else if (t_isspace_cstr(state->prsbuf) ||
                                          *(state->prsbuf) == '\0')
                                 RETURN_TOKEN;
-                       else if (!t_isdigit(state->prsbuf))
+                       else if (!t_isdigit_cstr(state->prsbuf))
                                 PRSSYNTAXERROR;
                 }
                 else                                    /* internal error */
@@ -362,6 +357,6 @@ gettoken_tsvector(TSVectorParseState state,
                                  statecode);
  
                 /* get next char */
-               state->prsbuf += pg_mblen(state->prsbuf);
+               state->prsbuf += pg_mblen_cstr(state->prsbuf);
         }
  }
diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c

index 73e41e0808fb52549c6e1af77560c60c88a44ec0..532b6dcd097b79ad17ef77bf4ad40feecf218233 100644 (file)
--- a/src/backend/utils/adt/varbit.c
+++ b/src/backend/utils/adt/varbit.c
@@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                  errmsg("\"%.*s\" is not a valid binary digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
  
                         x >>= 1;
                         if (x == 0)
@@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                  errmsg("\"%.*s\" is not a valid hexadecimal digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
  
                         if (bc)
                         {
@@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                  errmsg("\"%.*s\" is not a valid binary digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
  
                         x >>= 1;
                         if (x == 0)
@@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                  errmsg("\"%.*s\" is not a valid hexadecimal digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
  
                         if (bc)
                         {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 3732b79c21e303d0298d4e98f58faa6277268344..776ca094e60b561d3a245240a947773fda18897d 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -778,8 +778,11 @@ text_catenate(text *t1, text *t2)
   * charlen_to_bytelen()
   *     Compute the number of bytes occupied by n characters starting at *p
   *
- * It is caller's responsibility that there actually are n characters;
- * the string need not be null-terminated.
+ * The caller shall ensure there are n complete characters.  Callers achieve
+ * this by deriving "n" from regmatch_t findings from searching a wchar array.
+ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
+ * matches will end no later than the last complete character.  (The string
+ * need not be null-terminated.)
   */
  static int
  charlen_to_bytelen(const char *p, int n)
@@ -794,7 +797,7 @@ charlen_to_bytelen(const char *p, int n)
                 const char *s;
  
                 for (s = p; n > 0; n--)
-                       s += pg_mblen(s);
+                       s += pg_mblen_unbounded(s); /* caller verified encoding */
  
                 return s - p;
         }
@@ -927,6 +930,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                 int32           slice_start;
                 int32           slice_size;
                 int32           slice_strlen;
+               int32           slice_len;
                 text       *slice;
                 int32           E1;
                 int32           i;
@@ -996,7 +1000,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                         slice = (text *) DatumGetPointer(str);
  
                 /* see if we got back an empty string */
-               if (VARSIZE_ANY_EXHDR(slice) == 0)
+               slice_len = VARSIZE_ANY_EXHDR(slice);
+               if (slice_len == 0)
                 {
                         if (slice != (text *) DatumGetPointer(str))
                                 pfree(slice);
@@ -1005,7 +1010,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
  
                 /* Now we can get the actual length of the slice in MB characters */
                 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
-                                                                                       VARSIZE_ANY_EXHDR(slice));
+                                                                                       slice_len);
  
                 /*
                  * Check that the start position wasn't > slice_strlen. If so, SQL99
@@ -1032,7 +1037,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                  */
                 p = VARDATA_ANY(slice);
                 for (i = 0; i < S1 - 1; i++)
-                       p += pg_mblen(p);
+                       p += pg_mblen_unbounded(p);
  
                 /* hang onto a pointer to our start position */
                 s = p;
@@ -1042,7 +1047,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                  * length.
                  */
                 for (i = S1; i < E1; i++)
-                       p += pg_mblen(p);
+                       p += pg_mblen_unbounded(p);
  
                 ret = (text *) palloc(VARHDRSZ + (p - s));
                 SET_VARSIZE(ret, VARHDRSZ + (p - s));
@@ -1340,6 +1345,8 @@ retry:
          */
         if (state->is_multibyte_char_in_char)
         {
+               const char *haystack_end = state->str1 + state->len1;
+
                 /* Walk one character at a time, until we reach the match. */
  
                 /* the search should never move backwards. */
@@ -1348,7 +1355,7 @@ retry:
                 while (state->refpoint < matchptr)
                 {
                         /* step to next character. */
-                       state->refpoint += pg_mblen(state->refpoint);
+                       state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
                         state->refpos++;
  
                         /*
@@ -4940,6 +4947,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
         }
         else
         {
+               const char *end_ptr;
+
                 /*
                  * When fldsep is NULL, each character in the input string becomes a
                  * separate element in the result set.  The separator is effectively
@@ -4948,10 +4957,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
  
                 start_ptr = VARDATA_ANY(inputstring);
+               end_ptr = start_ptr + inputstring_len;
  
                 while (inputstring_len > 0)
                 {
-                       int                     chunk_len = pg_mblen(start_ptr);
+                       int                     chunk_len = pg_mblen_range(start_ptr, end_ptr);
  
                         CHECK_FOR_INTERRUPTS();
  
@@ -5630,7 +5640,7 @@ text_reverse(PG_FUNCTION_ARGS)
                 {
                         int                     sz;
  
-                       sz = pg_mblen(p);
+                       sz = pg_mblen_range(p, endp);
                         dst -= sz;
                         memcpy(dst, p, sz);
                         p += sz;
@@ -5791,7 +5801,7 @@ text_format(PG_FUNCTION_ARGS)
                         ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("unrecognized format() type specifier \"%.*s\"",
-                                                       pg_mblen(cp), cp),
+                                                       pg_mblen_range(cp, end_ptr), cp),
                                          errhint("For a single \"%%\" use \"%%%%\".")));
  
                 /* If indirect width was specified, get its value */
@@ -5912,7 +5922,7 @@ text_format(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                  errmsg("unrecognized format() type specifier \"%.*s\"",
-                                                               pg_mblen(cp), cp),
+                                                               pg_mblen_range(cp, end_ptr), cp),
                                                  errhint("For a single \"%%\" use \"%%%%\".")));
                                 break;
                 }
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c

index 279c15aa33116992b1fe092a19befe827218a444..3dd6cc037abed7582d399cc4f26ffbc6d715a171 100644 (file)
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -2036,8 +2036,7 @@ sqlchar_to_unicode(const char *s)
         char       *utf8string;
         pg_wchar        ret[2];                 /* need space for trailing zero */
  
-       /* note we're not assuming s is null-terminated */
-       utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8);
+       utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8);
  
         pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret,
                                                                   pg_encoding_mblen(PG_UTF8, utf8string));
@@ -2090,7 +2089,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
  
         initStringInfo(&buf);
  
-       for (p = ident; *p; p += pg_mblen(p))
+       for (p = ident; *p; p += pg_mblen_cstr(p))
         {
                 if (*p == ':' && (p == ident || fully_escaped))
                         appendStringInfoString(&buf, "_x003A_");
@@ -2115,7 +2114,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
                                 : !is_valid_xml_namechar(u))
                                 appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
                         else
-                               appendBinaryStringInfo(&buf, p, pg_mblen(p));
+                               appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
                 }
         }
  
@@ -2138,7 +2137,7 @@ map_xml_name_to_sql_identifier(const char *name)
  
         initStringInfo(&buf);
  
-       for (p = name; *p; p += pg_mblen(p))
+       for (p = name; *p; p += pg_mblen_cstr(p))
         {
                 if (*p == '_' && *(p + 1) == 'x'
                         && isxdigit((unsigned char) *(p + 2))
@@ -2156,7 +2155,7 @@ map_xml_name_to_sql_identifier(const char *name)
                         p += 6;
                 }
                 else
-                       appendBinaryStringInfo(&buf, p, pg_mblen(p));
+                       appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
         }
  
         return buf.data;
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c

index 39e5fcaf5a015c14ef2f81c71b3d4b95fa826052..4033b6dc1d9934ee97af49f1acb2b7a6b0c9e6d1 100644 (file)
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -38,6 +38,7 @@
  #include "catalog/namespace.h"
  #include "mb/pg_wchar.h"
  #include "utils/builtins.h"
+#include "utils/memdebug.h"
  #include "utils/memutils.h"
  #include "utils/relcache.h"
  #include "utils/syscache.h"
@@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src,
                                                                                                  int len, bool is_client_to_server);
  static int     cliplen(const char *str, int len, int limit);
  
+pg_attribute_noreturn()
+static void report_invalid_encoding_int(int encoding, const char *mbstr,
+                                                                               int mblen, int len);
+
+pg_attribute_noreturn()
+static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
+
  
  /*
   * Prepare for a future call to SetClientEncoding.  Success should mean
@@ -962,11 +970,126 @@ pg_encoding_wchar2mb_with_len(int encoding,
         return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
  }
  
-/* returns the byte length of a multibyte character */
+/*
+ * Returns the byte length of a multibyte character sequence in a
+ * null-terminated string.  Raises an illegal byte sequence error if the
+ * sequence would hit a null terminator.
+ *
+ * The caller is expected to have checked for a terminator (*mbstr == 0)
+ * before calling, but some callers rely on getting 1 in that case, so this
+ * function continues that tradition.
+ *
+ * This must only be used for strings that have a null terminator to enable
+ * bounds detection.
+ */
+int
+pg_mblen_cstr(const char *mbstr)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       /*
+        * The .mblen functions return 1 when given a pointer to a terminator.
+        * Some callers depend on that, so we tolerate it for now.  Well-behaved
+        * callers check the leading byte for a terminator *before* calling.
+        */
+       for (int i = 1; i < length; ++i)
+               if (unlikely(mbstr[i] == 0))
+                       report_invalid_encoding_db(mbstr, length, i);
+
+       /*
+        * String should be NUL-terminated, but checking that would make typical
+        * callers O(N^2), tripling Valgrind check-world time.  Unless
+        * VALGRIND_EXPENSIVE, check 1 byte after each actual character.  (If we
+        * found a character, not a terminator, the next byte must be a terminator
+        * or the start of the next character.)  If the caller iterates the whole
+        * string, the last call will diagnose a missing terminator.
+        */
+       if (mbstr[0] != '\0')
+       {
+#ifdef VALGRIND_EXPENSIVE
+               VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
+#else
+               VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
+#endif
+       }
+
+       return length;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * [mbstr, end) of at least one byte in size.  Raises an illegal byte sequence
+ * error if the sequence would exceed the range.
+ */
+int
+pg_mblen_range(const char *mbstr, const char *end)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       Assert(end > mbstr);
+#ifdef VALGRIND_EXPENSIVE
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
+#else
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+       if (unlikely(mbstr + length > end))
+               report_invalid_encoding_db(mbstr, length, end - mbstr);
+
+       return length;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * extending for 'limit' bytes, which must be at least one.  Raises an illegal
+ * byte sequence error if the sequence would exceed the range.
+ */
+int
+pg_mblen_with_len(const char *mbstr, int limit)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       Assert(limit >= 1);
+#ifdef VALGRIND_EXPENSIVE
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
+#else
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+       if (unlikely(length > limit))
+               report_invalid_encoding_db(mbstr, length, limit);
+
+       return length;
+}
+
+
+/*
+ * Returns the length of a multibyte character sequence, without any
+ * validation of bounds.
+ *
+ * PLEASE NOTE:  This function can only be used safely if the caller has
+ * already verified the input string, since otherwise there is a risk of
+ * overrunning the buffer if the string is invalid.  A prior call to a
+ * pg_mbstrlen* function suffices.
+ */
+int
+pg_mblen_unbounded(const char *mbstr)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+
+       return length;
+}
+
+/*
+ * Historical name for pg_mblen_unbounded().  Should not be used and will be
+ * removed in a later version.
+ */
  int
  pg_mblen(const char *mbstr)
  {
-       return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+       return pg_mblen_unbounded(mbstr);
  }
  
  /* returns the display length of a multibyte character */
@@ -988,14 +1111,14 @@ pg_mbstrlen(const char *mbstr)
  
         while (*mbstr)
         {
-               mbstr += pg_mblen(mbstr);
+               mbstr += pg_mblen_cstr(mbstr);
                 len++;
         }
         return len;
  }
  
  /* returns the length (counted in wchars) of a multibyte string
- * (not necessarily NULL terminated)
+ * (stops after "limit" bytes or at the first NUL, whichever comes first)
   */
  int
  pg_mbstrlen_with_len(const char *mbstr, int limit)
@@ -1008,7 +1131,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit)
  
         while (limit > 0 && *mbstr)
         {
-               int                     l = pg_mblen(mbstr);
+               int                     l = pg_mblen_with_len(mbstr, limit);
  
                 limit -= l;
                 mbstr += l;
@@ -1078,7 +1201,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit)
  
         while (len > 0 && *mbstr)
         {
-               l = pg_mblen(mbstr);
+               l = pg_mblen_with_len(mbstr, len);
                 nch++;
                 if (nch > limit)
                         break;
@@ -1648,12 +1771,19 @@ void
  report_invalid_encoding(int encoding, const char *mbstr, int len)
  {
         int                     l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
+
+       report_invalid_encoding_int(encoding, mbstr, l, len);
+}
+
+static void
+report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
+{
         char            buf[8 * 5 + 1];
         char       *p = buf;
         int                     j,
                                 jlimit;
  
-       jlimit = Min(l, len);
+       jlimit = Min(mblen, len);
         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
  
         for (j = 0; j < jlimit; j++)
@@ -1670,6 +1800,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
                                         buf)));
  }
  
+static void
+report_invalid_encoding_db(const char *mbstr, int mblen, int len)
+{
+       report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len);
+}
+
  /*
   * report_untranslatable_char: complain about untranslatable character
   *
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index 45ef9cbba70c9b8476d48573580561f10f959f87..0412f3886e587a104ffd4d61f3dfb947940c07fb 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -608,7 +608,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2);
  extern int     pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
  extern int     pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
  extern size_t pg_wchar_strlen(const pg_wchar *wstr);
+extern int     pg_mblen_cstr(const char *mbstr);
+extern int     pg_mblen_range(const char *mbstr, const char *end);
+extern int     pg_mblen_with_len(const char *mbstr, int limit);
+extern int     pg_mblen_unbounded(const char *mbstr);
+
+/* deprecated */
  extern int     pg_mblen(const char *mbstr);
+
  extern int     pg_dsplen(const char *mbstr);
  extern int     pg_mbstrlen(const char *mbstr);
  extern int     pg_mbstrlen_with_len(const char *mbstr, int len);
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h

index 7d7c4e16c621c4921a936c5a206a14a9e9f3e9db..1a1e713a89207dd517c1078ff7124a185e6da670 100644 (file)
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -45,12 +45,36 @@ typedef struct
  /* The second argument of t_iseq() must be a plain ASCII character */
  #define t_iseq(x,c)            (TOUCHAR(x) == (unsigned char) (c))
  
-#define COPYCHAR(d,s)  memcpy(d, s, pg_mblen(s))
+/* Copy multibyte character of known byte length, return byte length. */
+static inline int
+ts_copychar_with_len(void *dest, const void *src, int length)
+{
+       memcpy(dest, src, length);
+       return length;
+}
+
+/* Copy multibyte character from null-terminated string, return byte length. */
+static inline int
+ts_copychar_cstr(void *dest, const void *src)
+{
+       return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src));
+}
+
+/* Historical macro for the above. */
+#define COPYCHAR ts_copychar_cstr
+
+#define GENERATE_T_ISCLASS_DECL(character_class) \
+extern int     t_is##character_class##_with_len(const char *ptr, int len); \
+extern int     t_is##character_class##_cstr(const char *ptr); \
+extern int     t_is##character_class##_unbounded(const char *ptr); \
+\
+/* deprecated */ \
+extern int     t_is##character_class(const char *ptr);
  
-extern int     t_isdigit(const char *ptr);
-extern int     t_isspace(const char *ptr);
-extern int     t_isalpha(const char *ptr);
-extern int     t_isprint(const char *ptr);
+GENERATE_T_ISCLASS_DECL(alpha);
+GENERATE_T_ISCLASS_DECL(digit);
+GENERATE_T_ISCLASS_DECL(print);
+GENERATE_T_ISCLASS_DECL(space);
  
  extern char *lowerstr(const char *str);
  extern char *lowerstr_with_len(const char *str, int len);
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h

index c36c711dae0d1a03823f8ec4e138912493f5f3c6..94ca47588ff9ea9b91e5dad8d3d387690a497e62 100644 (file)
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -38,14 +38,12 @@ extern bool gettoken_tsvector(TSVectorParseState state,
  extern void close_tsvector_parser(TSVectorParseState state);
  
  /* phrase operator begins with '<' */
-#define ISOPERATOR(x) \
-       ( pg_mblen(x) == 1 && ( *(x) == '!' ||  \
-                                                       *(x) == '&' ||  \
-                                                       *(x) == '|' ||  \
-                                                       *(x) == '(' ||  \
-                                                       *(x) == ')' ||  \
-                                                       *(x) == '<'             \
-                                                 ) )
+#define ISOPERATOR(x)          (*(x) == '!' || \
+                                                        *(x) == '&' || \
+                                                        *(x) == '|' || \
+                                                        *(x) == '(' || \
+                                                        *(x) == ')' || \
+                                                        *(x) == '<')
  
  /* parse_tsquery */
  
diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c

index e23a0bd0d7f5d56e9531d75af64c6c6abbd60e73..f7bf50b760916670af13539ebda6e4d641ec8d8a 100644 (file)
--- a/src/test/modules/test_regex/test_regex.c
+++ b/src/test/modules/test_regex/test_regex.c
@@ -424,7 +424,8 @@ parse_test_flags(test_re_flags *flags, text *opts)
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                          errmsg("invalid regular expression test option: \"%.*s\"",
-                                                                       pg_mblen(opt_p + i), opt_p + i)));
+                                                                       pg_mblen_range(opt_p + i, opt_p + opt_len),
+                                                                       opt_p + i)));
                                         break;
                         }
                 }
author	Thomas Munro <tmunro@postgresql.org>
	Wed, 7 Jan 2026 09:14:31 +0000 (22:14 +1300)
committer	Thomas Munro <tmunro@postgresql.org>
	Sun, 8 Feb 2026 23:34:24 +0000 (12:34 +1300)
contrib/btree_gist/btree_utils_var.c		patch \| blob \| blame \| history
contrib/dict_xsyn/dict_xsyn.c		patch \| blob \| blame \| history
contrib/hstore/hstore_io.c		patch \| blob \| blame \| history
contrib/ltree/lquery_op.c		patch \| blob \| blame \| history
contrib/ltree/ltree.h		patch \| blob \| blame \| history
contrib/ltree/ltree_io.c		patch \| blob \| blame \| history
contrib/ltree/ltxtquery_io.c		patch \| blob \| blame \| history
contrib/pageinspect/heapfuncs.c		patch \| blob \| blame \| history
contrib/pg_trgm/trgm.h		patch \| blob \| blame \| history
contrib/pg_trgm/trgm_op.c		patch \| blob \| blame \| history
contrib/pg_trgm/trgm_regexp.c		patch \| blob \| blame \| history
contrib/unaccent/unaccent.c		patch \| blob \| blame \| history
src/backend/catalog/pg_proc.c		patch \| blob \| blame \| history
src/backend/tsearch/dict_synonym.c		patch \| blob \| blame \| history
src/backend/tsearch/dict_thesaurus.c		patch \| blob \| blame \| history
src/backend/tsearch/regis.c		patch \| blob \| blame \| history
src/backend/tsearch/spell.c		patch \| blob \| blame \| history
src/backend/tsearch/ts_locale.c		patch \| blob \| blame \| history
src/backend/tsearch/ts_utils.c		patch \| blob \| blame \| history
src/backend/tsearch/wparser_def.c		patch \| blob \| blame \| history
src/backend/utils/adt/encode.c		patch \| blob \| blame \| history
src/backend/utils/adt/formatting.c		patch \| blob \| blame \| history
src/backend/utils/adt/jsonfuncs.c		patch \| blob \| blame \| history
src/backend/utils/adt/jsonpath_gram.y		patch \| blob \| blame \| history
src/backend/utils/adt/levenshtein.c		patch \| blob \| blame \| history
src/backend/utils/adt/like.c		patch \| blob \| blame \| history
src/backend/utils/adt/like_match.c		patch \| blob \| blame \| history
src/backend/utils/adt/oracle_compat.c		patch \| blob \| blame \| history
src/backend/utils/adt/regexp.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsquery.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsvector.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsvector_op.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsvector_parser.c		patch \| blob \| blame \| history
src/backend/utils/adt/varbit.c		patch \| blob \| blame \| history
src/backend/utils/adt/varlena.c		patch \| blob \| blame \| history
src/backend/utils/adt/xml.c		patch \| blob \| blame \| history
src/backend/utils/mb/mbutils.c		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history
src/include/tsearch/ts_locale.h		patch \| blob \| blame \| history
src/include/tsearch/ts_utils.h		patch \| blob \| blame \| history
src/test/modules/test_regex/test_regex.c		patch \| blob \| blame \| history