git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Replace pg_mblen() with bounds-checked versions.
author: Thomas Munro <tmunro@postgresql.org>
Wed, 7 Jan 2026 09:14:31 +0000 (22:14 +1300)
committer: Thomas Munro <tmunro@postgresql.org>
Sun, 8 Feb 2026 23:44:04 +0000 (12:44 +1300)
A corrupted string could cause code that iterates with pg_mblen() to
overrun its buffer.  Fix by converting all callers to one of the
following:

1. Callers with a null-terminated string now use pg_mblen_cstr(), which
raises an "illegal byte sequence" error if it finds a terminator in the
middle of the sequence.

2. Callers with a length or end pointer now use either
pg_mblen_with_len() or pg_mblen_range(), for the same effect, depending
on which of the two seems more convenient at each site.

3. A small number of cases pre-validate a string, and can use
pg_mblen_unbounded().

The traditional pg_mblen() function and COPYCHAR macro still exist for
backward compatibility, but are no longer used by core code and are
hereby deprecated.  The same applies to the t_isXXX() functions.

Security: CVE-2026-2006
Backpatch-through: 14
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Co-authored-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reported-by: Paul Gerste (as part of zeroday.cloud)
Reported-by: Moritz Sanft (as part of zeroday.cloud)
43 files changed:
contrib/btree_gist/btree_utils_var.c
contrib/dict_xsyn/dict_xsyn.c
contrib/hstore/hstore_io.c
contrib/ltree/crc32.c
contrib/ltree/lquery_op.c
contrib/ltree/ltree.h
contrib/ltree/ltree_io.c
contrib/ltree/ltxtquery_io.c
contrib/pageinspect/heapfuncs.c
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/trgm_regexp.c
contrib/pgcrypto/crypt-sha.c
contrib/unaccent/unaccent.c
src/backend/catalog/pg_proc.c
src/backend/tsearch/dict_synonym.c
src/backend/tsearch/dict_thesaurus.c
src/backend/tsearch/regis.c
src/backend/tsearch/spell.c
src/backend/tsearch/ts_locale.c
src/backend/tsearch/ts_utils.c
src/backend/tsearch/wparser_def.c
src/backend/utils/adt/encode.c
src/backend/utils/adt/formatting.c
src/backend/utils/adt/jsonfuncs.c
src/backend/utils/adt/jsonpath_gram.y
src/backend/utils/adt/levenshtein.c
src/backend/utils/adt/like.c
src/backend/utils/adt/like_match.c
src/backend/utils/adt/oracle_compat.c
src/backend/utils/adt/regexp.c
src/backend/utils/adt/tsquery.c
src/backend/utils/adt/tsvector.c
src/backend/utils/adt/tsvector_op.c
src/backend/utils/adt/tsvector_parser.c
src/backend/utils/adt/varbit.c
src/backend/utils/adt/varlena.c
src/backend/utils/adt/xml.c
src/backend/utils/mb/mbutils.c
src/include/mb/pg_wchar.h
src/include/tsearch/ts_locale.h
src/include/tsearch/ts_utils.h
src/test/modules/test_regex/test_regex.c

index 6847e4e54d5aaeeed355124dfd4c2d9e6232bebd..f6ba1c0c82583489375f7277b073dc362a65a8b5 100644 (file)
@@ -115,36 +115,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo)
 
 /*
  * returns the common prefix length of a node key
+ *
+ * If the underlying type is character data, the prefix length may point in
+ * the middle of a multibyte character.
 */
 static int32
 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
 {
        GBT_VARKEY_R r = gbt_var_key_readable(node);
        int32           i = 0;
-       int32           l = 0;
+       int32           l_left_to_match = 0;
+       int32           l_total = 0;
        int32           t1len = VARSIZE(r.lower) - VARHDRSZ;
        int32           t2len = VARSIZE(r.upper) - VARHDRSZ;
        int32           ml = Min(t1len, t2len);
        char       *p1 = VARDATA(r.lower);
        char       *p2 = VARDATA(r.upper);
+       const char *end1 = p1 + t1len;
+       const char *end2 = p2 + t2len;
 
        if (ml == 0)
                return 0;
 
        while (i < ml)
        {
-               if (tinfo->eml > 1 && l == 0)
+               if (tinfo->eml > 1 && l_left_to_match == 0)
                {
-                       if ((l = pg_mblen(p1)) != pg_mblen(p2))
+                       l_total = pg_mblen_range(p1, end1);
+                       if (l_total != pg_mblen_range(p2, end2))
                        {
                                return i;
                        }
+                       l_left_to_match = l_total;
                }
                if (*p1 != *p2)
                {
                        if (tinfo->eml > 1)
                        {
-                               return (i - l + 1);
+                               int32           l_matched_subset = l_total - l_left_to_match;
+
+                               /* end common prefix at final byte of last matching char */
+                               return i - l_matched_subset;
                        }
                        else
                        {
@@ -154,7 +165,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
 
                p1++;
                p2++;
-               l--;
+               l_left_to_match--;
                i++;
        }
        return ml;                                      /* lower == upper */
index 5c4917ce1fc853219d293ecf8d7fbc12acf192d1..9e3784e0f47369a0c500c4f2bfeee996faa6912e 100644 (file)
@@ -54,14 +54,14 @@ find_word(char *in, char **end)
 
        *end = NULL;
        while (*in && isspace((unsigned char) *in))
-               in += pg_mblen(in);
+               in += pg_mblen_cstr(in);
 
        if (!*in || *in == '#')
                return NULL;
        start = in;
 
        while (*in && !isspace((unsigned char) *in))
-               in += pg_mblen(in);
+               in += pg_mblen_cstr(in);
 
        *end = in;
 
index 34e3918811cd4151a08cd19934f91fdea52ffdfa..9cdfcb5daa0ce95ff5a591d78b9a9d78fb7fdefe 100644 (file)
@@ -67,7 +67,7 @@ prssyntaxerror(HSParser *state)
        errsave(state->escontext,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error in hstore, near \"%.*s\" at position %d",
-                                       pg_mblen(state->ptr), state->ptr,
+                                       pg_mblen_cstr(state->ptr), state->ptr,
                                        (int) (state->ptr - state->begin))));
        /* In soft error situation, return false as convenience for caller */
        return false;
index 3918d4a0ec251a1196ced515a3bb5a1bb4e91e34..d21bed31fdd47830485b8b8168e752ad1bc3afda 100644 (file)
@@ -23,6 +23,7 @@ ltree_crc32_sz(const char *buf, int size)
 {
        pg_crc32        crc;
        const char *p = buf;
+       const char *end = buf + size;
        static pg_locale_t locale = NULL;
 
        if (!locale)
@@ -32,7 +33,7 @@ ltree_crc32_sz(const char *buf, int size)
        while (size > 0)
        {
                char            foldstr[UNICODE_CASEMAP_BUFSZ];
-               int                     srclen = pg_mblen(p);
+               int                     srclen = pg_mblen_range(p, end);
                size_t          foldlen;
 
                /* fold one codepoint at a time */
index a28ddbf40de34224213b7fab73ff899ca1fd05da..0adcdd8ff2ac63248df4cacfa87b69515804bf0f 100644 (file)
@@ -27,14 +27,14 @@ getlexeme(char *start, char *end, int *len)
        char       *ptr;
 
        while (start < end && t_iseq(start, '_'))
-               start += pg_mblen(start);
+               start += pg_mblen_range(start, end);
 
        ptr = start;
        if (ptr >= end)
                return NULL;
 
        while (ptr < end && !t_iseq(ptr, '_'))
-               ptr += pg_mblen(ptr);
+               ptr += pg_mblen_range(ptr, end);
 
        *len = ptr - start;
        return start;
index 78478dec173d47a392ab001441e5932ae7d5404a..b0ded40eba9b6732ff05a6e7c3d4a6b9c27e3345 100644 (file)
@@ -127,7 +127,7 @@ typedef struct
 #define LQUERY_HASNOT          0x01
 
 /* valid label chars are alphanumerics, underscores and hyphens */
-#define ISLABEL(x) ( t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') )
+#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') )
 
 /* full text query */
 
index 59c4462df8061d63f2d0daa47b27321c3526e25c..54c4ca3c5c3b50da3e73968a9e0f47ea9ed6a1c2 100644 (file)
@@ -54,7 +54,7 @@ parse_ltree(const char *buf, struct Node *escontext)
        ptr = buf;
        while (*ptr)
        {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
                if (t_iseq(ptr, '.'))
                        num++;
                ptr += charlen;
@@ -69,7 +69,7 @@ parse_ltree(const char *buf, struct Node *escontext)
        ptr = buf;
        while (*ptr)
        {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
 
                switch (state)
                {
@@ -291,7 +291,7 @@ parse_lquery(const char *buf, struct Node *escontext)
        ptr = buf;
        while (*ptr)
        {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
 
                if (t_iseq(ptr, '.'))
                        num++;
@@ -311,7 +311,7 @@ parse_lquery(const char *buf, struct Node *escontext)
        ptr = buf;
        while (*ptr)
        {
-               charlen = pg_mblen(ptr);
+               charlen = pg_mblen_cstr(ptr);
 
                switch (state)
                {
index 91a2222eaa9544a95c8c9f84fe16b94e6de03394..d15f323539303f8c93d88f21e9b3848951d73d36 100644 (file)
@@ -64,7 +64,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
 
        for (;;)
        {
-               charlen = pg_mblen(state->buf);
+               charlen = pg_mblen_cstr(state->buf);
 
                switch (state->state)
                {
index 8277fa256c371c1e4f5dbe56db5054391dfd224a..2f0dfff175ab78525c739ff641d3f593f5d0a651 100644 (file)
@@ -101,7 +101,7 @@ text_to_bits(char *str, int len)
                        ereport(ERROR,
                                        (errcode(ERRCODE_DATA_CORRUPTED),
                                         errmsg("invalid character \"%.*s\" in t_bits string",
-                                                       pg_mblen(str + off), str + off)));
+                                                       pg_mblen_cstr(str + off), str + off)));
 
                if (off % 8 == 7)
                        bits[off / 8] = byte;
index ca017585369ad7ef870b02dba68a18e45108c8e3..ca23aad4dd997fe32bb76a2a039fa230fe2017ce 100644 (file)
@@ -47,7 +47,7 @@ typedef char trgm[3];
 } while(0)
 extern int     (*CMPTRGM) (const void *a, const void *b);
 
-#define ISWORDCHR(c)   (t_isalnum(c))
+#define ISWORDCHR(c, len)      (t_isalnum_with_len(c, len))
 #define ISPRINTABLECHAR(a)     ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
 #define ISPRINTABLETRGM(t)     ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
 
index 4bb5506647c5da3fde21549932300a7088b0a693..5fba594b61f08be4ff98198760198ba75c8995be 100644 (file)
@@ -295,16 +295,29 @@ static char *
 find_word(char *str, int lenstr, char **endword)
 {
        char       *beginword = str;
+       const char *endstr = str + lenstr;
 
-       while (beginword - str < lenstr && !ISWORDCHR(beginword))
-               beginword += pg_mblen(beginword);
+       while (beginword < endstr)
+       {
+               int                     clen = pg_mblen_range(beginword, endstr);
 
-       if (beginword - str >= lenstr)
+               if (ISWORDCHR(beginword, clen))
+                       break;
+               beginword += clen;
+       }
+
+       if (beginword >= endstr)
                return NULL;
 
        *endword = beginword;
-       while (*endword - str < lenstr && ISWORDCHR(*endword))
-               *endword += pg_mblen(*endword);
+       while (*endword < endstr)
+       {
+               int                     clen = pg_mblen_range(*endword, endstr);
+
+               if (!ISWORDCHR(*endword, clen))
+                       break;
+               *endword += clen;
+       }
 
        return beginword;
 }
@@ -385,17 +398,17 @@ make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
 
                        lenfirst = 1;
                        lenmiddle = 1;
-                       lenlast = pg_mblen(ptr + 2);
+                       lenlast = pg_mblen_unbounded(ptr + 2);
                }
                else
                {
-                       lenfirst = pg_mblen(ptr);
+                       lenfirst = pg_mblen_unbounded(ptr);
                        if (ptr + lenfirst >= str + bytelen)
                                goto done;
-                       lenmiddle = pg_mblen(ptr + lenfirst);
+                       lenmiddle = pg_mblen_unbounded(ptr + lenfirst);
                        if (ptr + lenfirst + lenmiddle >= str + bytelen)
                                goto done;
-                       lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+                       lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
                }
 
                /*
@@ -416,7 +429,7 @@ make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
                        ptr += lenfirst;
                        lenfirst = lenmiddle;
                        lenmiddle = lenlast;
-                       lenlast = pg_mblen(endptr);
+                       lenlast = pg_mblen_unbounded(endptr);
                        endptr += lenlast;
                }
        }
@@ -898,6 +911,7 @@ get_wildcard_part(const char *str, int lenstr,
 {
        const char *beginword = str;
        const char *endword;
+       const char *endstr = str + lenstr;
        char       *s = buf;
        bool            in_leading_wildcard_meta = false;
        bool            in_trailing_wildcard_meta = false;
@@ -910,11 +924,13 @@ get_wildcard_part(const char *str, int lenstr,
         * from this loop to the next one, since we may exit at a word character
         * that is in_escape.
         */
-       while (beginword - str < lenstr)
+       while (beginword < endstr)
        {
+               clen = pg_mblen_range(beginword, endstr);
+
                if (in_escape)
                {
-                       if (ISWORDCHR(beginword))
+                       if (ISWORDCHR(beginword, clen))
                                break;
                        in_escape = false;
                        in_leading_wildcard_meta = false;
@@ -925,12 +941,12 @@ get_wildcard_part(const char *str, int lenstr,
                                in_escape = true;
                        else if (ISWILDCARDCHAR(beginword))
                                in_leading_wildcard_meta = true;
-                       else if (ISWORDCHR(beginword))
+                       else if (ISWORDCHR(beginword, clen))
                                break;
                        else
                                in_leading_wildcard_meta = false;
                }
-               beginword += pg_mblen(beginword);
+               beginword += clen;
        }
 
        /*
@@ -958,12 +974,12 @@ get_wildcard_part(const char *str, int lenstr,
         * string boundary.  Strip escapes during copy.
         */
        endword = beginword;
-       while (endword - str < lenstr)
+       while (endword < endstr)
        {
-               clen = pg_mblen(endword);
+               clen = pg_mblen_range(endword, endstr);
                if (in_escape)
                {
-                       if (ISWORDCHR(endword))
+                       if (ISWORDCHR(endword, clen))
                        {
                                memcpy(s, endword, clen);
                                s += clen;
@@ -990,7 +1006,7 @@ get_wildcard_part(const char *str, int lenstr,
                                in_trailing_wildcard_meta = true;
                                break;
                        }
-                       else if (ISWORDCHR(endword))
+                       else if (ISWORDCHR(endword, clen))
                        {
                                memcpy(s, endword, clen);
                                s += clen;
index 1d1b5fe304d08e420805c8af23a3724297567187..efee4cf5fb4bca2a48f4accd425ef86c664ef658 100644 (file)
@@ -483,7 +483,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
 static void RE_compile(regex_t *regex, text *text_re,
                                           int cflags, Oid collation);
 static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static int     convertPgWchar(pg_wchar c, trgm_mb_char *result);
 static void transformGraph(TrgmNFA *trgmNFA);
 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -807,10 +807,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
                for (j = 0; j < charsCount; j++)
                {
                        trgm_mb_char c;
+                       int                     clen = convertPgWchar(chars[j], &c);
 
-                       if (!convertPgWchar(chars[j], &c))
+                       if (!clen)
                                continue;               /* ok to ignore it altogether */
-                       if (ISWORDCHR(c.bytes))
+                       if (ISWORDCHR(c.bytes, clen))
                                colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
                        else
                                colorInfo->containsNonWord = true;
@@ -822,13 +823,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
 
 /*
  * Convert pg_wchar to multibyte format.
- * Returns false if the character should be ignored completely.
+ * Returns 0 if the character should be ignored completely, else returns its
+ * byte length.
  */
-static bool
+static int
 convertPgWchar(pg_wchar c, trgm_mb_char *result)
 {
        /* "s" has enough space for a multibyte character and a trailing NUL */
        char            s[MAX_MULTIBYTE_CHAR_LEN + 1];
+       int                     clen;
 
        /*
         * We can ignore the NUL character, since it can never appear in a PG text
@@ -836,11 +839,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
         * reconstructing trigrams.
         */
        if (c == 0)
-               return false;
+               return 0;
 
        /* Do the conversion, making sure the result is NUL-terminated */
        memset(s, 0, sizeof(s));
-       pg_wchar2mb_with_len(&c, s, 1);
+       clen = pg_wchar2mb_with_len(&c, s, 1);
 
        /*
         * In IGNORECASE mode, we can ignore uppercase characters.  We assume that
@@ -857,12 +860,12 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
         */
 #ifdef IGNORECASE
        {
-               char       *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
+               char       *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID);
 
                if (strcmp(lowerCased, s) != 0)
                {
                        pfree(lowerCased);
-                       return false;
+                       return 0;
                }
                pfree(lowerCased);
        }
@@ -870,7 +873,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
 
        /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
        memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
-       return true;
+       return clen;
 }
 
 
index 7ec21771a83b7a462384e67fa7f3fa4347ab43dd..e8f32bc3896c3310d9cd80bcc7a065546f1f8d46 100644 (file)
@@ -328,7 +328,7 @@ px_crypt_shacrypt(const char *pw, const char *salt, char *passwd, unsigned dstle
                                ereport(ERROR,
                                                errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                errmsg("invalid character in salt string: \"%.*s\"",
-                                                          pg_mblen(ep), ep));
+                                                          pg_mblen_cstr(ep), ep));
                }
                else
                {
index e25c8a5aa26ceb55c9abe17811214df34d7f346a..69b173e4498df3c6718d728ec21eb2c2fc58de71 100644 (file)
@@ -156,7 +156,7 @@ initTrie(const char *filename)
                                state = 0;
                                for (ptr = line; *ptr; ptr += ptrlen)
                                {
-                                       ptrlen = pg_mblen(ptr);
+                                       ptrlen = pg_mblen_cstr(ptr);
                                        /* ignore whitespace, but end src or trg */
                                        if (isspace((unsigned char) *ptr))
                                        {
@@ -382,6 +382,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
        char       *srcchar = (char *) PG_GETARG_POINTER(1);
        int32           len = PG_GETARG_INT32(2);
        char       *srcstart = srcchar;
+       const char *srcend = srcstart + len;
        TSLexeme   *res;
        StringInfoData buf;
 
@@ -409,7 +410,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
                }
                else
                {
-                       matchlen = pg_mblen(srcchar);
+                       matchlen = pg_mblen_range(srcchar, srcend);
                        if (buf.data != NULL)
                                appendBinaryStringInfo(&buf, srcchar, matchlen);
                }
index acff7a0096dec812d2c35d454a560921a302e49c..5df4b3f7a91e702ce707c891f8d98abf11000f45 100644 (file)
@@ -1206,7 +1206,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal,
                        if (cursorpos > 0)
                                newcp++;
                }
-               chlen = pg_mblen(prosrc);
+               chlen = pg_mblen_cstr(prosrc);
                if (strncmp(prosrc, literal, chlen) != 0)
                        goto fail;
                prosrc += chlen;
index 6dee28ae5256cba35bba35b4d6cff963d3d87129..3937f25bcc6cae65b2cd24670e2a0b812c5fc119 100644 (file)
@@ -50,7 +50,7 @@ findwrd(char *in, char **end, uint16 *flags)
 
        /* Skip leading spaces */
        while (*in && isspace((unsigned char) *in))
-               in += pg_mblen(in);
+               in += pg_mblen_cstr(in);
 
        /* Return NULL on empty lines */
        if (*in == '\0')
@@ -65,7 +65,7 @@ findwrd(char *in, char **end, uint16 *flags)
        while (*in && !isspace((unsigned char) *in))
        {
                lastchar = in;
-               in += pg_mblen(in);
+               in += pg_mblen_cstr(in);
        }
 
        if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
index 7253f64e5f7e5adc008f8a15131378932097471a..0fd4cf3dfa85fe613ca9df70825c0c46f6d873bc 100644 (file)
@@ -191,7 +191,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 
                /* is it a comment? */
                while (*ptr && isspace((unsigned char) *ptr))
-                       ptr += pg_mblen(ptr);
+                       ptr += pg_mblen_cstr(ptr);
 
                if (t_iseq(ptr, '#') || *ptr == '\0' ||
                        t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
@@ -237,13 +237,13 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                                {
                                        useasis = true;
                                        state = TR_INSUBS;
-                                       beginwrd = ptr + pg_mblen(ptr);
+                                       beginwrd = ptr + pg_mblen_cstr(ptr);
                                }
                                else if (t_iseq(ptr, '\\'))
                                {
                                        useasis = false;
                                        state = TR_INSUBS;
-                                       beginwrd = ptr + pg_mblen(ptr);
+                                       beginwrd = ptr + pg_mblen_cstr(ptr);
                                }
                                else if (!isspace((unsigned char) *ptr))
                                {
@@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
                        else
                                elog(ERROR, "unrecognized thesaurus state: %d", state);
 
-                       ptr += pg_mblen(ptr);
+                       ptr += pg_mblen_cstr(ptr);
                }
 
                if (state == TR_INSUBS)
index 1c7d5c361f17669c95ea55a0e71c83215e57ba28..51ba78fabbcd377b8d48011d64890eb1656a0d53 100644 (file)
@@ -37,7 +37,7 @@ RS_isRegis(const char *str)
        {
                if (state == RS_IN_WAIT)
                {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                                 /* okay */ ;
                        else if (t_iseq(c, '['))
                                state = RS_IN_ONEOF;
@@ -48,14 +48,14 @@ RS_isRegis(const char *str)
                {
                        if (t_iseq(c, '^'))
                                state = RS_IN_NONEOF;
-                       else if (t_isalpha(c))
+                       else if (t_isalpha_cstr(c))
                                state = RS_IN_ONEOF_IN;
                        else
                                return false;
                }
                else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
                {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                                 /* okay */ ;
                        else if (t_iseq(c, ']'))
                                state = RS_IN_WAIT;
@@ -64,7 +64,7 @@ RS_isRegis(const char *str)
                }
                else
                        elog(ERROR, "internal error in RS_isRegis: state %d", state);
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
        }
 
        return (state == RS_IN_WAIT);
@@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str)
        {
                if (state == RS_IN_WAIT)
                {
-                       if (t_isalpha(c))
+                       if (t_isalpha_cstr(c))
                        {
                                if (ptr)
                                        ptr = newRegisNode(ptr, len);
                                else
                                        ptr = r->node = newRegisNode(NULL, len);
-                               COPYCHAR(ptr->data, c);
                                ptr->type = RSF_ONEOF;
-                               ptr->len = pg_mblen(c);
+                               ptr->len = ts_copychar_cstr(ptr->data, c);
                        }
                        else if (t_iseq(c, '['))
                        {
@@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                                ptr->type = RSF_NONEOF;
                                state = RS_IN_NONEOF;
                        }
-                       else if (t_isalpha(c))
+                       else if (t_isalpha_cstr(c))
                        {
-                               COPYCHAR(ptr->data, c);
-                               ptr->len = pg_mblen(c);
+                               ptr->len = ts_copychar_cstr(ptr->data, c);
                                state = RS_IN_ONEOF_IN;
                        }
                        else                            /* shouldn't get here */
@@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                }
                else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
                {
-                       if (t_isalpha(c))
-                       {
-                               COPYCHAR(ptr->data + ptr->len, c);
-                               ptr->len += pg_mblen(c);
-                       }
+                       if (t_isalpha_cstr(c))
+                               ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c);
                        else if (t_iseq(c, ']'))
                                state = RS_IN_WAIT;
                        else                            /* shouldn't get here */
@@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str)
                }
                else
                        elog(ERROR, "internal error in RS_compile: state %d", state);
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
        }
 
        if (state != RS_IN_WAIT)        /* shouldn't get here */
@@ -187,10 +182,10 @@ mb_strchr(char *str, char *c)
        char       *ptr = str;
        bool            res = false;
 
-       clen = pg_mblen(c);
+       clen = pg_mblen_cstr(c);
        while (*ptr && !res)
        {
-               plen = pg_mblen(ptr);
+               plen = pg_mblen_cstr(ptr);
                if (plen == clen)
                {
                        i = plen;
@@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str)
        while (*c)
        {
                len++;
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
        }
 
        if (len < r->nchar)
@@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str)
        {
                len -= r->nchar;
                while (len-- > 0)
-                       c += pg_mblen(c);
+                       c += pg_mblen_cstr(c);
        }
 
 
@@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str)
                                elog(ERROR, "unrecognized regis node type: %d", ptr->type);
                }
                ptr = ptr->next;
-               c += pg_mblen(c);
+               c += pg_mblen_cstr(c);
        }
 
        return true;
index ad0ceec37b04b893a058fdb0cd48d6d01c952be1..a1bfd2a9f9b1016da9abdbe5f6f9988ac4c49094 100644 (file)
@@ -233,7 +233,7 @@ findchar(char *str, int c)
        {
                if (t_iseq(str, c))
                        return str;
-               str += pg_mblen(str);
+               str += pg_mblen_cstr(str);
        }
 
        return NULL;
@@ -246,7 +246,7 @@ findchar2(char *str, int c1, int c2)
        {
                if (t_iseq(str, c1) || t_iseq(str, c2))
                        return str;
-               str += pg_mblen(str);
+               str += pg_mblen_cstr(str);
        }
 
        return NULL;
@@ -353,6 +353,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
        char       *next;
        const char *sbuf = *sflagset;
        int                     maxstep;
+       int                     clen;
        bool            stop = false;
        bool            met_comma = false;
 
@@ -364,11 +365,11 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
                {
                        case FM_LONG:
                        case FM_CHAR:
-                               COPYCHAR(sflag, *sflagset);
-                               sflag += pg_mblen(*sflagset);
+                               clen = ts_copychar_cstr(sflag, *sflagset);
+                               sflag += clen;
 
                                /* Go to start of the next flag */
-                               *sflagset += pg_mblen(*sflagset);
+                               *sflagset += clen;
 
                                /* Check if we get all characters of flag */
                                maxstep--;
@@ -418,7 +419,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
                                                                                *sflagset)));
                                        }
 
-                                       *sflagset += pg_mblen(*sflagset);
+                                       *sflagset += pg_mblen_cstr(*sflagset);
                                }
                                stop = true;
                                break;
@@ -544,7 +545,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
                        while (*s)
                        {
                                /* we allow only single encoded flags for faster works */
-                               if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
+                               if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
                                        s++;
                                else
                                {
@@ -565,7 +566,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
                                *s = '\0';
                                break;
                        }
-                       s += pg_mblen(s);
+                       s += pg_mblen_cstr(s);
                }
                pstr = lowerstr_ctx(Conf, line);
 
@@ -797,17 +798,17 @@ get_nextfield(char **str, char *next)
 
        while (**str)
        {
+               int                     clen = pg_mblen_cstr(*str);
+
                if (state == PAE_WAIT_MASK)
                {
                        if (t_iseq(*str, '#'))
                                return false;
                        else if (!isspace((unsigned char) **str))
                        {
-                               int                     clen = pg_mblen(*str);
-
                                if (clen < avail)
                                {
-                                       COPYCHAR(next, *str);
+                                       ts_copychar_with_len(next, *str, clen);
                                        next += clen;
                                        avail -= clen;
                                }
@@ -823,17 +824,15 @@ get_nextfield(char **str, char *next)
                        }
                        else
                        {
-                               int                     clen = pg_mblen(*str);
-
                                if (clen < avail)
                                {
-                                       COPYCHAR(next, *str);
+                                       ts_copychar_with_len(next, *str, clen);
                                        next += clen;
                                        avail -= clen;
                                }
                        }
                }
-               *str += pg_mblen(*str);
+               *str += clen;
        }
 
        *next = '\0';
@@ -923,14 +922,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 
        while (*str)
        {
+               int                     clen = pg_mblen_cstr(str);
+
                if (state == PAE_WAIT_MASK)
                {
                        if (t_iseq(str, '#'))
                                return false;
                        else if (!isspace((unsigned char) *str))
                        {
-                               COPYCHAR(pmask, str);
-                               pmask += pg_mblen(str);
+                               pmask += ts_copychar_with_len(pmask, str, clen);
                                state = PAE_INMASK;
                        }
                }
@@ -943,8 +943,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                        }
                        else if (!isspace((unsigned char) *str))
                        {
-                               COPYCHAR(pmask, str);
-                               pmask += pg_mblen(str);
+                               pmask += ts_copychar_with_len(pmask, str, clen);
                        }
                }
                else if (state == PAE_WAIT_FIND)
@@ -953,10 +952,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                        {
                                state = PAE_INFIND;
                        }
-                       else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+                       else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
                        {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                                state = PAE_INREPL;
                        }
                        else if (!isspace((unsigned char) *str))
@@ -971,10 +969,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                                *pfind = '\0';
                                state = PAE_WAIT_REPL;
                        }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                        {
-                               COPYCHAR(pfind, str);
-                               pfind += pg_mblen(str);
+                               pfind += ts_copychar_with_len(pfind, str, clen);
                        }
                        else if (!isspace((unsigned char) *str))
                                ereport(ERROR,
@@ -987,10 +984,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                        {
                                break;                  /* void repl */
                        }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                        {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                                state = PAE_INREPL;
                        }
                        else if (!isspace((unsigned char) *str))
@@ -1005,10 +1001,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                                *prepl = '\0';
                                break;
                        }
-                       else if (t_isalpha(str))
+                       else if (t_isalpha_cstr(str))
                        {
-                               COPYCHAR(prepl, str);
-                               prepl += pg_mblen(str);
+                               prepl += ts_copychar_with_len(prepl, str, clen);
                        }
                        else if (!isspace((unsigned char) *str))
                                ereport(ERROR,
@@ -1018,7 +1013,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
                else
                        elog(ERROR, "unrecognized state in parse_affentry: %d", state);
 
-               str += pg_mblen(str);
+               str += clen;
        }
 
        *pmask = *pfind = *prepl = '\0';
@@ -1071,10 +1066,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
        CompoundAffixFlag *newValue;
        char            sbuf[BUFSIZ];
        char       *sflag;
-       int                     clen;
 
        while (*s && isspace((unsigned char) *s))
-               s += pg_mblen(s);
+               s += pg_mblen_cstr(s);
 
        if (!*s)
                ereport(ERROR,
@@ -1085,8 +1079,8 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
        sflag = sbuf;
        while (*s && !isspace((unsigned char) *s) && *s != '\n')
        {
-               clen = pg_mblen(s);
-               COPYCHAR(sflag, s);
+               int                     clen = ts_copychar_cstr(sflag, s);
+
                sflag += clen;
                s += clen;
        }
@@ -1267,7 +1261,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
                        char       *s = recoded + strlen("FLAG");
 
                        while (*s && isspace((unsigned char) *s))
-                               s += pg_mblen(s);
+                               s += pg_mblen_cstr(s);
 
                        if (*s)
                        {
@@ -1466,11 +1460,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                        if (s)
                        {
                                while (*s && !isspace((unsigned char) *s))
-                                       s += pg_mblen(s);
+                                       s += pg_mblen_cstr(s);
                                while (*s && isspace((unsigned char) *s))
-                                       s += pg_mblen(s);
+                                       s += pg_mblen_cstr(s);
 
-                               if (*s && pg_mblen(s) == 1)
+                               if (*s && pg_mblen_cstr(s) == 1)
                                {
                                        addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
                                        Conf->usecompound = true;
@@ -1499,7 +1493,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                        flagflags = 0;
 
                        while (*s && isspace((unsigned char) *s))
-                               s += pg_mblen(s);
+                               s += pg_mblen_cstr(s);
 
                        if (*s == '*')
                        {
@@ -1520,12 +1514,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
                         * be followed by EOL, whitespace, or ':'.  Otherwise this is a
                         * new-format flag command.
                         */
-                       if (*s && pg_mblen(s) == 1)
+                       if (*s && pg_mblen_cstr(s) == 1)
                        {
-                               COPYCHAR(flag, s);
+                               flag[0] = *s++;
                                flag[1] = '\0';
 
-                               s++;
                                if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
                                        isspace((unsigned char) *s))
                                {
index 1e98f32195714110e30c1059c8471488cd582001..df02ffb12fd3620cfd154f8d1ba906152900e211 100644 (file)
@@ -23,32 +23,40 @@ static void tsearch_readline_callback(void *arg);
 /* space for a single character plus a trailing NUL */
 #define WC_BUF_LEN  2
 
-int
-t_isalpha(const char *ptr)
-{
-       pg_wchar        wstr[WC_BUF_LEN];
-       int                     wlen pg_attribute_unused();
-
-       wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
-       Assert(wlen <= 1);
-
-       /* pass single character, or NUL if empty */
-       return pg_iswalpha(wstr[0], pg_database_locale());
-}
-
-int
-t_isalnum(const char *ptr)
-{
-       pg_wchar        wstr[WC_BUF_LEN];
-       int                     wlen pg_attribute_unused();
-
-       wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
-       Assert(wlen <= 1);
-
-       /* pass single character, or NUL if empty */
-       return pg_iswalnum(wstr[0], pg_database_locale());
+#define GENERATE_T_ISCLASS_DEF(character_class) \
+/* mblen shall be that of the first character */ \
+int \
+t_is##character_class##_with_len(const char *ptr, int mblen) \
+{ \
+       pg_wchar        wstr[WC_BUF_LEN]; \
+       int                     wlen pg_attribute_unused(); \
+       wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \
+       Assert(wlen <= 1); \
+       /* pass single character, or NUL if empty */ \
+       return pg_isw##character_class(wstr[0], pg_database_locale()); \
+} \
+\
+/* ptr shall point to a NUL-terminated string */ \
+int \
+t_is##character_class##_cstr(const char *ptr) \
+{ \
+       return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+} \
+/* ptr shall point to a string with pre-validated encoding */ \
+int \
+t_is##character_class##_unbounded(const char *ptr) \
+{ \
+       return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+} \
+/* historical name for _unbounded */ \
+int \
+t_is##character_class(const char *ptr) \
+{ \
+       return t_is##character_class##_unbounded(ptr); \
 }
 
+GENERATE_T_ISCLASS_DEF(alnum)
+GENERATE_T_ISCLASS_DEF(alpha)
 
 /*
  * Set up to read a file using tsearch_readline().  This facility is
index 9072d22423f43c37512edaa501030937f1c32aaa..52cf65533e4eee4a14412938d2f196c4650d4f72 100644 (file)
@@ -90,7 +90,7 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size
 
                        /* Trim trailing space */
                        while (*pbuf && !isspace((unsigned char) *pbuf))
-                               pbuf += pg_mblen(pbuf);
+                               pbuf += pg_mblen_cstr(pbuf);
                        *pbuf = '\0';
 
                        /* Skip empty lines */
index bfe8aa7fbce77bb2d50ee2db569f2c660489bd84..8b9b34e762a8cb03e093cb1cf80bfc64bbe55e2b 100644 (file)
@@ -1683,7 +1683,8 @@ TParserGet(TParser *prs)
                        prs->state->charlen = 0;
                else
                        prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
-                               pg_mblen(prs->str + prs->state->posbyte);
+                               pg_mblen_range(prs->str + prs->state->posbyte,
+                                                          prs->str + prs->lenstr);
 
                Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
                Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
index 3c7f54f263804ce978bf2d5f93ae23e995d98d2e..f5f835e944ab422b01f3520caf735c0c57428d39 100644 (file)
@@ -290,7 +290,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext)
                        ereturn(escontext, 0,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("invalid hexadecimal digit: \"%.*s\"",
-                                                       pg_mblen(s), s)));
+                                                       pg_mblen_range(s, srcend), s)));
                s++;
                if (s >= srcend)
                        ereturn(escontext, 0,
@@ -300,7 +300,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext)
                        ereturn(escontext, 0,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("invalid hexadecimal digit: \"%.*s\"",
-                                                       pg_mblen(s), s)));
+                                                       pg_mblen_range(s, srcend), s)));
                s++;
                *p++ = (v1 << 4) | v2;
        }
@@ -564,7 +564,7 @@ pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url)
                                ereport(ERROR,
                                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                 errmsg("invalid symbol \"%.*s\" found while decoding %s sequence",
-                                                               pg_mblen(s - 1), s - 1,
+                                                               pg_mblen_range(s - 1, srcend), s - 1,
                                                                url ? "base64url" : "base64")));
                        }
                }
index cf580c63c78c17137aa77a5912af3de38a3c03a0..7720911a6a9127bfcaccad651686db022f2fe2c8 100644 (file)
@@ -1438,7 +1438,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
                                                         errmsg("invalid datetime format separator: \"%s\"",
-                                                                       pnstrdup(str, pg_mblen(str)))));
+                                                                       pnstrdup(str, pg_mblen_cstr(str)))));
 
                                if (*str == ' ')
                                        n->type = NODE_TYPE_SPACE;
@@ -1468,7 +1468,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                        /* backslash quotes the next character, if any */
                                        if (*str == '\\' && *(str + 1))
                                                str++;
-                                       chlen = pg_mblen(str);
+                                       chlen = pg_mblen_cstr(str);
                                        n->type = NODE_TYPE_CHAR;
                                        memcpy(n->character, str, chlen);
                                        n->character[chlen] = '\0';
@@ -1486,7 +1486,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                 */
                                if (*str == '\\' && *(str + 1) == '"')
                                        str++;
-                               chlen = pg_mblen(str);
+                               chlen = pg_mblen_cstr(str);
 
                                if ((flags & DCH_FLAG) && is_separator_char(str))
                                        n->type = NODE_TYPE_SEPARATOR;
@@ -1992,8 +1992,8 @@ asc_toupper_z(const char *buff)
        do { \
                if (IS_SUFFIX_THth(_suf)) \
                { \
-                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
-                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
+                       if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
+                       if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
                } \
        } while (0)
 
@@ -3183,7 +3183,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                                 * insist that the consumed character match the format's
                                 * character.
                                 */
-                               s += pg_mblen(s);
+                               s += pg_mblen_cstr(s);
                        }
                        continue;
                }
@@ -3205,11 +3205,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
                                if (extra_skip > 0)
                                        extra_skip--;
                                else
-                                       s += pg_mblen(s);
+                                       s += pg_mblen_cstr(s);
                        }
                        else
                        {
-                               int                     chlen = pg_mblen(s);
+                               int                     chlen = pg_mblen_cstr(s);
 
                                /*
                                 * Standard mode requires strict match of format characters.
@@ -5724,13 +5724,15 @@ NUM_numpart_to_char(NUMProc *Np, int id)
 static void
 NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len)
 {
+       const char *end = Np->inout + input_len;
+
        while (n-- > 0)
        {
                if (OVERLOAD_TEST)
                        break;                          /* end of input */
                if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
                        break;                          /* it's a data character */
-               Np->inout_p += pg_mblen(Np->inout_p);
+               Np->inout_p += pg_mblen_range(Np->inout_p, end);
        }
 }
 
@@ -6167,7 +6169,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                        }
                        else
                        {
-                               Np->inout_p += pg_mblen(Np->inout_p);
+                               Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len);
                        }
                        continue;
                }
index 1e5b60801e4c2f9db2bbaa337df13797f2474eeb..d5b64d7fca5688caae496d05d41895d8e381e0d9 100644 (file)
@@ -695,7 +695,7 @@ report_json_context(JsonLexContext *lex)
        {
                /* Advance to next multibyte character */
                if (IS_HIGHBIT_SET(*context_start))
-                       context_start += pg_mblen(context_start);
+                       context_start += pg_mblen_range(context_start, context_end);
                else
                        context_start++;
        }
index 4543626ffc8903e6951d0cd15efdd6961f6a0b90..87070235d119d7a8c00ba2f9808107503cab9729 100644 (file)
@@ -599,7 +599,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("invalid input syntax for type %s", "jsonpath"),
                                                 errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.",
-                                                                  pg_mblen(flags->val + i), flags->val + i)));
+                                                                  pg_mblen_range(flags->val + i, flags->val + flags->len),
+                                                                  flags->val + i)));
                                break;
                }
        }
index fb2ba591acddab200715d4764ecb1c29436c9d3c..5b3d84029f6b94bf5ea288e6c7023649015175cd 100644 (file)
@@ -83,6 +83,8 @@ varstr_levenshtein(const char *source, int slen,
        int                *s_char_len = NULL;
        int                     j;
        const char *y;
+       const char *send = source + slen;
+       const char *tend = target + tlen;
 
        /*
         * For varstr_levenshtein_less_equal, we have real variables called
@@ -183,10 +185,10 @@ varstr_levenshtein(const char *source, int slen,
 #endif
 
        /*
-        * In order to avoid calling pg_mblen() repeatedly on each character in s,
-        * we cache all the lengths before starting the main loop -- but if all
-        * the characters in both strings are single byte, then we skip this and
-        * use a fast-path in the main loop.  If only one string contains
+        * In order to avoid calling pg_mblen_range() repeatedly on each character
+        * in s, we cache all the lengths before starting the main loop -- but if
+        * all the characters in both strings are single byte, then we skip this
+        * and use a fast-path in the main loop.  If only one string contains
         * multi-byte characters, we still build the array, so that the fast-path
         * needn't deal with the case where the array hasn't been initialized.
         */
@@ -198,7 +200,7 @@ varstr_levenshtein(const char *source, int slen,
                s_char_len = (int *) palloc((m + 1) * sizeof(int));
                for (i = 0; i < m; ++i)
                {
-                       s_char_len[i] = pg_mblen(cp);
+                       s_char_len[i] = pg_mblen_range(cp, send);
                        cp += s_char_len[i];
                }
                s_char_len[i] = 0;
@@ -224,7 +226,7 @@ varstr_levenshtein(const char *source, int slen,
        {
                int                *temp;
                const char *x = source;
-               int                     y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
+               int                     y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
                int                     i;
 
 #ifdef LEVENSHTEIN_LESS_EQUAL
index 2143d8658e8d8b3e71077cab6016dc72b7c027a1..350bc07f210e1ab90ca9ffad13c77f96cbf2981e 100644 (file)
@@ -55,20 +55,20 @@ static int  Generic_Text_IC_like(text *str, text *pat, Oid collation);
  *--------------------
  */
 static inline int
-wchareq(const char *p1, const char *p2)
+wchareq(const char *p1, int p1len, const char *p2, int p2len)
 {
-       int                     p1_len;
+       int                     p1clen;
 
        /* Optimization:  quickly compare the first byte. */
        if (*p1 != *p2)
                return 0;
 
-       p1_len = pg_mblen(p1);
-       if (pg_mblen(p2) != p1_len)
+       p1clen = pg_mblen_with_len(p1, p1len);
+       if (pg_mblen_with_len(p2, p2len) != p1clen)
                return 0;
 
        /* They are the same length */
-       while (p1_len--)
+       while (p1clen--)
        {
                if (*p1++ != *p2++)
                        return 0;
@@ -93,11 +93,11 @@ wchareq(const char *p1, const char *p2)
 #define NextByte(p, plen)      ((p)++, (plen)--)
 
 /* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq((p1), (p2))
+#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len))
 #define NextChar(p, plen) \
-       do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+       do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0)
 #define CopyAdvChar(dst, src, srclen) \
-       do { int __l = pg_mblen(src); \
+       do { int __l = pg_mblen_with_len((src), (srclen)); \
                 (srclen) -= __l; \
                 while (__l-- > 0) \
                         *(dst)++ = *(src)++; \
@@ -109,7 +109,7 @@ wchareq(const char *p1, const char *p2)
 #include "like_match.c"
 
 /* Set up to compile like_match.c for single-byte characters */
-#define CHAREQ(p1, p2) (*(p1) == *(p2))
+#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2))
 #define NextChar(p, plen) NextByte((p), (plen))
 #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
 
index 02990ca9a1bcb52081a3673f3e3eebb7fe350774..f5f72b82e215201d1e2bdd2aaee083365ede5c26 100644 (file)
@@ -442,6 +442,7 @@ do_like_escape(text *pat, text *esc)
                                         errhint("Escape string must be empty or one character.")));
 
                e = VARDATA_ANY(esc);
+               elen = VARSIZE_ANY_EXHDR(esc);
 
                /*
                 * If specified escape is '\', just copy the pattern as-is.
@@ -460,7 +461,7 @@ do_like_escape(text *pat, text *esc)
                afterescape = false;
                while (plen > 0)
                {
-                       if (CHAREQ(p, e) && !afterescape)
+                       if (CHAREQ(p, plen, e, elen) && !afterescape)
                        {
                                *r++ = '\\';
                                NextChar(p, plen);
index a003f90066ce8f972f10f303ade1d325c9f5ec7d..5b0d098bd07d502043a61c17a435a2cd6b3fbd40 100644 (file)
@@ -169,8 +169,8 @@ lpad(PG_FUNCTION_ARGS)
        char       *ptr1,
                           *ptr2,
                           *ptr2start,
-                          *ptr2end,
                           *ptr_ret;
+       const char *ptr2end;
        int                     m,
                                s1len,
                                s2len;
@@ -215,7 +215,7 @@ lpad(PG_FUNCTION_ARGS)
 
        while (m--)
        {
-               int                     mlen = pg_mblen(ptr2);
+               int                     mlen = pg_mblen_range(ptr2, ptr2end);
 
                memcpy(ptr_ret, ptr2, mlen);
                ptr_ret += mlen;
@@ -228,7 +228,7 @@ lpad(PG_FUNCTION_ARGS)
 
        while (s1len--)
        {
-               int                     mlen = pg_mblen(ptr1);
+               int                     mlen = pg_mblen_unbounded(ptr1);
 
                memcpy(ptr_ret, ptr1, mlen);
                ptr_ret += mlen;
@@ -267,8 +267,8 @@ rpad(PG_FUNCTION_ARGS)
        char       *ptr1,
                           *ptr2,
                           *ptr2start,
-                          *ptr2end,
                           *ptr_ret;
+       const char *ptr2end;
        int                     m,
                                s1len,
                                s2len;
@@ -308,11 +308,12 @@ rpad(PG_FUNCTION_ARGS)
        m = len - s1len;
 
        ptr1 = VARDATA_ANY(string1);
+
        ptr_ret = VARDATA(ret);
 
        while (s1len--)
        {
-               int                     mlen = pg_mblen(ptr1);
+               int                     mlen = pg_mblen_unbounded(ptr1);
 
                memcpy(ptr_ret, ptr1, mlen);
                ptr_ret += mlen;
@@ -324,7 +325,7 @@ rpad(PG_FUNCTION_ARGS)
 
        while (m--)
        {
-               int                     mlen = pg_mblen(ptr2);
+               int                     mlen = pg_mblen_range(ptr2, ptr2end);
 
                memcpy(ptr_ret, ptr2, mlen);
                ptr_ret += mlen;
@@ -409,6 +410,7 @@ dotrim(const char *string, int stringlen,
                         */
                        const char **stringchars;
                        const char **setchars;
+                       const char *setend;
                        int                *stringmblen;
                        int                *setmblen;
                        int                     stringnchars;
@@ -416,6 +418,7 @@ dotrim(const char *string, int stringlen,
                        int                     resultndx;
                        int                     resultnchars;
                        const char *p;
+                       const char *pend;
                        int                     len;
                        int                     mblen;
                        const char *str_pos;
@@ -426,10 +429,11 @@ dotrim(const char *string, int stringlen,
                        stringnchars = 0;
                        p = string;
                        len = stringlen;
+                       pend = p + len;
                        while (len > 0)
                        {
                                stringchars[stringnchars] = p;
-                               stringmblen[stringnchars] = mblen = pg_mblen(p);
+                               stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend);
                                stringnchars++;
                                p += mblen;
                                len -= mblen;
@@ -440,10 +444,11 @@ dotrim(const char *string, int stringlen,
                        setnchars = 0;
                        p = set;
                        len = setlen;
+                       setend = set + setlen;
                        while (len > 0)
                        {
                                setchars[setnchars] = p;
-                               setmblen[setnchars] = mblen = pg_mblen(p);
+                               setmblen[setnchars] = mblen = pg_mblen_range(p, setend);
                                setnchars++;
                                p += mblen;
                                len -= mblen;
@@ -821,6 +826,8 @@ translate(PG_FUNCTION_ARGS)
                           *to_end;
        char       *source,
                           *target;
+       const char *source_end;
+       const char *from_end;
        int                     m,
                                fromlen,
                                tolen,
@@ -835,9 +842,11 @@ translate(PG_FUNCTION_ARGS)
        if (m <= 0)
                PG_RETURN_TEXT_P(string);
        source = VARDATA_ANY(string);
+       source_end = source + m;
 
        fromlen = VARSIZE_ANY_EXHDR(from);
        from_ptr = VARDATA_ANY(from);
+       from_end = from_ptr + fromlen;
        tolen = VARSIZE_ANY_EXHDR(to);
        to_ptr = VARDATA_ANY(to);
        to_end = to_ptr + tolen;
@@ -861,12 +870,12 @@ translate(PG_FUNCTION_ARGS)
 
        while (m > 0)
        {
-               source_len = pg_mblen(source);
+               source_len = pg_mblen_range(source, source_end);
                from_index = 0;
 
                for (i = 0; i < fromlen; i += len)
                {
-                       len = pg_mblen(&from_ptr[i]);
+                       len = pg_mblen_range(&from_ptr[i], from_end);
                        if (len == source_len &&
                                memcmp(source, &from_ptr[i], len) == 0)
                                break;
@@ -882,11 +891,11 @@ translate(PG_FUNCTION_ARGS)
                        {
                                if (p >= to_end)
                                        break;
-                               p += pg_mblen(p);
+                               p += pg_mblen_range(p, to_end);
                        }
                        if (p < to_end)
                        {
-                               len = pg_mblen(p);
+                               len = pg_mblen_range(p, to_end);
                                memcpy(target, p, len);
                                target += len;
                                retlen += len;
index 94cd15bbab10d819f6e2350732ee80de5428c78b..311b9877bbb96040c2afe5128c089684981808df 100644 (file)
@@ -443,7 +443,7 @@ parse_re_flags(pg_re_flags *flags, text *opts)
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                         errmsg("invalid regular expression option: \"%.*s\"",
-                                                                       pg_mblen(opt_p + i), opt_p + i)));
+                                                                       pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
                                        break;
                        }
                }
@@ -673,12 +673,13 @@ textregexreplace(PG_FUNCTION_ARGS)
        if (VARSIZE_ANY_EXHDR(opt) > 0)
        {
                char       *opt_p = VARDATA_ANY(opt);
+               const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt);
 
                if (*opt_p >= '0' && *opt_p <= '9')
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("invalid regular expression option: \"%.*s\"",
-                                                       pg_mblen(opt_p), opt_p),
+                                                       pg_mblen_range(opt_p, end_p), opt_p),
                                         errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
        }
 
@@ -772,6 +773,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
                           *r;
        int                     plen,
                                elen;
+       const char *pend;
        bool            afterescape = false;
        int                     nquotes = 0;
        int                     bracket_depth = 0;      /* square bracket nesting level */
@@ -779,6 +781,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 
        p = VARDATA_ANY(pat_text);
        plen = VARSIZE_ANY_EXHDR(pat_text);
+       pend = p + plen;
        if (esc_text == NULL)
        {
                /* No ESCAPE clause provided; default to backslash as escape */
@@ -878,7 +881,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 
                if (elen > 1)
                {
-                       int                     mblen = pg_mblen(p);
+                       int                     mblen = pg_mblen_range(p, pend);
 
                        if (mblen > 1)
                        {
index e3bf1fbbfd76b0e529eecaaba665544afb7e0deb..7e54f36c2a7b4aefd693e1fd3231f1681dfde1c0 100644 (file)
@@ -120,7 +120,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
                return buf;
 
        buf++;
-       while (*buf && pg_mblen(buf) == 1)
+       while (*buf && pg_mblen_cstr(buf) == 1)
        {
                switch (*buf)
                {
@@ -259,12 +259,12 @@ parse_or_operator(TSQueryParserState pstate)
                return false;
 
        /* it shouldn't be a part of any word */
-       if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr))
+       if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr))
                return false;
 
        for (;;)
        {
-               ptr += pg_mblen(ptr);
+               ptr += pg_mblen_cstr(ptr);
 
                if (*ptr == '\0')               /* got end of string without operand */
                        return false;
@@ -390,7 +390,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
                                break;
                }
 
-               state->buf += pg_mblen(state->buf);
+               state->buf += pg_mblen_cstr(state->buf);
        }
 }
 
@@ -502,7 +502,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
                                break;
                }
 
-               state->buf += pg_mblen(state->buf);
+               state->buf += pg_mblen_cstr(state->buf);
        }
 }
 
@@ -1014,9 +1014,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp)
                                *(in->cur) = '\\';
                                in->cur++;
                        }
-                       COPYCHAR(in->cur, op);
 
-                       clen = pg_mblen(op);
+                       clen = ts_copychar_cstr(in->cur, op);
                        op += clen;
                        in->cur += clen;
                }
index 38342298a5d79a87e3dbc13f4e773a15ad2ad778..024f5160cd4ebe33036943649d55332c18e518ed 100644 (file)
@@ -319,9 +319,9 @@ tsvectorout(PG_FUNCTION_ARGS)
                                lenbuf = 0,
                                pp;
        WordEntry  *ptr = ARRPTR(out);
-       char       *curbegin,
-                          *curin,
+       char       *curin,
                           *curout;
+       const char *curend;
 
        lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
        for (i = 0; i < out->size; i++)
@@ -334,13 +334,14 @@ tsvectorout(PG_FUNCTION_ARGS)
        curout = outbuf = (char *) palloc(lenbuf);
        for (i = 0; i < out->size; i++)
        {
-               curbegin = curin = STRPTR(out) + ptr->pos;
+               curin = STRPTR(out) + ptr->pos;
+               curend = curin + ptr->len;
                if (i != 0)
                        *curout++ = ' ';
                *curout++ = '\'';
-               while (curin - curbegin < ptr->len)
+               while (curin < curend)
                {
-                       int                     len = pg_mblen(curin);
+                       int                     len = pg_mblen_range(curin, curend);
 
                        if (t_iseq(curin, '\''))
                                *curout++ = '\'';
index 94e0fed8309ec7e25f0ed40f4ed887e9871de5ef..71c7c7d3b3cee794e83cc1b547f7fee5478dc7f7 100644 (file)
@@ -2604,11 +2604,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
        if (ws)
        {
                char       *buf;
+               const char *end;
 
                buf = VARDATA_ANY(ws);
-               while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+               end = buf + VARSIZE_ANY_EXHDR(ws);
+               while (buf < end)
                {
-                       if (pg_mblen(buf) == 1)
+                       int                     len = pg_mblen_range(buf, end);
+
+                       if (len == 1)
                        {
                                switch (*buf)
                                {
@@ -2632,7 +2636,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
                                                stat->weight |= 0;
                                }
                        }
-                       buf += pg_mblen(buf);
+                       buf += len;
                }
        }
 
index b3c04f6344f17c127938905bd517ca7290ef1c57..efeaeb553342383ce6f8fdaa5c61766df4b75661 100644 (file)
@@ -208,8 +208,7 @@ gettoken_tsvector(TSVectorParseState state,
                                PRSSYNTAXERROR;
                        else if (!isspace((unsigned char) *state->prsbuf))
                        {
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                statecode = WAITENDWORD;
                        }
                }
@@ -223,8 +222,7 @@ gettoken_tsvector(TSVectorParseState state,
                        else
                        {
                                RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                Assert(oldstate != 0);
                                statecode = oldstate;
                        }
@@ -259,8 +257,7 @@ gettoken_tsvector(TSVectorParseState state,
                        else
                        {
                                RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                        }
                }
                else if (statecode == WAITENDCMPLX)
@@ -279,8 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
                        else
                        {
                                RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                        }
                }
                else if (statecode == WAITCHARCMPLX)
@@ -288,8 +284,7 @@ gettoken_tsvector(TSVectorParseState state,
                        if (!state->is_web && t_iseq(state->prsbuf, '\''))
                        {
                                RESIZEPRSBUF;
-                               COPYCHAR(curpos, state->prsbuf);
-                               curpos += pg_mblen(state->prsbuf);
+                               curpos += ts_copychar_cstr(curpos, state->prsbuf);
                                statecode = WAITENDCMPLX;
                        }
                        else
@@ -300,7 +295,7 @@ gettoken_tsvector(TSVectorParseState state,
                                        PRSSYNTAXERROR;
                                if (state->oprisdelim)
                                {
-                                       /* state->prsbuf+=pg_mblen(state->prsbuf); */
+                                       /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
                                        RETURN_TOKEN;
                                }
                                else
@@ -383,6 +378,6 @@ gettoken_tsvector(TSVectorParseState state,
                                 statecode);
 
                /* get next char */
-               state->prsbuf += pg_mblen(state->prsbuf);
+               state->prsbuf += pg_mblen_cstr(state->prsbuf);
        }
 }
index 50ffee679b9e6010e2fd1220b45c082397574cac..65ad1bfe18f660a2c6d7eb013a5f440dcd46bc2a 100644 (file)
@@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS)
                                ereturn(escontext, (Datum) 0,
                                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                 errmsg("\"%.*s\" is not a valid binary digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
 
                        x >>= 1;
                        if (x == 0)
@@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS)
                                ereturn(escontext, (Datum) 0,
                                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                 errmsg("\"%.*s\" is not a valid hexadecimal digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
 
                        if (bc)
                        {
@@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS)
                                ereturn(escontext, (Datum) 0,
                                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                 errmsg("\"%.*s\" is not a valid binary digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
 
                        x >>= 1;
                        if (x == 0)
@@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS)
                                ereturn(escontext, (Datum) 0,
                                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                                 errmsg("\"%.*s\" is not a valid hexadecimal digit",
-                                                               pg_mblen(sp), sp)));
+                                                               pg_mblen_cstr(sp), sp)));
 
                        if (bc)
                        {
index 552ac0c61d308a15032431fcbfb2235094179efc..6bb14620a638b4d68ed63efb71fa8f13100cb72d 100644 (file)
@@ -494,8 +494,11 @@ text_catenate(text *t1, text *t2)
  * charlen_to_bytelen()
  *     Compute the number of bytes occupied by n characters starting at *p
  *
- * It is caller's responsibility that there actually are n characters;
- * the string need not be null-terminated.
+ * The caller shall ensure there are n complete characters.  Callers achieve
+ * this by deriving "n" from regmatch_t findings from searching a wchar array.
+ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
+ * matches will end no later than the last complete character.  (The string
+ * need not be null-terminated.)
  */
 static int
 charlen_to_bytelen(const char *p, int n)
@@ -510,7 +513,7 @@ charlen_to_bytelen(const char *p, int n)
                const char *s;
 
                for (s = p; n > 0; n--)
-                       s += pg_mblen(s);
+                       s += pg_mblen_unbounded(s); /* caller verified encoding */
 
                return s - p;
        }
@@ -644,6 +647,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                int32           slice_start;
                int32           slice_size;
                int32           slice_strlen;
+               int32           slice_len;
                text       *slice;
                int32           E1;
                int32           i;
@@ -713,7 +717,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                        slice = (text *) DatumGetPointer(str);
 
                /* see if we got back an empty string */
-               if (VARSIZE_ANY_EXHDR(slice) == 0)
+               slice_len = VARSIZE_ANY_EXHDR(slice);
+               if (slice_len == 0)
                {
                        if (slice != (text *) DatumGetPointer(str))
                                pfree(slice);
@@ -722,7 +727,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 
                /* Now we can get the actual length of the slice in MB characters */
                slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
-                                                                                       VARSIZE_ANY_EXHDR(slice));
+                                                                                       slice_len);
 
                /*
                 * Check that the start position wasn't > slice_strlen. If so, SQL99
@@ -749,7 +754,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                 */
                p = VARDATA_ANY(slice);
                for (i = 0; i < S1 - 1; i++)
-                       p += pg_mblen(p);
+                       p += pg_mblen_unbounded(p);
 
                /* hang onto a pointer to our start position */
                s = p;
@@ -759,7 +764,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
                 * length.
                 */
                for (i = S1; i < E1; i++)
-                       p += pg_mblen(p);
+                       p += pg_mblen_unbounded(p);
 
                ret = (text *) palloc(VARHDRSZ + (p - s));
                SET_VARSIZE(ret, VARHDRSZ + (p - s));
@@ -1064,6 +1069,8 @@ retry:
         */
        if (state->is_multibyte_char_in_char && state->locale->deterministic)
        {
+               const char *haystack_end = state->str1 + state->len1;
+
                /* Walk one character at a time, until we reach the match. */
 
                /* the search should never move backwards. */
@@ -1072,7 +1079,7 @@ retry:
                while (state->refpoint < matchptr)
                {
                        /* step to next character. */
-                       state->refpoint += pg_mblen(state->refpoint);
+                       state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
                        state->refpos++;
 
                        /*
@@ -1160,7 +1167,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
                        test_end = hptr;
                        do
                        {
-                               test_end += pg_mblen(test_end);
+                               test_end += pg_mblen_range(test_end, haystack_end);
                                if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
                                {
                                        state->last_match_len_tmp = (test_end - hptr);
@@ -1173,7 +1180,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
                        if (result_hptr)
                                break;
 
-                       hptr += pg_mblen(hptr);
+                       hptr += pg_mblen_range(hptr, haystack_end);
                }
 
                return (char *) result_hptr;
@@ -3767,6 +3774,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
        }
        else
        {
+               const char *end_ptr;
+
                /*
                 * When fldsep is NULL, each character in the input string becomes a
                 * separate element in the result set.  The separator is effectively
@@ -3775,10 +3784,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
                inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
 
                start_ptr = VARDATA_ANY(inputstring);
+               end_ptr = start_ptr + inputstring_len;
 
                while (inputstring_len > 0)
                {
-                       int                     chunk_len = pg_mblen(start_ptr);
+                       int                     chunk_len = pg_mblen_range(start_ptr, end_ptr);
 
                        CHECK_FOR_INTERRUPTS();
 
@@ -4684,7 +4694,7 @@ text_reverse(PG_FUNCTION_ARGS)
                {
                        int                     sz;
 
-                       sz = pg_mblen(p);
+                       sz = pg_mblen_range(p, endp);
                        dst -= sz;
                        memcpy(dst, p, sz);
                        p += sz;
@@ -4845,7 +4855,7 @@ text_format(PG_FUNCTION_ARGS)
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("unrecognized format() type specifier \"%.*s\"",
-                                                       pg_mblen(cp), cp),
+                                                       pg_mblen_range(cp, end_ptr), cp),
                                         errhint("For a single \"%%\" use \"%%%%\".")));
 
                /* If indirect width was specified, get its value */
@@ -4966,7 +4976,7 @@ text_format(PG_FUNCTION_ARGS)
                                ereport(ERROR,
                                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                 errmsg("unrecognized format() type specifier \"%.*s\"",
-                                                               pg_mblen(cp), cp),
+                                                               pg_mblen_range(cp, end_ptr), cp),
                                                 errhint("For a single \"%%\" use \"%%%%\".")));
                                break;
                }
index f69dc68286c5250cc2dd6e60e694d97d4ec23011..fcb13e7c0a1c9d06c5d89571fc00687a1364cb92 100644 (file)
@@ -2376,8 +2376,7 @@ sqlchar_to_unicode(const char *s)
        char       *utf8string;
        pg_wchar        ret[2];                 /* need space for trailing zero */
 
-       /* note we're not assuming s is null-terminated */
-       utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8);
+       utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8);
 
        pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret,
                                                                  pg_encoding_mblen(PG_UTF8, utf8string));
@@ -2430,7 +2429,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
 
        initStringInfo(&buf);
 
-       for (p = ident; *p; p += pg_mblen(p))
+       for (p = ident; *p; p += pg_mblen_cstr(p))
        {
                if (*p == ':' && (p == ident || fully_escaped))
                        appendStringInfoString(&buf, "_x003A_");
@@ -2455,7 +2454,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
                                : !is_valid_xml_namechar(u))
                                appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
                        else
-                               appendBinaryStringInfo(&buf, p, pg_mblen(p));
+                               appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
                }
        }
 
@@ -2478,7 +2477,7 @@ map_xml_name_to_sql_identifier(const char *name)
 
        initStringInfo(&buf);
 
-       for (p = name; *p; p += pg_mblen(p))
+       for (p = name; *p; p += pg_mblen_cstr(p))
        {
                if (*p == '_' && *(p + 1) == 'x'
                        && isxdigit((unsigned char) *(p + 2))
@@ -2496,7 +2495,7 @@ map_xml_name_to_sql_identifier(const char *name)
                        p += 6;
                }
                else
-                       appendBinaryStringInfo(&buf, p, pg_mblen(p));
+                       appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
        }
 
        return buf.data;
index 6950e743d03dacd5d18f2eaf95ddcb968dc8279e..a5a734839aff29b797ce0ac2360258f756d18b6e 100644 (file)
@@ -38,6 +38,7 @@
 #include "catalog/namespace.h"
 #include "mb/pg_wchar.h"
 #include "utils/fmgrprotos.h"
+#include "utils/memdebug.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
 #include "varatt.h"
@@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src,
                                                                                                 int len, bool is_client_to_server);
 static int     cliplen(const char *str, int len, int limit);
 
+pg_noreturn
+static void report_invalid_encoding_int(int encoding, const char *mbstr,
+                                                                               int mblen, int len);
+
+pg_noreturn
+static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
+
 
 /*
  * Prepare for a future call to SetClientEncoding.  Success should mean
@@ -1021,11 +1029,126 @@ pg_encoding_wchar2mb_with_len(int encoding,
        return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
 }
 
-/* returns the byte length of a multibyte character */
+/*
+ * Returns the byte length of the multibyte character at the start of a
+ * null-terminated string.  Raises an illegal byte sequence error if a null
+ * terminator appears before the character's final byte.
+ *
+ * Callers are expected to test for *mbstr == 0 before calling.  Some callers
+ * nevertheless rely on getting 1 for a pointer to the terminator, so that
+ * case is tolerated rather than reported.
+ *
+ * This must only be used for null-terminated strings, since the terminator
+ * is what makes the bounds check possible.
+ */
+int
+pg_mblen_cstr(const char *mbstr)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       /*
+        * The .mblen functions return 1 for a pointer to a terminator, and some
+        * callers depend on that; hence only bytes after the first are scanned.
+        * Well-behaved callers check the leading byte *before* calling.
+        */
+       for (int i = 1; i < length; ++i)
+               if (unlikely(mbstr[i] == 0))
+                       report_invalid_encoding_db(mbstr, length, i);
+
+       /*
+        * String should be NUL-terminated, but checking that would make typical
+        * callers O(N^2), tripling Valgrind check-world time.  Unless
+        * VALGRIND_EXPENSIVE, check 1 byte after each actual character.  (If we
+        * found a character, not a terminator, the next byte must be a terminator
+        * or the start of the next character.)  If the caller iterates the whole
+        * string, the last call will diagnose a missing terminator.
+        */
+       if (mbstr[0] != '\0')
+       {
+#ifdef VALGRIND_EXPENSIVE
+               VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
+#else
+               VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
+#endif
+       }
+
+       return length;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * [mbstr, end) of at least one byte in size.  Raises an illegal byte sequence
+ * error if the sequence would exceed the range.
+ */
+int
+pg_mblen_range(const char *mbstr, const char *end)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       Assert(end > mbstr);
+#ifdef VALGRIND_EXPENSIVE
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
+#else
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+       if (unlikely(mbstr + length > end))
+               report_invalid_encoding_db(mbstr, length, end - mbstr);
+
+       return length;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * extending for 'limit' bytes, which must be at least one.  Raises an illegal
+ * byte sequence error if the sequence would exceed the range.
+ */
+int
+pg_mblen_with_len(const char *mbstr, int limit)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       Assert(limit >= 1);
+#ifdef VALGRIND_EXPENSIVE
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
+#else
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+       if (unlikely(length > limit))
+               report_invalid_encoding_db(mbstr, length, limit);
+
+       return length;
+}
+
+
+/*
+ * Returns the byte length of the multibyte character at mbstr, without any
+ * validation of bounds.
+ *
+ * PLEASE NOTE:  This function is only safe if the caller has already verified
+ * the input string, since otherwise there is a risk of overrunning the buffer
+ * if the string is invalid.  A prior call to a pg_mbstrlen* function
+ * suffices.
+ */
+int
+pg_mblen_unbounded(const char *mbstr)
+{
+       int                     length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+       VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+
+       return length;
+}
+
+/*
+ * Historical name for pg_mblen_unbounded().  Should not be used and will be
+ * removed in a later version.
+ */
 int
 pg_mblen(const char *mbstr)
 {
-       return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+       return pg_mblen_unbounded(mbstr);
 }
 
 /* returns the display length of a multibyte character */
@@ -1047,14 +1170,14 @@ pg_mbstrlen(const char *mbstr)
 
        while (*mbstr)
        {
-               mbstr += pg_mblen(mbstr);
+               mbstr += pg_mblen_cstr(mbstr);
                len++;
        }
        return len;
 }
 
 /* returns the length (counted in wchars) of a multibyte string
- * (not necessarily NULL terminated)
+ * (stops at the first of "limit" or a NUL)
  */
 int
 pg_mbstrlen_with_len(const char *mbstr, int limit)
@@ -1067,7 +1190,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit)
 
        while (limit > 0 && *mbstr)
        {
-               int                     l = pg_mblen(mbstr);
+               int                     l = pg_mblen_with_len(mbstr, limit);
 
                limit -= l;
                mbstr += l;
@@ -1137,7 +1260,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit)
 
        while (len > 0 && *mbstr)
        {
-               l = pg_mblen(mbstr);
+               l = pg_mblen_with_len(mbstr, len);
                nch++;
                if (nch > limit)
                        break;
@@ -1701,12 +1824,19 @@ void
 report_invalid_encoding(int encoding, const char *mbstr, int len)
 {
        int                     l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
+
+       report_invalid_encoding_int(encoding, mbstr, l, len);
+}
+
+static void
+report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
+{
        char            buf[8 * 5 + 1];
        char       *p = buf;
        int                     j,
                                jlimit;
 
-       jlimit = Min(l, len);
+       jlimit = Min(mblen, len);
        jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
 
        for (j = 0; j < jlimit; j++)
@@ -1723,6 +1853,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
                                        buf)));
 }
 
+static void
+report_invalid_encoding_db(const char *mbstr, int mblen, int len)
+{
+       report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len);
+}
+
 /*
  * report_untranslatable_char: complain about untranslatable character
  *
index a5b7b49e4b50e1707b7a40d9d6d844459b1c912d..e1655fe61d675092333cf78124b730869a5bd4e2 100644 (file)
@@ -695,7 +695,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2);
 extern int     pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
 extern int     pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
 extern size_t pg_wchar_strlen(const pg_wchar *str);
+extern int     pg_mblen_cstr(const char *mbstr);
+extern int     pg_mblen_range(const char *mbstr, const char *end);
+extern int     pg_mblen_with_len(const char *mbstr, int limit);
+extern int     pg_mblen_unbounded(const char *mbstr);
+
+/* deprecated */
 extern int     pg_mblen(const char *mbstr);
+
 extern int     pg_dsplen(const char *mbstr);
 extern int     pg_mbstrlen(const char *mbstr);
 extern int     pg_mbstrlen_with_len(const char *mbstr, int limit);
index cea417a91b5f110cf0ff81e71d76761c5a1a6e72..6e2d67ee4a538e1c8ed1d2d6da82eaab64838f02 100644 (file)
@@ -37,10 +37,34 @@ typedef struct
 /* The second argument of t_iseq() must be a plain ASCII character */
 #define t_iseq(x,c)            (TOUCHAR(x) == (unsigned char) (c))
 
-#define COPYCHAR(d,s)  memcpy(d, s, pg_mblen(s))
+/*
+ * Copy one multibyte character of known byte length.
+ *
+ * Copies exactly "length" bytes from src to dest with memcpy() and returns
+ * "length".  No terminator is written, and "length" is not validated here,
+ * so the caller must pass the character's real byte length and a dest with
+ * enough room.
+ */
+static inline int
+ts_copychar_with_len(void *dest, const void *src, int length)
+{
+       memcpy(dest, src, length);
+       return length;
+}
+
+/*
+ * Copy one multibyte character from a null-terminated string and return its
+ * byte length.
+ *
+ * The length is obtained from pg_mblen_cstr(src), which (per this commit's
+ * description) raises an "illegal byte sequence" error if the terminator
+ * appears in the middle of the character; the bytes are then copied by
+ * ts_copychar_with_len().
+ */
+static inline int
+ts_copychar_cstr(void *dest, const void *src)
+{
+       return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src));
+}
+
+/*
+ * Historical name for ts_copychar_cstr().  Kept for backward compatibility;
+ * deprecated and no longer used by core code.
+ */
+#define COPYCHAR ts_copychar_cstr
+
+#define GENERATE_T_ISCLASS_DECL(character_class) /* declares the four t_is<class> entry points */ \
+extern int     t_is##character_class##_with_len(const char *ptr, int len); \
+extern int     t_is##character_class##_cstr(const char *ptr); \
+extern int     t_is##character_class##_unbounded(const char *ptr); \
+\
+/* deprecated: kept for backward compatibility, no longer used by core code */ \
+extern int     t_is##character_class(const char *ptr);
 
-extern int     t_isalpha(const char *ptr);
-extern int     t_isalnum(const char *ptr);
+GENERATE_T_ISCLASS_DECL(alnum);
+GENERATE_T_ISCLASS_DECL(alpha);
 
 extern bool tsearch_readline_begin(tsearch_readline_state *stp,
                                                                   const char *filename);
index b0d1dbab6da29aeb598549cf1fdcafc8e13029a8..3eb0770f9c20a662ac9ac29f9e6f66fcc22740f8 100644 (file)
@@ -40,14 +40,12 @@ extern bool gettoken_tsvector(TSVectorParseState state,
 extern void close_tsvector_parser(TSVectorParseState state);
 
 /*
  * ISOPERATOR(x): true when the byte at x is one of '!', '&', '|', '(', ')'
  * or '<' (a phrase operator begins with '<').
  *
  * The replacement definition tests only the byte at x; the removed one also
  * required pg_mblen(x) == 1.  x is evaluated several times, so it must not
  * have side effects.
  */
-#define ISOPERATOR(x) \
-       ( pg_mblen(x) == 1 && ( *(x) == '!' ||  \
-                                                       *(x) == '&' ||  \
-                                                       *(x) == '|' ||  \
-                                                       *(x) == '(' ||  \
-                                                       *(x) == ')' ||  \
-                                                       *(x) == '<'             \
-                                                 ) )
+#define ISOPERATOR(x)          (*(x) == '!' || \
+                                                        *(x) == '&' || \
+                                                        *(x) == '|' || \
+                                                        *(x) == '(' || \
+                                                        *(x) == ')' || \
+                                                        *(x) == '<')
 
 /* parse_tsquery */
 
index 070464a341ef9c5cac002fa41e5b9620ed7e2a73..4e97cde65a6fc49ff5aa1429c99bbf6ce04b7c94 100644 (file)
@@ -411,7 +411,8 @@ parse_test_flags(test_re_flags *flags, text *opts)
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                         errmsg("invalid regular expression test option: \"%.*s\"",
-                                                                       pg_mblen(opt_p + i), opt_p + i)));
+                                                                       pg_mblen_range(opt_p + i, opt_p + opt_len),
+                                                                       opt_p + i)));
                                        break;
                        }
                }