/*
* returns the common prefix length of a node key
+ *
+ * If the underlying type is character data, the prefix length may point in
+ * the middle of a multibyte character.
*/
static int32
gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
{
GBT_VARKEY_R r = gbt_var_key_readable(node);
int32 i = 0;
- int32 l = 0;
+ int32 l_left_to_match = 0;
+ int32 l_total = 0;
int32 t1len = VARSIZE(r.lower) - VARHDRSZ;
int32 t2len = VARSIZE(r.upper) - VARHDRSZ;
int32 ml = Min(t1len, t2len);
char *p1 = VARDATA(r.lower);
char *p2 = VARDATA(r.upper);
+ const char *end1 = p1 + t1len;
+ const char *end2 = p2 + t2len;
if (ml == 0)
return 0;
while (i < ml)
{
- if (tinfo->eml > 1 && l == 0)
+ if (tinfo->eml > 1 && l_left_to_match == 0)
{
- if ((l = pg_mblen(p1)) != pg_mblen(p2))
+ l_total = pg_mblen_range(p1, end1);
+ if (l_total != pg_mblen_range(p2, end2))
{
return i;
}
+ l_left_to_match = l_total;
}
if (*p1 != *p2)
{
if (tinfo->eml > 1)
{
- return (i - l + 1);
+ int32 l_matched_subset = l_total - l_left_to_match;
+
+ /* end common prefix at final byte of last matching char */
+ return i - l_matched_subset;
}
else
{
p1++;
p2++;
- l--;
+ l_left_to_match--;
i++;
}
return ml; /* lower == upper */
*end = NULL;
while (*in && isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
if (!*in || *in == '#')
return NULL;
start = in;
while (*in && !isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
*end = in;
errsave(state->escontext,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in hstore, near \"%.*s\" at position %d",
- pg_mblen(state->ptr), state->ptr,
+ pg_mblen_cstr(state->ptr), state->ptr,
(int) (state->ptr - state->begin))));
/* In soft error situation, return false as convenience for caller */
return false;
{
pg_crc32 crc;
const char *p = buf;
+ const char *end = buf + size;
static pg_locale_t locale = NULL;
if (!locale)
while (size > 0)
{
char foldstr[UNICODE_CASEMAP_BUFSZ];
- int srclen = pg_mblen(p);
+ int srclen = pg_mblen_range(p, end);
size_t foldlen;
/* fold one codepoint at a time */
char *ptr;
while (start < end && t_iseq(start, '_'))
- start += pg_mblen(start);
+ start += pg_mblen_range(start, end);
ptr = start;
if (ptr >= end)
return NULL;
while (ptr < end && !t_iseq(ptr, '_'))
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_range(ptr, end);
*len = ptr - start;
return start;
#define LQUERY_HASNOT 0x01
/* valid label chars are alphanumerics, underscores and hyphens */
-#define ISLABEL(x) ( t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') )
+#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') )
/* full text query */
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
if (t_iseq(ptr, '.'))
num++;
ptr += charlen;
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
switch (state)
{
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
if (t_iseq(ptr, '.'))
num++;
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
switch (state)
{
for (;;)
{
- charlen = pg_mblen(state->buf);
+ charlen = pg_mblen_cstr(state->buf);
switch (state->state)
{
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid character \"%.*s\" in t_bits string",
- pg_mblen(str + off), str + off)));
+ pg_mblen_cstr(str + off), str + off)));
if (off % 8 == 7)
bits[off / 8] = byte;
} while(0)
extern int (*CMPTRGM) (const void *a, const void *b);
-#define ISWORDCHR(c) (t_isalnum(c))
+#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len))
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
find_word(char *str, int lenstr, char **endword)
{
/*
 * Locate the first run of word characters (per ISWORDCHR) within the
 * lenstr bytes starting at str.  Returns a pointer to the first word
 * character, or NULL if the scan reaches the end of the range without
 * finding one.  On success *endword is set to the position just past the
 * last word character of that run (possibly the end of the range).
 *
 * Lines marked "-" are the previous form of each loop; lines marked "+"
 * are the replacement, which computes each character's byte length once
 * via pg_mblen_range() and hands that length to ISWORDCHR.
 */
char *beginword = str;
+ const char *endstr = str + lenstr;
/* endstr is one past the last byte of the input range */
- while (beginword - str < lenstr && !ISWORDCHR(beginword))
- beginword += pg_mblen(beginword);
/* Phase 1: step over leading non-word characters, one character at a time */
+ while (beginword < endstr)
+ {
+ int clen = pg_mblen_range(beginword, endstr);
- if (beginword - str >= lenstr)
+ if (ISWORDCHR(beginword, clen))
+ break;
+ beginword += clen;
+ }
+
/* Ran off the end of the range: there is no word to report */
+ if (beginword >= endstr)
return NULL;
/* Phase 2: extend *endword across consecutive word characters */
*endword = beginword;
- while (*endword - str < lenstr && ISWORDCHR(*endword))
- *endword += pg_mblen(*endword);
+ while (*endword < endstr)
+ {
+ int clen = pg_mblen_range(*endword, endstr);
+
+ if (!ISWORDCHR(*endword, clen))
+ break;
+ *endword += clen;
+ }
return beginword;
}
lenfirst = 1;
lenmiddle = 1;
- lenlast = pg_mblen(ptr + 2);
+ lenlast = pg_mblen_unbounded(ptr + 2);
}
else
{
- lenfirst = pg_mblen(ptr);
+ lenfirst = pg_mblen_unbounded(ptr);
if (ptr + lenfirst >= str + bytelen)
goto done;
- lenmiddle = pg_mblen(ptr + lenfirst);
+ lenmiddle = pg_mblen_unbounded(ptr + lenfirst);
if (ptr + lenfirst + lenmiddle >= str + bytelen)
goto done;
- lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+ lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
}
/*
ptr += lenfirst;
lenfirst = lenmiddle;
lenmiddle = lenlast;
- lenlast = pg_mblen(endptr);
+ lenlast = pg_mblen_unbounded(endptr);
endptr += lenlast;
}
}
{
const char *beginword = str;
const char *endword;
+ const char *endstr = str + lenstr;
char *s = buf;
bool in_leading_wildcard_meta = false;
bool in_trailing_wildcard_meta = false;
* from this loop to the next one, since we may exit at a word character
* that is in_escape.
*/
- while (beginword - str < lenstr)
+ while (beginword < endstr)
{
+ clen = pg_mblen_range(beginword, endstr);
+
if (in_escape)
{
- if (ISWORDCHR(beginword))
+ if (ISWORDCHR(beginword, clen))
break;
in_escape = false;
in_leading_wildcard_meta = false;
in_escape = true;
else if (ISWILDCARDCHAR(beginword))
in_leading_wildcard_meta = true;
- else if (ISWORDCHR(beginword))
+ else if (ISWORDCHR(beginword, clen))
break;
else
in_leading_wildcard_meta = false;
}
- beginword += pg_mblen(beginword);
+ beginword += clen;
}
/*
* string boundary. Strip escapes during copy.
*/
endword = beginword;
- while (endword - str < lenstr)
+ while (endword < endstr)
{
- clen = pg_mblen(endword);
+ clen = pg_mblen_range(endword, endstr);
if (in_escape)
{
- if (ISWORDCHR(endword))
+ if (ISWORDCHR(endword, clen))
{
memcpy(s, endword, clen);
s += clen;
in_trailing_wildcard_meta = true;
break;
}
- else if (ISWORDCHR(endword))
+ else if (ISWORDCHR(endword, clen))
{
memcpy(s, endword, clen);
s += clen;
static void RE_compile(regex_t *regex, text *text_re,
int cflags, Oid collation);
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static int convertPgWchar(pg_wchar c, trgm_mb_char *result);
static void transformGraph(TrgmNFA *trgmNFA);
static void processState(TrgmNFA *trgmNFA, TrgmState *state);
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
for (j = 0; j < charsCount; j++)
{
trgm_mb_char c;
+ int clen = convertPgWchar(chars[j], &c);
- if (!convertPgWchar(chars[j], &c))
+ if (!clen)
continue; /* ok to ignore it altogether */
- if (ISWORDCHR(c.bytes))
+ if (ISWORDCHR(c.bytes, clen))
colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
else
colorInfo->containsNonWord = true;
/*
* Convert pg_wchar to multibyte format.
- * Returns false if the character should be ignored completely.
+ * Returns 0 if the character should be ignored completely, else returns its
+ * byte length.
*/
-static bool
+static int
convertPgWchar(pg_wchar c, trgm_mb_char *result)
{
/* "s" has enough space for a multibyte character and a trailing NUL */
char s[MAX_MULTIBYTE_CHAR_LEN + 1];
+ int clen;
/*
* We can ignore the NUL character, since it can never appear in a PG text
* reconstructing trigrams.
*/
if (c == 0)
- return false;
+ return 0;
/* Do the conversion, making sure the result is NUL-terminated */
memset(s, 0, sizeof(s));
- pg_wchar2mb_with_len(&c, s, 1);
+ clen = pg_wchar2mb_with_len(&c, s, 1);
/*
* In IGNORECASE mode, we can ignore uppercase characters. We assume that
*/
#ifdef IGNORECASE
{
- char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
+ char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID);
if (strcmp(lowerCased, s) != 0)
{
pfree(lowerCased);
- return false;
+ return 0;
}
pfree(lowerCased);
}
/* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
- return true;
+ return clen;
}
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid character in salt string: \"%.*s\"",
- pg_mblen(ep), ep));
+ pg_mblen_cstr(ep), ep));
}
else
{
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
- ptrlen = pg_mblen(ptr);
+ ptrlen = pg_mblen_cstr(ptr);
/* ignore whitespace, but end src or trg */
if (isspace((unsigned char) *ptr))
{
char *srcchar = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
char *srcstart = srcchar;
+ const char *srcend = srcstart + len;
TSLexeme *res;
StringInfoData buf;
}
else
{
- matchlen = pg_mblen(srcchar);
+ matchlen = pg_mblen_range(srcchar, srcend);
if (buf.data != NULL)
appendBinaryStringInfo(&buf, srcchar, matchlen);
}
if (cursorpos > 0)
newcp++;
}
- chlen = pg_mblen(prosrc);
+ chlen = pg_mblen_cstr(prosrc);
if (strncmp(prosrc, literal, chlen) != 0)
goto fail;
prosrc += chlen;
/* Skip leading spaces */
while (*in && isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
/* Return NULL on empty lines */
if (*in == '\0')
while (*in && !isspace((unsigned char) *in))
{
lastchar = in;
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
}
if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
/* is it a comment? */
while (*ptr && isspace((unsigned char) *ptr))
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
if (t_iseq(ptr, '#') || *ptr == '\0' ||
t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
{
useasis = true;
state = TR_INSUBS;
- beginwrd = ptr + pg_mblen(ptr);
+ beginwrd = ptr + pg_mblen_cstr(ptr);
}
else if (t_iseq(ptr, '\\'))
{
useasis = false;
state = TR_INSUBS;
- beginwrd = ptr + pg_mblen(ptr);
+ beginwrd = ptr + pg_mblen_cstr(ptr);
}
else if (!isspace((unsigned char) *ptr))
{
else
elog(ERROR, "unrecognized thesaurus state: %d", state);
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
}
if (state == TR_INSUBS)
{
if (state == RS_IN_WAIT)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
/* okay */ ;
else if (t_iseq(c, '['))
state = RS_IN_ONEOF;
{
if (t_iseq(c, '^'))
state = RS_IN_NONEOF;
- else if (t_isalpha(c))
+ else if (t_isalpha_cstr(c))
state = RS_IN_ONEOF_IN;
else
return false;
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
/* okay */ ;
else if (t_iseq(c, ']'))
state = RS_IN_WAIT;
}
else
elog(ERROR, "internal error in RS_isRegis: state %d", state);
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
return (state == RS_IN_WAIT);
{
if (state == RS_IN_WAIT)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
{
if (ptr)
ptr = newRegisNode(ptr, len);
else
ptr = r->node = newRegisNode(NULL, len);
- COPYCHAR(ptr->data, c);
ptr->type = RSF_ONEOF;
- ptr->len = pg_mblen(c);
+ ptr->len = ts_copychar_cstr(ptr->data, c);
}
else if (t_iseq(c, '['))
{
ptr->type = RSF_NONEOF;
state = RS_IN_NONEOF;
}
- else if (t_isalpha(c))
+ else if (t_isalpha_cstr(c))
{
- COPYCHAR(ptr->data, c);
- ptr->len = pg_mblen(c);
+ ptr->len = ts_copychar_cstr(ptr->data, c);
state = RS_IN_ONEOF_IN;
}
else /* shouldn't get here */
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
- if (t_isalpha(c))
- {
- COPYCHAR(ptr->data + ptr->len, c);
- ptr->len += pg_mblen(c);
- }
+ if (t_isalpha_cstr(c))
+ ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c);
else if (t_iseq(c, ']'))
state = RS_IN_WAIT;
else /* shouldn't get here */
}
else
elog(ERROR, "internal error in RS_compile: state %d", state);
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
if (state != RS_IN_WAIT) /* shouldn't get here */
char *ptr = str;
bool res = false;
- clen = pg_mblen(c);
+ clen = pg_mblen_cstr(c);
while (*ptr && !res)
{
- plen = pg_mblen(ptr);
+ plen = pg_mblen_cstr(ptr);
if (plen == clen)
{
i = plen;
while (*c)
{
len++;
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
if (len < r->nchar)
{
len -= r->nchar;
while (len-- > 0)
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
elog(ERROR, "unrecognized regis node type: %d", ptr->type);
}
ptr = ptr->next;
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
return true;
{
if (t_iseq(str, c))
return str;
- str += pg_mblen(str);
+ str += pg_mblen_cstr(str);
}
return NULL;
{
if (t_iseq(str, c1) || t_iseq(str, c2))
return str;
- str += pg_mblen(str);
+ str += pg_mblen_cstr(str);
}
return NULL;
char *next;
const char *sbuf = *sflagset;
int maxstep;
+ int clen;
bool stop = false;
bool met_comma = false;
{
case FM_LONG:
case FM_CHAR:
- COPYCHAR(sflag, *sflagset);
- sflag += pg_mblen(*sflagset);
+ clen = ts_copychar_cstr(sflag, *sflagset);
+ sflag += clen;
/* Go to start of the next flag */
- *sflagset += pg_mblen(*sflagset);
+ *sflagset += clen;
/* Check if we get all characters of flag */
maxstep--;
*sflagset)));
}
- *sflagset += pg_mblen(*sflagset);
+ *sflagset += pg_mblen_cstr(*sflagset);
}
stop = true;
break;
while (*s)
{
/* we allow only single encoded flags for faster works */
- if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
+ if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
s++;
else
{
*s = '\0';
break;
}
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
pstr = lowerstr_ctx(Conf, line);
while (**str)
{
+ int clen = pg_mblen_cstr(*str);
+
if (state == PAE_WAIT_MASK)
{
if (t_iseq(*str, '#'))
return false;
else if (!isspace((unsigned char) **str))
{
- int clen = pg_mblen(*str);
-
if (clen < avail)
{
- COPYCHAR(next, *str);
+ ts_copychar_with_len(next, *str, clen);
next += clen;
avail -= clen;
}
}
else
{
- int clen = pg_mblen(*str);
-
if (clen < avail)
{
- COPYCHAR(next, *str);
+ ts_copychar_with_len(next, *str, clen);
next += clen;
avail -= clen;
}
}
}
- *str += pg_mblen(*str);
+ *str += clen;
}
*next = '\0';
while (*str)
{
+ int clen = pg_mblen_cstr(str);
+
if (state == PAE_WAIT_MASK)
{
if (t_iseq(str, '#'))
return false;
else if (!isspace((unsigned char) *str))
{
- COPYCHAR(pmask, str);
- pmask += pg_mblen(str);
+ pmask += ts_copychar_with_len(pmask, str, clen);
state = PAE_INMASK;
}
}
}
else if (!isspace((unsigned char) *str))
{
- COPYCHAR(pmask, str);
- pmask += pg_mblen(str);
+ pmask += ts_copychar_with_len(pmask, str, clen);
}
}
else if (state == PAE_WAIT_FIND)
{
state = PAE_INFIND;
}
- else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+ else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
state = PAE_INREPL;
}
else if (!isspace((unsigned char) *str))
*pfind = '\0';
state = PAE_WAIT_REPL;
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(pfind, str);
- pfind += pg_mblen(str);
+ pfind += ts_copychar_with_len(pfind, str, clen);
}
else if (!isspace((unsigned char) *str))
ereport(ERROR,
{
break; /* void repl */
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
state = PAE_INREPL;
}
else if (!isspace((unsigned char) *str))
*prepl = '\0';
break;
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
}
else if (!isspace((unsigned char) *str))
ereport(ERROR,
else
elog(ERROR, "unrecognized state in parse_affentry: %d", state);
- str += pg_mblen(str);
+ str += clen;
}
*pmask = *pfind = *prepl = '\0';
CompoundAffixFlag *newValue;
char sbuf[BUFSIZ];
char *sflag;
- int clen;
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (!*s)
ereport(ERROR,
sflag = sbuf;
while (*s && !isspace((unsigned char) *s) && *s != '\n')
{
- clen = pg_mblen(s);
- COPYCHAR(sflag, s);
+ int clen = ts_copychar_cstr(sflag, s);
+
sflag += clen;
s += clen;
}
char *s = recoded + strlen("FLAG");
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (*s)
{
if (s)
{
while (*s && !isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
- if (*s && pg_mblen(s) == 1)
+ if (*s && pg_mblen_cstr(s) == 1)
{
addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
Conf->usecompound = true;
flagflags = 0;
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (*s == '*')
{
* be followed by EOL, whitespace, or ':'. Otherwise this is a
* new-format flag command.
*/
- if (*s && pg_mblen(s) == 1)
+ if (*s && pg_mblen_cstr(s) == 1)
{
- COPYCHAR(flag, s);
+ flag[0] = *s++;
flag[1] = '\0';
- s++;
if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
isspace((unsigned char) *s))
{
/* space for a single character plus a trailing NUL */
#define WC_BUF_LEN 2
-int
-t_isalpha(const char *ptr)
-{
- pg_wchar wstr[WC_BUF_LEN];
- int wlen pg_attribute_unused();
-
- wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
- Assert(wlen <= 1);
-
- /* pass single character, or NUL if empty */
- return pg_iswalpha(wstr[0], pg_database_locale());
-}
-
-int
-t_isalnum(const char *ptr)
-{
- pg_wchar wstr[WC_BUF_LEN];
- int wlen pg_attribute_unused();
-
- wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
- Assert(wlen <= 1);
-
- /* pass single character, or NUL if empty */
- return pg_iswalnum(wstr[0], pg_database_locale());
/*
 * GENERATE_T_ISCLASS_DEF(character_class) expands to the definitions of four
 * functions for the given class name:
 *
 *   t_is<class>_with_len(ptr, mblen)  converts mblen bytes at ptr to pg_wchar
 *       with pg_mb2wchar_with_len() and returns the result of
 *       pg_isw<class>() on the first converted element, using
 *       pg_database_locale().
 *   t_is<class>_cstr(ptr)             same test, with the length taken from
 *       pg_mblen_cstr(ptr).
 *   t_is<class>_unbounded(ptr)        same test, with the length taken from
 *       pg_mblen_unbounded(ptr).
 *   t_is<class>(ptr)                  forwards to the _unbounded variant.
 *
 * The macro is instantiated below for "alnum" and "alpha"; the "-" lines
 * preceding it are the hand-written functions it replaces.
 */
+#define GENERATE_T_ISCLASS_DEF(character_class) \
+/* mblen shall be that of the first character */ \
+int \
+t_is##character_class##_with_len(const char *ptr, int mblen) \
+{ \
+ pg_wchar wstr[WC_BUF_LEN]; \
+ int wlen pg_attribute_unused(); \
+ wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \
+ Assert(wlen <= 1); \
+ /* pass single character, or NUL if empty */ \
+ return pg_isw##character_class(wstr[0], pg_database_locale()); \
+} \
+\
+/* ptr shall point to a NUL-terminated string */ \
+int \
+t_is##character_class##_cstr(const char *ptr) \
+{ \
+ return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+} \
+/* ptr shall point to a string with pre-validated encoding */ \
+int \
+t_is##character_class##_unbounded(const char *ptr) \
+{ \
+ return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+} \
+/* historical name for _unbounded */ \
+int \
+t_is##character_class(const char *ptr) \
+{ \
+ return t_is##character_class##_unbounded(ptr); \
}
/* Emit the alnum and alpha families of functions */
+GENERATE_T_ISCLASS_DEF(alnum)
+GENERATE_T_ISCLASS_DEF(alpha)
/*
* Set up to read a file using tsearch_readline(). This facility is
/* Trim trailing space */
while (*pbuf && !isspace((unsigned char) *pbuf))
- pbuf += pg_mblen(pbuf);
+ pbuf += pg_mblen_cstr(pbuf);
*pbuf = '\0';
/* Skip empty lines */
prs->state->charlen = 0;
else
prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
- pg_mblen(prs->str + prs->state->posbyte);
+ pg_mblen_range(prs->str + prs->state->posbyte,
+ prs->str + prs->lenstr);
Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
ereturn(escontext, 0,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid hexadecimal digit: \"%.*s\"",
- pg_mblen(s), s)));
+ pg_mblen_range(s, srcend), s)));
s++;
if (s >= srcend)
ereturn(escontext, 0,
ereturn(escontext, 0,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid hexadecimal digit: \"%.*s\"",
- pg_mblen(s), s)));
+ pg_mblen_range(s, srcend), s)));
s++;
*p++ = (v1 << 4) | v2;
}
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid symbol \"%.*s\" found while decoding %s sequence",
- pg_mblen(s - 1), s - 1,
+ pg_mblen_range(s - 1, srcend), s - 1,
url ? "base64url" : "base64")));
}
}
ereport(ERROR,
(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
errmsg("invalid datetime format separator: \"%s\"",
- pnstrdup(str, pg_mblen(str)))));
+ pnstrdup(str, pg_mblen_cstr(str)))));
if (*str == ' ')
n->type = NODE_TYPE_SPACE;
/* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1))
str++;
- chlen = pg_mblen(str);
+ chlen = pg_mblen_cstr(str);
n->type = NODE_TYPE_CHAR;
memcpy(n->character, str, chlen);
n->character[chlen] = '\0';
*/
if (*str == '\\' && *(str + 1) == '"')
str++;
- chlen = pg_mblen(str);
+ chlen = pg_mblen_cstr(str);
if ((flags & DCH_FLAG) && is_separator_char(str))
n->type = NODE_TYPE_SEPARATOR;
do { \
if (IS_SUFFIX_THth(_suf)) \
{ \
- if (*(ptr)) (ptr) += pg_mblen(ptr); \
- if (*(ptr)) (ptr) += pg_mblen(ptr); \
+ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
+ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
} \
} while (0)
* insist that the consumed character match the format's
* character.
*/
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
continue;
}
if (extra_skip > 0)
extra_skip--;
else
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
else
{
- int chlen = pg_mblen(s);
+ int chlen = pg_mblen_cstr(s);
/*
* Standard mode requires strict match of format characters.
/*
 * Advance Np->inout_p over at most n input characters, stopping early when
 * OVERLOAD_TEST reports end of input or when the current byte is one of the
 * numeric data characters "0123456789.,+-".
 *
 * input_len is used to compute the end of the buffer that starts at
 * Np->inout; the "+" lines pass that end pointer to pg_mblen_range() in
 * place of the previous pg_mblen() call (the "-" line).
 * NOTE(review): presumably input_len is the byte length of Np->inout --
 * confirm against the caller.
 */
static void
NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len)
{
+ const char *end = Np->inout + input_len;
+
while (n-- > 0)
{
if (OVERLOAD_TEST)
break; /* end of input */
if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
break; /* it's a data character */
/* skip one whole (possibly multibyte) character */
- Np->inout_p += pg_mblen(Np->inout_p);
+ Np->inout_p += pg_mblen_range(Np->inout_p, end);
}
}
}
else
{
- Np->inout_p += pg_mblen(Np->inout_p);
+ Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len);
}
continue;
}
{
/* Advance to next multibyte character */
if (IS_HIGHBIT_SET(*context_start))
- context_start += pg_mblen(context_start);
+ context_start += pg_mblen_range(context_start, context_end);
else
context_start++;
}
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("invalid input syntax for type %s", "jsonpath"),
errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.",
- pg_mblen(flags->val + i), flags->val + i)));
+ pg_mblen_range(flags->val + i, flags->val + flags->len),
+ flags->val + i)));
break;
}
}
int *s_char_len = NULL;
int j;
const char *y;
+ const char *send = source + slen;
+ const char *tend = target + tlen;
/*
* For varstr_levenshtein_less_equal, we have real variables called
#endif
/*
- * In order to avoid calling pg_mblen() repeatedly on each character in s,
- * we cache all the lengths before starting the main loop -- but if all
- * the characters in both strings are single byte, then we skip this and
- * use a fast-path in the main loop. If only one string contains
+ * In order to avoid calling pg_mblen_range() repeatedly on each character
+ * in s, we cache all the lengths before starting the main loop -- but if
+ * all the characters in both strings are single byte, then we skip this
+ * and use a fast-path in the main loop. If only one string contains
* multi-byte characters, we still build the array, so that the fast-path
* needn't deal with the case where the array hasn't been initialized.
*/
s_char_len = (int *) palloc((m + 1) * sizeof(int));
for (i = 0; i < m; ++i)
{
- s_char_len[i] = pg_mblen(cp);
+ s_char_len[i] = pg_mblen_range(cp, send);
cp += s_char_len[i];
}
s_char_len[i] = 0;
{
int *temp;
const char *x = source;
- int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
+ int y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
int i;
#ifdef LEVENSHTEIN_LESS_EQUAL
*--------------------
*/
static inline int
-wchareq(const char *p1, const char *p2)
+wchareq(const char *p1, int p1len, const char *p2, int p2len)
{
- int p1_len;
+ int p1clen;
/* Optimization: quickly compare the first byte. */
if (*p1 != *p2)
return 0;
- p1_len = pg_mblen(p1);
- if (pg_mblen(p2) != p1_len)
+ p1clen = pg_mblen_with_len(p1, p1len);
+ if (pg_mblen_with_len(p2, p2len) != p1clen)
return 0;
/* They are the same length */
- while (p1_len--)
+ while (p1clen--)
{
if (*p1++ != *p2++)
return 0;
#define NextByte(p, plen) ((p)++, (plen)--)
/* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq((p1), (p2))
+#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len))
#define NextChar(p, plen) \
- do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+ do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0)
#define CopyAdvChar(dst, src, srclen) \
- do { int __l = pg_mblen(src); \
+ do { int __l = pg_mblen_with_len((src), (srclen)); \
(srclen) -= __l; \
while (__l-- > 0) \
*(dst)++ = *(src)++; \
#include "like_match.c"
/* Set up to compile like_match.c for single-byte characters */
-#define CHAREQ(p1, p2) (*(p1) == *(p2))
+#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2))
#define NextChar(p, plen) NextByte((p), (plen))
#define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
errhint("Escape string must be empty or one character.")));
e = VARDATA_ANY(esc);
+ elen = VARSIZE_ANY_EXHDR(esc);
/*
* If specified escape is '\', just copy the pattern as-is.
afterescape = false;
while (plen > 0)
{
- if (CHAREQ(p, e) && !afterescape)
+ if (CHAREQ(p, plen, e, elen) && !afterescape)
{
*r++ = '\\';
NextChar(p, plen);
char *ptr1,
*ptr2,
*ptr2start,
- *ptr2end,
*ptr_ret;
+ const char *ptr2end;
int m,
s1len,
s2len;
while (m--)
{
- int mlen = pg_mblen(ptr2);
+ int mlen = pg_mblen_range(ptr2, ptr2end);
memcpy(ptr_ret, ptr2, mlen);
ptr_ret += mlen;
while (s1len--)
{
- int mlen = pg_mblen(ptr1);
+ int mlen = pg_mblen_unbounded(ptr1);
memcpy(ptr_ret, ptr1, mlen);
ptr_ret += mlen;
char *ptr1,
*ptr2,
*ptr2start,
- *ptr2end,
*ptr_ret;
+ const char *ptr2end;
int m,
s1len,
s2len;
m = len - s1len;
ptr1 = VARDATA_ANY(string1);
+
ptr_ret = VARDATA(ret);
while (s1len--)
{
- int mlen = pg_mblen(ptr1);
+ int mlen = pg_mblen_unbounded(ptr1);
memcpy(ptr_ret, ptr1, mlen);
ptr_ret += mlen;
while (m--)
{
- int mlen = pg_mblen(ptr2);
+ int mlen = pg_mblen_range(ptr2, ptr2end);
memcpy(ptr_ret, ptr2, mlen);
ptr_ret += mlen;
*/
const char **stringchars;
const char **setchars;
+ const char *setend;
int *stringmblen;
int *setmblen;
int stringnchars;
int resultndx;
int resultnchars;
const char *p;
+ const char *pend;
int len;
int mblen;
const char *str_pos;
stringnchars = 0;
p = string;
len = stringlen;
+ pend = p + len;
while (len > 0)
{
stringchars[stringnchars] = p;
- stringmblen[stringnchars] = mblen = pg_mblen(p);
+ stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend);
stringnchars++;
p += mblen;
len -= mblen;
setnchars = 0;
p = set;
len = setlen;
+ setend = set + setlen;
while (len > 0)
{
setchars[setnchars] = p;
- setmblen[setnchars] = mblen = pg_mblen(p);
+ setmblen[setnchars] = mblen = pg_mblen_range(p, setend);
setnchars++;
p += mblen;
len -= mblen;
*to_end;
char *source,
*target;
+ const char *source_end;
+ const char *from_end;
int m,
fromlen,
tolen,
if (m <= 0)
PG_RETURN_TEXT_P(string);
source = VARDATA_ANY(string);
+ source_end = source + m;
fromlen = VARSIZE_ANY_EXHDR(from);
from_ptr = VARDATA_ANY(from);
+ from_end = from_ptr + fromlen;
tolen = VARSIZE_ANY_EXHDR(to);
to_ptr = VARDATA_ANY(to);
to_end = to_ptr + tolen;
while (m > 0)
{
- source_len = pg_mblen(source);
+ source_len = pg_mblen_range(source, source_end);
from_index = 0;
for (i = 0; i < fromlen; i += len)
{
- len = pg_mblen(&from_ptr[i]);
+ len = pg_mblen_range(&from_ptr[i], from_end);
if (len == source_len &&
memcmp(source, &from_ptr[i], len) == 0)
break;
{
if (p >= to_end)
break;
- p += pg_mblen(p);
+ p += pg_mblen_range(p, to_end);
}
if (p < to_end)
{
- len = pg_mblen(p);
+ len = pg_mblen_range(p, to_end);
memcpy(target, p, len);
target += len;
retlen += len;
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression option: \"%.*s\"",
- pg_mblen(opt_p + i), opt_p + i)));
+ pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
break;
}
}
if (VARSIZE_ANY_EXHDR(opt) > 0)
{
char *opt_p = VARDATA_ANY(opt);
+ const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt);
if (*opt_p >= '0' && *opt_p <= '9')
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression option: \"%.*s\"",
- pg_mblen(opt_p), opt_p),
+ pg_mblen_range(opt_p, end_p), opt_p),
errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
}
*r;
int plen,
elen;
+ const char *pend;
bool afterescape = false;
int nquotes = 0;
int bracket_depth = 0; /* square bracket nesting level */
p = VARDATA_ANY(pat_text);
plen = VARSIZE_ANY_EXHDR(pat_text);
+ pend = p + plen;
if (esc_text == NULL)
{
/* No ESCAPE clause provided; default to backslash as escape */
if (elen > 1)
{
- int mblen = pg_mblen(p);
+ int mblen = pg_mblen_range(p, pend);
if (mblen > 1)
{
return buf;
buf++;
- while (*buf && pg_mblen(buf) == 1)
+ while (*buf && pg_mblen_cstr(buf) == 1)
{
switch (*buf)
{
return false;
/* it shouldn't be a part of any word */
- if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr))
+ if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr))
return false;
for (;;)
{
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
if (*ptr == '\0') /* got end of string without operand */
return false;
break;
}
- state->buf += pg_mblen(state->buf);
+ state->buf += pg_mblen_cstr(state->buf);
}
}
break;
}
- state->buf += pg_mblen(state->buf);
+ state->buf += pg_mblen_cstr(state->buf);
}
}
*(in->cur) = '\\';
in->cur++;
}
- COPYCHAR(in->cur, op);
- clen = pg_mblen(op);
+ clen = ts_copychar_cstr(in->cur, op);
op += clen;
in->cur += clen;
}
lenbuf = 0,
pp;
WordEntry *ptr = ARRPTR(out);
- char *curbegin,
- *curin,
+ char *curin,
*curout;
+ const char *curend;
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++)
curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++)
{
- curbegin = curin = STRPTR(out) + ptr->pos;
+ curin = STRPTR(out) + ptr->pos;
+ curend = curin + ptr->len;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
- while (curin - curbegin < ptr->len)
+ while (curin < curend)
{
- int len = pg_mblen(curin);
+ int len = pg_mblen_range(curin, curend);
if (t_iseq(curin, '\''))
*curout++ = '\'';
if (ws)
{
char *buf;
+ const char *end;
buf = VARDATA_ANY(ws);
- while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+ end = buf + VARSIZE_ANY_EXHDR(ws);
+ while (buf < end)
{
- if (pg_mblen(buf) == 1)
+ int len = pg_mblen_range(buf, end);
+
+ if (len == 1)
{
switch (*buf)
{
stat->weight |= 0;
}
}
- buf += pg_mblen(buf);
+ buf += len;
}
}
PRSSYNTAXERROR;
else if (!isspace((unsigned char) *state->prsbuf))
{
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
statecode = WAITENDWORD;
}
}
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
Assert(oldstate != 0);
statecode = oldstate;
}
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
}
}
else if (statecode == WAITENDCMPLX)
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
}
}
else if (statecode == WAITCHARCMPLX)
if (!state->is_web && t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
statecode = WAITENDCMPLX;
}
else
PRSSYNTAXERROR;
if (state->oprisdelim)
{
- /* state->prsbuf+=pg_mblen(state->prsbuf); */
+ /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
RETURN_TOKEN;
}
else
statecode);
/* get next char */
- state->prsbuf += pg_mblen(state->prsbuf);
+ state->prsbuf += pg_mblen_cstr(state->prsbuf);
}
}
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid binary digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
x >>= 1;
if (x == 0)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid hexadecimal digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
if (bc)
{
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid binary digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
x >>= 1;
if (x == 0)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid hexadecimal digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
if (bc)
{
* charlen_to_bytelen()
* Compute the number of bytes occupied by n characters starting at *p
*
- * It is caller's responsibility that there actually are n characters;
- * the string need not be null-terminated.
+ * The caller shall ensure there are n complete characters. Callers achieve
+ * this by deriving "n" from regmatch_t findings from searching a wchar array.
+ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
+ * matches will end no later than the last complete character. (The string
+ * need not be null-terminated.)
*/
static int
charlen_to_bytelen(const char *p, int n)
const char *s;
for (s = p; n > 0; n--)
- s += pg_mblen(s);
+ s += pg_mblen_unbounded(s); /* caller verified encoding */
return s - p;
}
int32 slice_start;
int32 slice_size;
int32 slice_strlen;
+ int32 slice_len;
text *slice;
int32 E1;
int32 i;
slice = (text *) DatumGetPointer(str);
/* see if we got back an empty string */
- if (VARSIZE_ANY_EXHDR(slice) == 0)
+ slice_len = VARSIZE_ANY_EXHDR(slice);
+ if (slice_len == 0)
{
if (slice != (text *) DatumGetPointer(str))
pfree(slice);
/* Now we can get the actual length of the slice in MB characters */
slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
- VARSIZE_ANY_EXHDR(slice));
+ slice_len);
/*
* Check that the start position wasn't > slice_strlen. If so, SQL99
*/
p = VARDATA_ANY(slice);
for (i = 0; i < S1 - 1; i++)
- p += pg_mblen(p);
+ p += pg_mblen_unbounded(p);
/* hang onto a pointer to our start position */
s = p;
* length.
*/
for (i = S1; i < E1; i++)
- p += pg_mblen(p);
+ p += pg_mblen_unbounded(p);
ret = (text *) palloc(VARHDRSZ + (p - s));
SET_VARSIZE(ret, VARHDRSZ + (p - s));
*/
if (state->is_multibyte_char_in_char && state->locale->deterministic)
{
+ const char *haystack_end = state->str1 + state->len1;
+
/* Walk one character at a time, until we reach the match. */
/* the search should never move backwards. */
while (state->refpoint < matchptr)
{
/* step to next character. */
- state->refpoint += pg_mblen(state->refpoint);
+ state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
state->refpos++;
/*
test_end = hptr;
do
{
- test_end += pg_mblen(test_end);
+ test_end += pg_mblen_range(test_end, haystack_end);
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
{
state->last_match_len_tmp = (test_end - hptr);
if (result_hptr)
break;
- hptr += pg_mblen(hptr);
+ hptr += pg_mblen_range(hptr, haystack_end);
}
return (char *) result_hptr;
}
else
{
+ const char *end_ptr;
+
/*
* When fldsep is NULL, each character in the input string becomes a
* separate element in the result set. The separator is effectively
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
start_ptr = VARDATA_ANY(inputstring);
+ end_ptr = start_ptr + inputstring_len;
while (inputstring_len > 0)
{
- int chunk_len = pg_mblen(start_ptr);
+ int chunk_len = pg_mblen_range(start_ptr, end_ptr);
CHECK_FOR_INTERRUPTS();
{
int sz;
- sz = pg_mblen(p);
+ sz = pg_mblen_range(p, endp);
dst -= sz;
memcpy(dst, p, sz);
p += sz;
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
- pg_mblen(cp), cp),
+ pg_mblen_range(cp, end_ptr), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
/* If indirect width was specified, get its value */
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
- pg_mblen(cp), cp),
+ pg_mblen_range(cp, end_ptr), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
break;
}
char *utf8string;
pg_wchar ret[2]; /* need space for trailing zero */
- /* note we're not assuming s is null-terminated */
- utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8);
+ utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8);
pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret,
pg_encoding_mblen(PG_UTF8, utf8string));
initStringInfo(&buf);
- for (p = ident; *p; p += pg_mblen(p))
+ for (p = ident; *p; p += pg_mblen_cstr(p))
{
if (*p == ':' && (p == ident || fully_escaped))
appendStringInfoString(&buf, "_x003A_");
: !is_valid_xml_namechar(u))
appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
else
- appendBinaryStringInfo(&buf, p, pg_mblen(p));
+ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
}
}
initStringInfo(&buf);
- for (p = name; *p; p += pg_mblen(p))
+ for (p = name; *p; p += pg_mblen_cstr(p))
{
if (*p == '_' && *(p + 1) == 'x'
&& isxdigit((unsigned char) *(p + 2))
p += 6;
}
else
- appendBinaryStringInfo(&buf, p, pg_mblen(p));
+ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
}
return buf.data;
#include "catalog/namespace.h"
#include "mb/pg_wchar.h"
#include "utils/fmgrprotos.h"
+#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "varatt.h"
int len, bool is_client_to_server);
static int cliplen(const char *str, int len, int limit);
+pg_noreturn
+static void report_invalid_encoding_int(int encoding, const char *mbstr,
+ int mblen, int len);
+
+pg_noreturn
+static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
+
/*
* Prepare for a future call to SetClientEncoding. Success should mean
return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
}
-/* returns the byte length of a multibyte character */
+/*
+ * pg_mblen_cstr
+ *		Byte length of the multibyte character starting at mbstr, where mbstr
+ *		points into a null-terminated string.
+ *
+ * An illegal byte sequence error is raised if a null terminator shows up
+ * inside the character, that is, before the byte count reported by the
+ * encoding's mblen routine has been used up.
+ *
+ * Callers are expected to test *mbstr for a terminator themselves before
+ * calling.  Some nevertheless want 1 back when pointing at the terminator,
+ * so that result is passed through unchanged.
+ *
+ * Only use this on strings that really are null-terminated; the terminator
+ * is what makes bounds detection possible here.
+ */
+int
+pg_mblen_cstr(const char *mbstr)
+{
+	const unsigned char *lead = (const unsigned char *) mbstr;
+	int			nbytes = pg_wchar_table[DatabaseEncoding->encoding].mblen(lead);
+	int			off;
+
+	/*
+	 * Given a pointer to a terminator, the .mblen functions answer 1, which
+	 * makes this scan a no-op.  Some callers depend on that, so it is
+	 * tolerated for now; well-behaved callers look at the leading byte
+	 * *before* calling.
+	 */
+	for (off = 1; off < nbytes; off++)
+	{
+		if (unlikely(mbstr[off] == '\0'))
+			report_invalid_encoding_db(mbstr, nbytes, off);
+	}
+
+	/*
+	 * The string ought to be NUL-terminated, but verifying that on every call
+	 * would make typical callers O(N^2), tripling Valgrind check-world time.
+	 * So, unless VALGRIND_EXPENSIVE is defined, only the single byte after
+	 * each real character is checked: having found a character rather than a
+	 * terminator, that byte must be either a terminator or the start of the
+	 * next character.  A caller that walks the entire string will have a
+	 * missing terminator diagnosed by its final call.
+	 */
+	if (*mbstr != '\0')
+	{
+#ifdef VALGRIND_EXPENSIVE
+		VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
+#else
+		VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + nbytes, 1);
+#endif
+	}
+
+	return nbytes;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * [mbstr, end) of at least one byte in size.  Raises an illegal byte sequence
+ * error if the sequence would exceed the range.
+ *
+ * mbstr: first byte of the character to measure.
+ * end:   one past the last readable byte; must be greater than mbstr.
+ *
+ * Returns the length reported by the database encoding's mblen routine; does
+ * not return if that length does not fit within the range.
+ */
+int
+pg_mblen_range(const char *mbstr, const char *end)
+{
+	int			length;
+
+	/* The range must contain at least the lead byte we are about to read. */
+	Assert(end > mbstr);
+
+	length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+#ifdef VALGRIND_EXPENSIVE
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
+#else
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+	/*
+	 * Compare byte counts rather than pointers.  Forming "mbstr + length"
+	 * when it lies beyond "end" would itself be out-of-range pointer
+	 * arithmetic, in precisely the case this test exists to catch.
+	 */
+	if (unlikely(length > end - mbstr))
+		report_invalid_encoding_db(mbstr, length, end - mbstr);
+
+	return length;
+}
+
+/*
+ * pg_mblen_with_len
+ *		Byte length of the multibyte character at mbstr, given that 'limit'
+ *		bytes (one or more) are available starting there.
+ *
+ * If the character would need more than 'limit' bytes, an illegal byte
+ * sequence error is raised instead of returning.
+ */
+int
+pg_mblen_with_len(const char *mbstr, int limit)
+{
+	const unsigned char *lead = (const unsigned char *) mbstr;
+	int			nbytes = pg_wchar_table[DatabaseEncoding->encoding].mblen(lead);
+
+	Assert(limit >= 1);
+
+#ifndef VALGRIND_EXPENSIVE
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, nbytes);
+#else
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
+#endif
+
+	/* Character runs past the bytes the caller vouched for. */
+	if (unlikely(limit < nbytes))
+		report_invalid_encoding_db(mbstr, nbytes, limit);
+
+	return nbytes;
+}
+
+
+/*
+ * pg_mblen_unbounded
+ *		Byte length of the multibyte character at mbstr, with no validation
+ *		of bounds at all.
+ *
+ * PLEASE NOTE: this is only safe when the caller has already verified the
+ * input string; on an invalid string the reported length can lead the caller
+ * past the end of the buffer.  Having called one of the pg_mbstrlen*
+ * functions on the string beforehand is sufficient.
+ */
+int
+pg_mblen_unbounded(const char *mbstr)
+{
+	const unsigned char *lead = (const unsigned char *) mbstr;
+	int			nbytes = pg_wchar_table[DatabaseEncoding->encoding].mblen(lead);
+
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, nbytes);
+	return nbytes;
+}
+
+/*
+ * Historical name for pg_mblen_unbounded(). Should not be used and will be
+ * removed in a later version.
+ *
+ * This simply forwards to pg_mblen_unbounded(), so it performs no bounds
+ * validation either.  Code that knows where its input ends can use
+ * pg_mblen_cstr(), pg_mblen_range() or pg_mblen_with_len() instead, each of
+ * which raises an error rather than reporting a length beyond the input.
+ */
int
pg_mblen(const char *mbstr)
{
- return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+ return pg_mblen_unbounded(mbstr);
}
/* returns the display length of a multibyte character */
while (*mbstr)
{
- mbstr += pg_mblen(mbstr);
+ mbstr += pg_mblen_cstr(mbstr);
len++;
}
return len;
}
/* returns the length (counted in wchars) of a multibyte string
- * (not necessarily NULL terminated)
+ * (stops at the first of "limit" or a NUL)
*/
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
while (limit > 0 && *mbstr)
{
- int l = pg_mblen(mbstr);
+ int l = pg_mblen_with_len(mbstr, limit);
limit -= l;
mbstr += l;
while (len > 0 && *mbstr)
{
- l = pg_mblen(mbstr);
+ l = pg_mblen_with_len(mbstr, len);
nch++;
if (nch > limit)
break;
report_invalid_encoding(int encoding, const char *mbstr, int len)
{
int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
+
+ report_invalid_encoding_int(encoding, mbstr, l, len);
+}
+
+static void
+report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
+{
char buf[8 * 5 + 1];
char *p = buf;
int j,
jlimit;
- jlimit = Min(l, len);
+ jlimit = Min(mblen, len);
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
for (j = 0; j < jlimit; j++)
buf)));
}
+/*
+ * report_invalid_encoding_db
+ *		Convenience form of report_invalid_encoding_int() that supplies the
+ *		value of GetDatabaseEncoding() as the encoding argument.
+ *
+ * mbstr, mblen and len are handed through untouched.
+ */
+static void
+report_invalid_encoding_db(const char *mbstr, int mblen, int len)
+{
+	int			db_encoding = GetDatabaseEncoding();
+
+	report_invalid_encoding_int(db_encoding, mbstr, mblen, len);
+}
+
/*
* report_untranslatable_char: complain about untranslatable character
*
extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
extern size_t pg_wchar_strlen(const pg_wchar *str);
+extern int pg_mblen_cstr(const char *mbstr);
+extern int pg_mblen_range(const char *mbstr, const char *end);
+extern int pg_mblen_with_len(const char *mbstr, int limit);
+extern int pg_mblen_unbounded(const char *mbstr);
+
+/* deprecated */
extern int pg_mblen(const char *mbstr);
+
extern int pg_dsplen(const char *mbstr);
extern int pg_mbstrlen(const char *mbstr);
extern int pg_mbstrlen_with_len(const char *mbstr, int limit);
/* The second argument of t_iseq() must be a plain ASCII character */
#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
-#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s))
+/*
+ * ts_copychar_with_len
+ *		Copy one multibyte character whose byte length the caller already
+ *		knows.  'length' bytes go from src to dest; 'length' is returned so
+ *		the caller can advance its pointers.
+ */
+static inline int
+ts_copychar_with_len(void *dest, const void *src, int length)
+{
+	(void) memcpy(dest, src, length);
+
+	return length;
+}
+
+/*
+ * ts_copychar_cstr
+ *		Copy one multibyte character out of a null-terminated string.  The
+ *		byte length comes from pg_mblen_cstr() and is returned to the caller.
+ */
+static inline int
+ts_copychar_cstr(void *dest, const void *src)
+{
+	int			nbytes = pg_mblen_cstr((const char *) src);
+
+	return ts_copychar_with_len(dest, src, nbytes);
+}
+
+/* Historical macro for the above. */
+#define COPYCHAR ts_copychar_cstr
+
+/*
+ * Declare the family of t_is<class>() test functions for one character
+ * class: the _with_len, _cstr and _unbounded variants plus the deprecated
+ * unsuffixed name.
+ *
+ * The expansion intentionally leaves off the final semicolon, so that the
+ * usual invocation "GENERATE_T_ISCLASS_DECL(name);" yields exactly one
+ * terminator instead of a stray empty declaration at file scope.
+ */
+#define GENERATE_T_ISCLASS_DECL(character_class) \
+extern int t_is##character_class##_with_len(const char *ptr, int len); \
+extern int t_is##character_class##_cstr(const char *ptr); \
+extern int t_is##character_class##_unbounded(const char *ptr); \
+\
+/* deprecated */ \
+extern int t_is##character_class(const char *ptr)
-extern int t_isalpha(const char *ptr);
-extern int t_isalnum(const char *ptr);
+GENERATE_T_ISCLASS_DECL(alnum);
+GENERATE_T_ISCLASS_DECL(alpha);
extern bool tsearch_readline_begin(tsearch_readline_state *stp,
const char *filename);
extern void close_tsvector_parser(TSVectorParseState state);
/* phrase operator begins with '<' */
-#define ISOPERATOR(x) \
- ( pg_mblen(x) == 1 && ( *(x) == '!' || \
- *(x) == '&' || \
- *(x) == '|' || \
- *(x) == '(' || \
- *(x) == ')' || \
- *(x) == '<' \
- ) )
+/*
+ * True when the byte at x is one of '!', '&', '|', '(', ')' or '<'.  Only
+ * that single byte is examined.  The argument is evaluated several times,
+ * so it must not have side effects.
+ */
+#define ISOPERATOR(x) (*(x) == '!' || \
+ *(x) == '&' || \
+ *(x) == '|' || \
+ *(x) == '(' || \
+ *(x) == ')' || \
+ *(x) == '<')
/* parse_tsquery */
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression test option: \"%.*s\"",
- pg_mblen(opt_p + i), opt_p + i)));
+ pg_mblen_range(opt_p + i, opt_p + opt_len),
+ opt_p + i)));
break;
}
}