* endword points to the character after word
*/
static char *
-find_word(char *str, int lenstr, char **endword, int *charlen)
+find_word(char *str, int lenstr, char **endword)
{
char *beginword = str;
return NULL;
*endword = beginword;
- *charlen = 0;
while (*endword - str < lenstr && ISWORDCHR(*endword))
- {
*endword += pg_mblen(*endword);
- (*charlen)++;
- }
return beginword;
}
}
/*
- * Adds trigrams from words (already padded).
+ * Adds trigrams from the word in 'str' (already padded if necessary).
*/
static trgm *
-make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
+make_trigrams(trgm *tptr, char *str, int bytelen)
{
char *ptr = str;
- if (charlen < 3)
+ if (bytelen < 3)
return tptr;
- if (bytelen > charlen)
+ if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
{
- /* Find multibyte character boundaries and apply compact_trigram */
- int lenfirst = pg_mblen(str),
- lenmiddle = pg_mblen(str + lenfirst),
- lenlast = pg_mblen(str + lenfirst + lenmiddle);
-
- while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
+ while (ptr < str + bytelen - 2)
{
- compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
-
- ptr += lenfirst;
+ CPTRGM(tptr, ptr);
+ ptr++;
tptr++;
-
- lenfirst = lenmiddle;
- lenmiddle = lenlast;
- lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
}
}
else
{
- /* Fast path when there are no multibyte characters */
- Assert(bytelen == charlen);
+ int lenfirst,
+ lenmiddle,
+ lenlast;
+ char *endptr;
- while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
+ /*
+ * Fast path as long as there are no multibyte characters
+ */
+ if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
{
- CPTRGM(tptr, ptr);
- ptr++;
+ while (!IS_HIGHBIT_SET(ptr[2]))
+ {
+ CPTRGM(tptr, ptr);
+ ptr++;
+ tptr++;
+
+ if (ptr == str + bytelen - 2)
+ return tptr;
+ }
+
+ lenfirst = 1;
+ lenmiddle = 1;
+ lenlast = pg_mblen(ptr + 2);
+ }
+ else
+ {
+ lenfirst = pg_mblen(ptr);
+ if (ptr + lenfirst >= str + bytelen)
+ return tptr;
+ lenmiddle = pg_mblen(ptr + lenfirst);
+ if (ptr + lenfirst + lenmiddle >= str + bytelen)
+ return tptr;
+ lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+ }
+
+ /*
+ * Slow path to handle any remaining multibyte characters
+ *
+ * As we go, 'ptr' points to the beginning of the current
+ * three-character string and 'endptr' points to just past it.
+ */
+ endptr = ptr + lenfirst + lenmiddle + lenlast;
+ while (endptr <= str + bytelen)
+ {
+ compact_trigram(tptr, ptr, endptr - ptr);
tptr++;
+
+ /* Advance to the next character */
+ if (endptr == str + bytelen)
+ break;
+ ptr += lenfirst;
+ lenfirst = lenmiddle;
+ lenmiddle = lenlast;
+ lenlast = pg_mblen(endptr);
+ endptr += lenlast;
}
}
{
trgm *tptr;
char *buf;
- int charlen,
- bytelen;
+ int bytelen;
char *bword,
*eword;
}
eword = str;
- while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
+ while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
{
#ifdef IGNORECASE
bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
/* Calculate trigrams marking their bounds if needed */
if (bounds)
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
- tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
- charlen + LPADDING + RPADDING);
+ tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
if (bounds)
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
}
* str: source string, of length lenstr bytes (need not be null-terminated)
* buf: where to return the substring (must be long enough)
* *bytelen: receives byte length of the found substring
- * *charlen: receives character length of the found substring
*
* Returns pointer to end+1 of the found substring in the source string.
- * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ * Returns NULL if no word found (in which case buf and bytelen are not set)
*
* If the found word is bounded by non-word characters or string boundaries
* then this function will include corresponding padding spaces into buf.
*/
static const char *
get_wildcard_part(const char *str, int lenstr,
- char *buf, int *bytelen, int *charlen)
+ char *buf, int *bytelen)
{
const char *beginword = str;
const char *endword;
* Add left padding spaces if preceding character wasn't wildcard
* meta-character.
*/
- *charlen = 0;
if (!in_leading_wildcard_meta)
{
if (LPADDING > 0)
{
*s++ = ' ';
- (*charlen)++;
if (LPADDING > 1)
- {
*s++ = ' ';
- (*charlen)++;
- }
}
}
if (ISWORDCHR(endword))
{
memcpy(s, endword, clen);
- (*charlen)++;
s += clen;
}
else
else if (ISWORDCHR(endword))
{
memcpy(s, endword, clen);
- (*charlen)++;
s += clen;
}
else
if (RPADDING > 0)
{
*s++ = ' ';
- (*charlen)++;
if (RPADDING > 1)
- {
*s++ = ' ';
- (*charlen)++;
- }
}
}
*buf2;
trgm *tptr;
int len,
- charlen,
bytelen;
const char *eword;
*/
eword = str;
while ((eword = get_wildcard_part(eword, slen - (eword - str),
- buf, &bytelen, &charlen)) != NULL)
+ buf, &bytelen)) != NULL)
{
#ifdef IGNORECASE
buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
/*
* count trigrams
*/
- tptr = make_trigrams(tptr, buf2, bytelen, charlen);
+ tptr = make_trigrams(tptr, buf2, bytelen);
#ifdef IGNORECASE
pfree(buf2);