From a3e619294133c74617c8b2f9d152fd5b2625f455 Mon Sep 17 00:00:00 2001 From: dan Date: Fri, 10 Feb 2023 17:17:04 +0000 Subject: [PATCH] Fix a problem with the fts5 trigram tokenizer and LIKE or GLOB patterns for which contain runs of 2 or fewer non-wildcard characters that are 3 or more bytes when encoded as utf-8. FossilOrigin-Name: 00714b39b39c51519edbc0194f98c7275fecf96763a06fd95db6e1d81bb9f1f1 --- ext/fts5/fts5_expr.c | 16 +++++++++++++++- ext/fts5/test/fts5trigram.test | 18 ++++++++++++++++++ manifest | 16 ++++++++-------- manifest.uuid | 2 +- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c index 66bd304d42..e4072db7aa 100644 --- a/ext/fts5/fts5_expr.c +++ b/ext/fts5/fts5_expr.c @@ -289,6 +289,19 @@ int sqlite3Fts5ExprNew( return sParse.rc; } +/* +** Assuming that buffer z is at least nByte bytes in size and contains a +** valid utf-8 string, return the number of characters in the string. +*/ +static int fts5ExprCountChar(const char *z, int nByte){ + int nRet = 0; + int ii; + for(ii=0; ii=3 ){ + + if( fts5ExprCountChar(&zText[iFirst], i-iFirst)>=3 ){ int jj; zExpr[iOut++] = '"'; for(jj=iFirst; jj