From d8e48fffdff1ebfbf5e0b2ded9e12ce00b1ba427 Mon Sep 17 00:00:00 2001 From: drh <> Date: Thu, 17 Nov 2022 19:24:39 +0000 Subject: [PATCH] Fix corner cases in UTF8 handling in the REGEXP extension. [forum:/forumpost/3ffe058b04|Forum post 3ffe058b04]. FossilOrigin-Name: abb18f61c5cec0f524acc41453b4c06b61c5af51ff46417588837fc0c3967288 --- ext/misc/regexp.c | 8 ++++---- manifest | 14 +++++++------- manifest.uuid | 2 +- test/regexp1.test | 27 +++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/ext/misc/regexp.c b/ext/misc/regexp.c index d0c8ee5cfe..086ef564cb 100644 --- a/ext/misc/regexp.c +++ b/ext/misc/regexp.c @@ -185,7 +185,7 @@ static unsigned re_next_char(ReInput *p){ c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f); p->i += 2; if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd; - }else if( (c&0xf8)==0xf0 && p->i+3mx && (p->z[p->i]&0xc0)==0x80 + }else if( (c&0xf8)==0xf0 && p->i+2mx && (p->z[p->i]&0xc0)==0x80 && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){ c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6) | (p->z[p->i+2]&0x3f); @@ -712,15 +712,15 @@ static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ ** one or more matching characters, enter those matching characters into ** zInit[]. The re_match() routine can then search ahead in the input ** string looking for the initial match without having to run the whole - ** regex engine over the string. Do not worry able trying to match + ** regex engine over the string. Do not worry about trying to match ** unicode characters beyond plane 0 - those are very rare and this is ** just an optimization. */ if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){ for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ unsigned x = pRe->aArg[i]; - if( x<=127 ){ + if( x<=0x7f ){ pRe->zInit[j++] = (unsigned char)x; - }else if( x<=0xfff ){ + }else if( x<=0x7ff ){ pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); pRe->zInit[j++] = 0x80 | (x&0x3f); }else if( x<=0xffff ){ diff --git a/manifest b/manifest index 2432894ab6..00bc7b6ef8 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Use\sthe\slog10()\sand\slog2()\sfunctions\sfrom\sthe\sstandard\sC\slibrary\sto\simplement\nthe\sequivalent\sSQL\sfunctions,\sin\sthe\shope\sthat\sthis\swill\sprevent\sreported\nprecision\sproblems.\nSee\s[forum:/forumpost/cfceb1230bdcfd84|forum\spost\scfceb1230bdcfd84]\sand\sthe\nsurrounding\sthread. -D 2022-11-17T14:40:33.537 +C Fix\scorner\scases\sin\sUTF8\shandling\sin\sthe\sREGEXP\sextension.\n[forum:/forumpost/3ffe058b04|Forum\spost\s3ffe058b04]. +D 2022-11-17T19:24:39.375 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -316,7 +316,7 @@ F ext/misc/normalize.c bd84355c118e297522aba74de34a4fd286fc775524e0499b14473918d F ext/misc/percentile.c b9086e223d583bdaf8cb73c98a6539d501a2fc4282654adbfea576453d82e691 F ext/misc/prefixes.c 0f4f8cff5aebc00a7e3ac4021fd59cfe1a8e17c800ceaf592859ecb9cbc38196 F ext/misc/qpvtab.c 09738419e25f603a35c0ac8bd0a04daab794f48d08a9bc07a6085b9057b99009 -F ext/misc/regexp.c 5abed0ace2d9340b42b9ab1dbe64db9c276e4e8eba38a903232b6253e05ccdaf +F ext/misc/regexp.c 064838f7b31e90d312cce089cfafbb992034e75d359009c48886ca06c0a794b2 F ext/misc/remember.c add730f0f7e7436cd15ea3fd6a90fd83c3f706ab44169f7f048438b7d6baa69c F ext/misc/rot13.c 51ac5f51e9d5fd811db58a9c23c628ad5f333c173f1fc53c8491a3603d38556c F ext/misc/scrub.c 2a44b0d44c69584c0580ad2553f6290a307a49df4668941d2812135bfb96a946 @@ -1411,7 +1411,7 @@ F test/randexpr1.test eda062a97e60f9c38ae8d806b03b0ddf23d796df F test/rbu.test 168573d353cd0fd10196b87b0caa322c144ef736 F test/rdonly.test 64e2696c322e3538df0b1ed624e21f9a23ed9ff8 F test/recover.test fd5199f928757cb308661b5fdca1abc19398a798ff7f24b57c3071e9f8e0471e -F test/regexp1.test 83c631617357150f8054ca1d1fed40a552b0d0f8eb7a7f090c3be02cee9f9913 +F test/regexp1.test 8f2a8bc1569666e29a4cee6c1a666cd224eb6d50e2470d1dc1df995170f3e0f1 F test/regexp2.test 55ed41da802b0e284ac7e2fe944be3948f93ff25abbca0361a609acfed1368b5 F test/reindex.test cd9d6021729910ece82267b4f5e1b5ac2911a7566c43b43c176a6a4732e2118d F test/releasetest_data.tcl 0db8aee0c348090fd06da47020ab4ed8ec692e0723427b2f3947d4dfb806f3b0 @@ -2055,8 +2055,8 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 9048a766ff7dfa0cd91ea74092e462f4501cb3f719033ccb55700bf5e4dfd0d3 -R a034e1e03397012452b518f4075f2764 +P 7c572d02e60a83b36543ba4d9d45f61e9fc111b61fee085410c2d87558c732d6 +R f93830c4b318c6359968570a2d6be5a6 U drh -Z 49084f227abbaddd45e0c2d870d50ee7 +Z 502cd80eb5e32a4170e530e0ee906c79 # Remove this line to create a well-formed Fossil manifest. diff --git a/manifest.uuid b/manifest.uuid index d717acf28e..13c0efb61f 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -7c572d02e60a83b36543ba4d9d45f61e9fc111b61fee085410c2d87558c732d6 \ No newline at end of file +abb18f61c5cec0f524acc41453b4c06b61c5af51ff46417588837fc0c3967288 \ No newline at end of file diff --git a/test/regexp1.test b/test/regexp1.test index 102c1280c0..0401b13d72 100644 --- a/test/regexp1.test +++ b/test/regexp1.test @@ -303,6 +303,33 @@ do_execsql_test regexp1-6.7 {SELECT 'xabc' REGEXP '(^abc|def)';} {0} do_execsql_test regexp1-6.8 {SELECT 'def' REGEXP '(^abc|def)';} {1} do_execsql_test regexp1-6.9 {SELECT 'xdef' REGEXP '(^abc|def)';} {1} +# 2022-11-17 +# https://sqlite.org/forum/forumpost/3ffe058b04 +# +do_execsql_test regexp1-7.1 { + SELECT char(0x61,0x7ff,0x62) REGEXP char(0x7ff); +} 1 +do_execsql_test regexp1-7.2 { + SELECT char(0x61,0x800,0x62) REGEXP char(0x800); +} 1 +do_execsql_test regexp1-7.3 { + SELECT char(0x61,0xabc,0x62) REGEXP char(0xabc); +} 1 +do_execsql_test regexp1-7.4 { + SELECT char(0x61,0xfff,0x62) REGEXP char(0xfff); +} 1 +do_execsql_test regexp1-7.5 { + SELECT char(0x61,0x1000,0x62) REGEXP char(0x1000); +} 1 +do_execsql_test regexp1-7.10 { + SELECT char(0x61,0xffff,0x62) REGEXP char(0xffff); +} 1 +do_execsql_test regexp1-7.11 { + SELECT char(0x61,0x10000,0x62) REGEXP char(0x10000); +} 1 +do_execsql_test regexp1-7.12 { + SELECT char(0x61,0x10ffff,0x62) REGEXP char(0x10ffff); +} 1 finish_test -- 2.47.2