From: drh <drh@noemail.net>
Date: Sat, 5 Jan 2013 17:17:21 +0000 (+0000)
Subject: More bug fixes to the test_regexp.c module.  Bring test_regexp.c into alignment
X-Git-Tag: version-3.7.16~81
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=aa5df79d327a20b8e62f0364c0e7d4e5884f25b2;p=thirdparty%2Fsqlite.git

More bug fixes to the test_regexp.c module.  Bring test_regexp.c into alignment
with the regexp.c file in the Fossil sources.

FossilOrigin-Name: 7695b88fe0d73fd0a36fb7d8f95350b80615a89e
---

diff --git a/manifest b/manifest
index f13b9def1d..c0b5bd7ec3 100644
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Remove\ssuperfluous\stext\sfrom\sa\scomment.
-D 2013-01-04T22:26:47.761
+C More\sbug\sfixes\sto\sthe\stest_regexp.c\smodule.\s\sBring\stest_regexp.c\sinto\salignment\nwith\sthe\sregexp.c\sfile\sin\sthe\sFossil\ssources.
+D 2013-01-05T17:17:21.332
 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
 F Makefile.in a48faa9e7dd7d556d84f5456eabe5825dd8a6282
 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -217,7 +217,7 @@ F src/test_osinst.c 90a845c8183013d80eccb1f29e8805608516edba
 F src/test_pcache.c a5cd24730cb43c5b18629043314548c9169abb00
 F src/test_quota.c 0e0e2e3bf6766b101ecccd8c042b66e44e9be8f5
 F src/test_quota.h 8761e463b25e75ebc078bd67d70e39b9c817a0cb
-F src/test_regexp.c f32daf8e08466df5e81617f6694e96172d98fcd2
+F src/test_regexp.c 935a1bfb48c7a6857514aa9cedf6df048f8b9928
 F src/test_rtree.c aba603c949766c4193f1068b91c787f57274e0d9
 F src/test_schema.c 8c06ef9ddb240c7a0fcd31bc221a6a2aade58bf0
 F src/test_server.c 2f99eb2837dfa06a4aacf24af24c6affdf66a84f
@@ -1031,7 +1031,7 @@ F tool/vdbe-compress.tcl f12c884766bd14277f4fcedcae07078011717381
 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
 F tool/win/sqlite.vsix 97894c2790eda7b5bce3cc79cb2a8ec2fde9b3ac
-P 6c4c2b7dbadedac384d380efd54f12f6ccbf4ca9
-R 03d4aa554ff3262e05c30977fa9109df
-U mistachkin
-Z e183cfa598aad0d3bd7483fe9e033a09
+P e9ac5339603766c120c775bda8ae816d0ccb1503
+R b07326038178e7051b1863f2d68963fe
+U drh
+Z 66007f41514d49327911f47fbc5d3001
diff --git a/manifest.uuid b/manifest.uuid
index 2b56862be2..1b51e6969f 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-e9ac5339603766c120c775bda8ae816d0ccb1503
\ No newline at end of file
+7695b88fe0d73fd0a36fb7d8f95350b80615a89e
\ No newline at end of file
diff --git a/src/test_regexp.c b/src/test_regexp.c
index da86b69aa5..c0361f17d3 100644
--- a/src/test_regexp.c
+++ b/src/test_regexp.c
@@ -129,7 +129,7 @@ static unsigned re_next_char(ReInput *p){
   unsigned c;
   if( p->i>=p->mx ) return 0;
   c = p->z[p->i++];
-  if( c>0x80 ){
+  if( c>=0x80 ){
     if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
       c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
       if( c<0x80 ) c = 0xfffd;
@@ -137,13 +137,13 @@ static unsigned re_next_char(ReInput *p){
            && (p->z[p->i+1]&0xc0)==0x80 ){
       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
       p->i += 2;
-      if( c<0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
+      if( c<=0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
     }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
                        | (p->z[p->i+2]&0x3f);
       p->i += 3;
-      if( c<0xffff ) c = 0xfffd;
+      if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
     }else{
       c = 0xfffd;
     }
@@ -169,7 +169,7 @@ static int re_digit_char(int c){
 
 /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
 static int re_space_char(int c){
-  return c==' ' || c=='\t' || c=='\n' || c=='\v' || c=='\f';
+  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
 }
 
 /* Run a compiled regular expression on the zero-terminated input
@@ -188,7 +188,9 @@ int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
 
   in.z = zIn;
   in.i = 0;
-  in.mx = nIn>=0 ? nIn : strlen((char*)zIn);
+  in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
+
+  /* Look for the initial prefix match, if there is one. */
   if( pRe->nInit ){
     unsigned char x = pRe->zInit[0];
     while( in.i+pRe->nInit<=in.mx 
@@ -198,6 +200,7 @@ int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
     }
     if( in.i+pRe->nInit>in.mx ) return 0;
   }
+
   if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
     pToFree = 0;
     aStateSet[0].aState = aSpace;
@@ -624,7 +627,7 @@ const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
   }
   pRe->sIn.z = (unsigned char*)zIn;
   pRe->sIn.i = 0;
-  pRe->sIn.mx = strlen((char*)pRe->sIn.z);
+  pRe->sIn.mx = strlen(zIn);
   zErr = re_subcompile_re(pRe);
   if( zErr ){
     re_free(pRe);
@@ -641,6 +644,15 @@ const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
     re_free(pRe);
     return "unrecognized character";
   }
+
+  /* The following is a performance optimization.  If the regex begins with
+  ** ".*" (if the input regex lacks an initial "^") and afterwards there are
+  ** one or more matching characters, enter those matching characters into
+  ** zInit[].  The re_match() routine can then search ahead in the input 
+  ** string looking for the initial match without having to run the whole
+  ** regex engine over the string.  Do not worry about trying to match
+  ** unicode characters beyond plane 0 - those are very rare and this is
+  ** just an optimization. */
   if( pRe->aOp[0]==RE_OP_ANYSTAR ){
     for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
       unsigned x = pRe->aArg[i];
@@ -652,7 +664,7 @@ const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
       }else if( x<=0xffff ){
         pRe->zInit[j++] = 0xd0 | (x>>12);
         pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
-        pRe->zInit[j++] = 0x80 | (0x3f);
+        pRe->zInit[j++] = 0x80 | (x&0x3f);
       }else{
         break;
       }