When converting UTF8 or UTF16 strings, change overlong strings and other illegal codes to 0xFFFD. Ticket #2029. (CVS 3479)

author drh <drh@noemail.net>

Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)

committer drh <drh@noemail.net>

Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)
author drh <drh@noemail.net>
Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)
committer drh <drh@noemail.net>
Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)
diff --git a/manifest b/manifest

index 85edf8a3570fc8e015195863a49a11ca1846bb6b..a88f5511d227f9f4455eb0b853fce487b6a27474 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Fix\sa\sproblems\sthat\sarise\sif\smalloc()\sfails\swhile\scompiling\sSELECT\nstatements\swithin\sa\sTRIGGER.\s(CVS\s3478)
-D 2006-10-18T23:26:39
+C When\sconverting\sUTF8\sor\sUTF16\sstrings,\schange\soverlong\sstrings\sand\sother\nillegal\scodes\sto\s0xFFFD.\s\sTicket\s#2029.\s(CVS\s3479)
+D 2006-10-19T01:58:44
  F Makefile.in 4379c909d46b38b8c5db3533084601621d4f14b2
  F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
  F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -118,7 +118,7 @@ F src/test_tclvar.c 315e77c17f128ff8c06b38c08617fd07c825a95b
  F src/tokenize.c dfdff21768fbedd40e8d3ca84fc5d0d7af2b46dd
  F src/trigger.c 8c55d31876013ed4e97ee7ce24478fbe00db49bb
  F src/update.c 951f95ef044cf6d28557c48dc35cb0711a0b9129
-F src/utf.c f467b4892a75f60d36ee933be83f5d7562c5290e
+F src/utf.c 67ecb1032bc0b42c105e88d65ef9d9f626eb0e1f
  F src/util.c 91d4cb189476906639ae611927d939691d1365f6
  F src/vacuum.c f6a7943f1f1002cb82ef2ea026cb1975a5b687cb
  F src/vdbe.c 84a9c0b0dd037c064ffed651977e20dd9d2bc1d1
@@ -410,7 +410,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
  F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
  F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
  F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
-P b886eaa334150262ce4d1a1d0470ca4cf623a396
-R 9bb26a66074bed22316d4b359f75108a
+P ee4894b49995e4904db1991281563cfbb7b1c16d
+R f570c2492328855150d4a5def15ee9fe
  U drh
-Z 42173c26897f66009294e3d680c52f79
+Z 452bce4bb83169559d00576edd9e29b4
diff --git a/manifest.uuid b/manifest.uuid

index ac9d8b299afae38ca3dc4c9894e639d8be4035c6..6a52d9169721d404e0adb5ab00bdb071a3873011 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-ee4894b49995e4904db1991281563cfbb7b1c16d
-\ No newline at end of file
+0c6736df9cb4c3c8f6224e30df939cead9cd5369
+\ No newline at end of file
diff --git a/src/utf.c b/src/utf.c

index 05a8985e2f24663c13e01dd6cb07bd6a3c2f4f01..fb587930b3a8b60b3fbc3e616b1b2a9479916665 100644 (file)
--- a/src/utf.c
+++ b/src/utf.c
@@ -12,7 +12,7 @@
  ** This file contains routines used to translate between UTF-8, 
  ** UTF-16, UTF-16BE, and UTF-16LE.
  **
-** $Id: utf.c,v 1.42 2006/10/05 11:43:53 drh Exp $
+** $Id: utf.c,v 1.43 2006/10/19 01:58:44 drh Exp $
  **
  ** Notes on UTF-8:
  **
@@ -64,7 +64,7 @@
  
  /*
  ** This table maps from the first byte of a UTF-8 character to the number
-** of trailing bytes expected. A value '255' indicates that the table key
+** of trailing bytes expected. A value '4' indicates that the table key
  ** is not a legal first byte for a UTF-8 character.
  */
  static const u8 xtra_utf8_bytes[256]  = {
@@ -79,10 +79,10 @@ static const u8 xtra_utf8_bytes[256]  = {
  0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
  
  /* 10wwwwww */
-255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
+4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
+4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
+4, 4, 4, 4, 4, 4, 4, 4,     4, 4, 4, 4, 4, 4, 4, 4,
  
  /* 110yyyyy */
  1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,
@@ -92,7 +92,7 @@ static const u8 xtra_utf8_bytes[256]  = {
  2, 2, 2, 2, 2, 2, 2, 2,     2, 2, 2, 2, 2, 2, 2, 2,
  
  /* 11110yyy */
-3, 3, 3, 3, 3, 3, 3, 3,     255, 255, 255, 255, 255, 255, 255, 255,
+3, 3, 3, 3, 3, 3, 3, 3,     4, 4, 4, 4, 4, 4, 4, 4,
  };
  
  /*
@@ -101,11 +101,24 @@ static const u8 xtra_utf8_bytes[256]  = {
  ** read by a naive implementation of a UTF-8 character reader. The code
  ** in the READ_UTF8 macro explains things best.
  */
-static const int xtra_utf8_bits[4] =  {
-0,
-12416,          /* (0xC0 << 6) + (0x80) */
-925824,         /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
-63447168        /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+static const int xtra_utf8_bits[] =  {
+  0,
+  12416,          /* (0xC0 << 6) + (0x80) */
+  925824,         /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
+  63447168        /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+};
+
+/*
+** If a UTF-8 character contains N extra bytes (N bytes follow
+** the initial byte so that the total character length is N+1) then
+** masking the decoded character with utf_mask[N] must produce a non-zero
+** result.  Otherwise, we have an (illegal) overlong encoding.
+*/
+static const int utf_mask[] = {
+  0x00000000,
+  0xffffff80,
+  0xfffff800,
+  0xffff0000,
  };
  
  #define READ_UTF8(zIn, c) { \
@@ -113,11 +126,14 @@ static const int xtra_utf8_bits[4] =  {
    c = *(zIn)++;                                        \
    xtra = xtra_utf8_bytes[c];                           \
    switch( xtra ){                                      \
-    case 255: c = (int)0xFFFD; break;                  \
+    case 4: c = (int)0xFFFD; break;                    \
      case 3: c = (c<<6) + *(zIn)++;                     \
      case 2: c = (c<<6) + *(zIn)++;                     \
      case 1: c = (c<<6) + *(zIn)++;                     \
      c -= xtra_utf8_bits[xtra];                         \
+    if( (utf_mask[xtra]&c)==0                          \
+        || (c&0xFFFFF800)==0xD800                      \
+        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }    \
    }                                                    \
  }
  int sqlite3ReadUtf8(const unsigned char *z){
@@ -181,6 +197,7 @@ int sqlite3ReadUtf8(const unsigned char *z){
      int c2 = (*zIn++);                                                \
      c2 += ((*zIn++)<<8);                                              \
      c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
+    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                             \
    }                                                                   \
  }
  
@@ -191,6 +208,7 @@ int sqlite3ReadUtf8(const unsigned char *z){
      int c2 = ((*zIn++)<<8);                                           \
      c2 += (*zIn++);                                                   \
      c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
+    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                             \
    }                                                                   \
  }
  
@@ -556,7 +574,7 @@ void sqlite3utf16Substr(
  ** characters in each encoding are inverses of each other.
  */
  void sqlite3utfSelfTest(){
-  unsigned int i;
+  unsigned int i, t;
    unsigned char zBuf[20];
    unsigned char *z;
    int n;
@@ -568,7 +586,10 @@ void sqlite3utfSelfTest(){
      n = z-zBuf;
      z = zBuf;
      READ_UTF8(z, c);
-    assert( c==i );
+    t = i;
+    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
+    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
+    assert( c==t );
      assert( (z-zBuf)==n );
    }
    for(i=0; i<0x00110000; i++){
author	drh <drh@noemail.net>
	Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)
committer	drh <drh@noemail.net>
	Thu, 19 Oct 2006 01:58:43 +0000 (01:58 +0000)
manifest		patch | blob | blame | history
manifest.uuid		patch | blob | blame | history
src/utf.c		patch | blob | blame | history