git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Change the backend to reject strings containing invalidly-encoded multibyte
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:07:13 +0000 (20:07 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:07:13 +0000 (20:07 +0000)
characters in all cases.  Formerly we mostly just threw warnings for invalid
input, and failed to detect it at all if no encoding conversion was required.
The tighter check is needed to defend against SQL-injection attacks as per
CVE-2006-2313 (further details will be published after release).  Embedded
zero (null) bytes will be rejected as well.  The checks are applied during
input to the backend (receipt from client or COPY IN), so it no longer seems
necessary to check in textin() and related routines; any string arriving at
those functions will already have been validated.  Conversion failure
reporting (for characters with no equivalent in the destination encoding)
has been cleaned up and made consistent while at it.

Also, fix a few longstanding errors in little-used encoding conversion
routines: win1251_to_iso, win866_to_iso, euc_tw_to_big5, euc_tw_to_mic,
mic_to_euc_tw were all broken to varying extents.

Patches by Tatsuo Ishii and Tom Lane.  Thanks to Akio Ishida and Yasuo Ohgaki
for identifying the security issues.

36 files changed:
src/backend/access/transam/xact.c
src/backend/commands/copy.c
src/backend/utils/adt/name.c
src/backend/utils/adt/selfuncs.c
src/backend/utils/adt/varchar.c
src/backend/utils/adt/varlena.c
src/backend/utils/mb/conv.c
src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c
src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c
src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c
src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c
src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c
src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c
src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c
src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c
src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c
src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c
src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c
src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c
src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c
src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c
src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c
src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c
src/backend/utils/mb/mbutils.c
src/backend/utils/mb/wchar.c
src/include/c.h
src/include/mb/pg_wchar.h

index 6c14ef36468225241aa67e9e36242b7afedafa5c..b5d205af9f7f9fb3a56dba95445a8aa663824993 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.135.2.2 2004/08/11 04:08:39 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.135.2.3 2006/05/21 20:07:11 tgl Exp $
  *
  * NOTES
  *             Transaction aborts can now occur two ways:
@@ -1098,9 +1098,14 @@ AbortTransaction(void)
 
        /*
         * check the current transaction state
+        *
+        * reduced to DEBUG2 because this is expected when rejecting an
+        * invalidly-encoded query outside a transaction block.  PG 8.0
+        * and up fix it better, but it's not worth back-porting those
+        * changes to 7.3.
         */
        if (s->state != TRANS_INPROGRESS)
-               elog(WARNING, "AbortTransaction and not in in-progress state");
+               elog(DEBUG2, "AbortTransaction and not in in-progress state");
 
        /*
         * set the current transaction state information appropriately during
index dd551659d5aca855c1aca7ef7a6f73be93cc746b..ede4d59e7fa96c79535a1928ee8f8c4214f35a61 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.177.2.2 2003/04/25 22:14:33 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.177.2.3 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -88,6 +88,7 @@ static StringInfoData attribute_buf;
 
 static int     client_encoding;
 static int     server_encoding;
+static int     server_max_length;
 
 /*
  * Internal communications functions
@@ -388,6 +389,7 @@ DoCopy(const CopyStmt *stmt)
 
        client_encoding = pg_get_client_encoding();
        server_encoding = GetDatabaseEncoding();
+       server_max_length = pg_database_encoding_max_length();
 
        if (is_from)
        {                                                       /* copy from file to database */
@@ -1471,7 +1473,8 @@ CopyReadAttribute(FILE *fp, const char *delim, CopyReadResult *result)
 
 copy_eof:
 
-       if (client_encoding != server_encoding)
+       if (client_encoding != server_encoding ||
+               server_max_length > 1)
        {
                cvt = (char *) pg_client_to_server((unsigned char *) attribute_buf.data,
                                                                                   attribute_buf.len);
index 63ea829812a63667cde687dfcc743fe124e03c6d..d2810e5edf6886a70147135644a73e6279510eb1 100644 (file)
@@ -12,7 +12,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/name.c,v 1.41 2002/09/04 20:31:28 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/name.c,v 1.41.2.1 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,13 +44,8 @@ namein(PG_FUNCTION_ARGS)
        char       *s = PG_GETARG_CSTRING(0);
        NameData   *result;
        int                     len;
-       char       *ermsg;
 
-       /* veryfy encoding */
        len = strlen(s);
-       if ((ermsg = pg_verifymbstr(s, len)))
-               elog(ERROR, "%s", ermsg);
-
        len = pg_mbcliplen(s, len, NAMEDATALEN - 1);
 
        result = (NameData *) palloc(NAMEDATALEN);
index d00f6addb3feafcc31adcc59b0196987c1c5c21f..be7d74ef2973a520c985c21f3bed212f6e0f0fca 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.8 2004/02/02 03:07:25 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.9 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -3473,8 +3473,8 @@ make_greater_string(const Const *str_const)
                        if (datatype != BYTEAOID)
                        {
                                /* do not generate invalid encoding sequences */
-                               if (pg_verifymbstr((const unsigned char *) workstr,
-                                                                  len) != NULL)
+                               if (!pg_verifymbstr((const char *) workstr,
+                                                                       len, true))
                                        continue;
                                workstr_const = string_to_const(workstr, datatype);
                        }
index 03579f437b62b15ba79ad1382d3efdc934d2df90..f75c1c206d1bc79f33203cb6bc8e9058860e2a1d 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/varchar.c,v 1.95 2002/09/18 21:35:23 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/varchar.c,v 1.95.2.1 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -73,18 +73,11 @@ bpcharin(PG_FUNCTION_ARGS)
        size_t          len,
                                maxlen;
        int                     i;
-
        int                     charlen;                /* number of charcters in the input string */
-       char       *ermsg;
 
        len = strlen(s);
-
-       if ((ermsg = pg_verifymbstr(s, len)))
-               elog(ERROR, "%s", ermsg);
-
        charlen = pg_mbstrlen(s);
 
-
        /* If typmod is -1 (or invalid), use the actual string length */
        if (atttypmod < (int32) VARHDRSZ)
                maxlen = charlen;
@@ -349,13 +342,7 @@ varcharin(PG_FUNCTION_ARGS)
        size_t          len,
                                maxlen;
 
-       char       *ermsg;
-
        len = strlen(s);
-
-       if ((ermsg = pg_verifymbstr(s, len)))
-               elog(ERROR, "%s", ermsg);
-
        maxlen = atttypmod - VARHDRSZ;
 
        if (atttypmod >= (int32) VARHDRSZ && len > maxlen)
index 15261f3248d44e6e37cb41d53bb5e3254bedb46b..897bf571d960c8907e99ec68e6ea2748ed15cad1 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/varlena.c,v 1.92.2.4 2005/12/22 22:50:29 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/varlena.c,v 1.92.2.5 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -217,13 +217,8 @@ textin(PG_FUNCTION_ARGS)
        text       *result;
        int                     len;
 
-       char       *ermsg;
-
        len = strlen(inputText) + VARHDRSZ;
 
-       if ((ermsg = pg_verifymbstr(inputText, len - VARHDRSZ)))
-               elog(ERROR, "%s", ermsg);
-
        result = (text *) palloc(len);
        VARATT_SIZEP(result) = len;
 
index 35e9851989f71262d9b427d9dff768934a69eafd..e84c7ce0e6199df84a6bc2bf3070bb137c92152f 100644 (file)
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conv.c,v 1.43.2.1 2003/04/12 08:01:23 ishii Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conv.c,v 1.43.2.2 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 #include "mb/pg_wchar.h"
 
-/*
- * convert bogus chars that cannot be represented in the current
- * encoding system.
- */
-void
-pg_print_bogus_char(unsigned char **mic, unsigned char **p)
-{
-       char            strbuf[16];
-       int                     l = pg_mic_mblen(*mic);
-
-       *(*p)++ = '(';
-       while (l--)
-       {
-               sprintf(strbuf, "%02x", *(*mic)++);
-               *(*p)++ = strbuf[0];
-               *(*p)++ = strbuf[1];
-       }
-       *(*p)++ = ')';
-}
-
-#ifdef NOT_USED
-
-/*
- * GB18030 ---> MIC
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *gb18030++))
-       {
-               if (c1 < 0x80)
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
-               }
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *gb18030++;
-
-                       if (c2 >= 0x30 && c2 <= 0x69)
-                       {
-                               len -= 4;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                       }
-                       else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               len -= 2;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                       }
-                       else
-                       {                                       /* throw the strange code */
-                               len--;
-                       }
-               }
-       }
-       *p = '\0';
-}
 
 /*
- * MIC ---> GB18030
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-mic2gb18030(unsigned char *mic, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *mic))
-       {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 <= 0x7f)                 /* ASCII */
-                       *p++ = c1;
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *mic++;
-
-                       if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                       }
-                       else if (c2 >= 0x30 && c2 <= 0x39)
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *mic++;
-                               *p++ = *mic++;
-                       }
-                       else
-                       {
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                       }
-               }
-               else
-               {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
-       }
-       *p = '\0';
-}
-#endif
-
-/*
- * LATINn ---> MIC
+ * LATINn ---> MIC when the charset's local codes map directly to MIC
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
  */
 void
-latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+latin2mic(const unsigned char *l, unsigned char *p, int len,
+                 int lc, int encoding)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
        {
-               if (c1 > 0x7f)
-               {                                               /* Latin? */
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
+               if (IS_HIGHBIT_SET(c1))
                        *p++ = lc;
-               }
                *p++ = c1;
+               l++;
+               len--;
        }
        *p = '\0';
 }
 
 /*
- * MIC ---> LATINn
+ * MIC ---> LATINn when the charset's local codes map directly to MIC
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
  */
 void
-mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                 int lc, int encoding)
 {
        int                     c1;
 
-       while (len > 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == lc)
-                       *p++ = *mic++;
-               else if (c1 > 0x7f)
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
+                       len--;
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
+               {
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                       *p++ = mic[1];
+                       mic += 2;
+                       len -= 2;
                }
        }
        *p = '\0';
@@ -180,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
 
 /*
  * ASCII ---> MIC
+ *
+ * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
+ * characters, here we must take a hard line because we don't know
+ * the appropriate MIC equivalent.
  */
 void
-pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
+pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *l++))
-               *p++ = (c1 & 0x7f);
+       while (len > 0)
+       {
+               c1 = *l;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
+               *p++ = c1;
+               l++;
+               len--;
+       }
        *p = '\0';
 }
 
@@ -195,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
  * MIC ---> ASCII
  */
 void
-pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
+pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *mic))
+       while (len > 0)
        {
-               if (c1 > 0x7f)
-                       pg_print_bogus_char(&mic, &p);
-               else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-                       mic++;
-               }
+               c1 = *mic;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
+                                                                          (const char *) mic, len);
+               *p++ = c1;
+               mic++;
+               len--;
        }
        *p = '\0';
 }
@@ -215,87 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
 /*
  * latin2mic_with_table: a generic single byte charset encoding
  * conversion from a local charset to the mule internal code.
- * with a encoding conversion table.
- * the table is ordered according to the local charset,
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the local charset
  * starting from 128 (0x80). each entry in the table
  * holds the corresponding code point for the mule internal code.
  */
 void
-latin2mic_with_table(
-                                        unsigned char *l,      /* local charset string (source) */
-                                        unsigned char *p,      /* pointer to store mule internal
-                                                                                * code (destination) */
-                                        int len,       /* length of l */
-                                        int lc,        /* leading character of p */
-                                        unsigned char *tab /* code conversion table */
-)
+latin2mic_with_table(const unsigned char *l,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
 {
        unsigned char c1,
                                c2;
 
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
        {
-               if (c1 < 128)
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
+               if (!IS_HIGHBIT_SET(c1))
                        *p++ = c1;
                else
                {
-                       c2 = tab[c1 - 128];
+                       c2 = tab[c1 - HIGHBIT];
                        if (c2)
                        {
                                *p++ = lc;
                                *p++ = c2;
                        }
                        else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
+                               report_untranslatable_char(encoding, PG_MULE_INTERNAL,
+                                                                                  (const char *) l, len);
                }
+               l++;
+               len--;
        }
        *p = '\0';
 }
 
 /*
  * mic2latin_with_table: a generic single byte charset encoding
- * conversion from the mule internal code to a local charset
- * with a encoding conversion table.
- * the table is ordered according to the second byte of the mule
- * internal code starting from 128 (0x80).
- * each entry in the table
- * holds the corresponding code point for the local code.
+ * conversion from the mule internal code to a local charset.
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the mule internal code's
+ * second byte, starting from 128 (0x80). each entry in the table
+ * holds the corresponding code point for the local charset.
  */
 void
-mic2latin_with_table(
-                                        unsigned char *mic,            /* mule internal code
-                                                                                                * (source) */
-                                        unsigned char *p,      /* local code (destination) */
-                                        int len,       /* length of p */
-                                        int lc,        /* leading character */
-                                        unsigned char *tab /* code conversion table */
-)
+mic2latin_with_table(const unsigned char *mic,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
 {
-
        unsigned char c1,
                                c2;
 
-       while (len-- > 0 && (c1 = *mic++))
+       while (len > 0)
        {
-               if (c1 < 128)
-                       *p++ = c1;
-               else if (c1 == lc)
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       c1 = *mic++;
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
                        len--;
-                       c2 = tab[c1 - 128];
-                       if (c2)
-                               *p++ = c2;
-                       else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
                }
                else
                {
-                       *p++ = ' ';                     /* bogus character */
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
+                               (c2 = tab[mic[1] - HIGHBIT]) == 0)
+                       {
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                               break;                  /* keep compiler quiet */
+                       }
+                       *p++ = c2;
+                       mic += 2;
+                       len -= 2;
                }
        }
        *p = '\0';
@@ -332,27 +268,40 @@ compare2(const void *p1, const void *p2)
 }
 
 /*
- * UTF-8 ---> local code
+ * UTF8 ---> local code
  *
- * utf: input UTF-8 string. Its length is limited by "len" parameter
- *             or a null terminater.
- * iso: pointer to the output.
+ * utf: input UTF8 string (need not be null-terminated).
+ * iso: pointer to the output area (must be large enough!)
  * map: the conversion map.
  * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
  */
 void
-UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len)
+UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len)
 {
        unsigned int iutf;
        int                     l;
        pg_utf_to_local *p;
 
-       for (; len > 0 && *utf; len -= l)
+       for (; len > 0; len -= l)
        {
+               /* "break" cases all represent errors */
+               if (*utf == '\0')
+                       break;
+
                l = pg_utf_mblen(utf);
+
+               if (len < l)
+                       break;
+
+               if (!pg_utf8_islegal(utf, l))
+                       break;
+
                if (l == 1)
                {
+                       /* ASCII case is easy */
                        *iso++ = *utf++;
                        continue;
                }
@@ -361,19 +310,27 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                        iutf = *utf++ << 8;
                        iutf |= *utf++;
                }
-               else
+               else if (l == 3)
                {
                        iutf = *utf++ << 16;
                        iutf |= *utf++ << 8;
                        iutf |= *utf++;
                }
+               else if (l == 4)
+               {
+                       iutf = *utf++ << 24;
+                       iutf |= *utf++ << 16;
+                       iutf |= *utf++ << 8;
+                       iutf |= *utf++;
+               }
+
                p = bsearch(&iutf, map, size,
                                        sizeof(pg_utf_to_local), compare1);
+
                if (p == NULL)
-               {
-                       elog(WARNING, "UtfToLocal: could not convert UTF-8 (0x%04x). Ignored", iutf);
-                       continue;
-               }
+                       report_untranslatable_char(PG_UTF8, encoding,
+                                                                          (const char *) (utf - l), len);
+
                if (p->code & 0xff000000)
                        *iso++ = p->code >> 24;
                if (p->code & 0x00ff0000)
@@ -383,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                if (p->code & 0x000000ff)
                        *iso++ = p->code & 0x000000ff;
        }
+
+       if (len > 0)
+               report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+
        *iso = '\0';
 }
 
 /*
- * local code ---> UTF-8
+ * local code ---> UTF8
+ *
+ * iso: input local string (need not be null-terminated).
+ * utf: pointer to the output area (must be large enough!)
+ * map: the conversion map.
+ * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
  */
 void
-LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len)
+LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len)
 {
        unsigned int iiso;
        int                     l;
@@ -400,16 +368,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
        if (!PG_VALID_ENCODING(encoding))
                elog(ERROR, "Invalid encoding number %d", encoding);
 
-       for (; len > 0 && *iso; len -= l)
+       for (; len > 0; len -= l)
        {
-               if (*iso < 0x80)
+               /* "break" cases all represent errors */
+               if (*iso == '\0')
+                       break;
+
+               if (!IS_HIGHBIT_SET(*iso))
                {
+                       /* ASCII case is easy */
                        *utf++ = *iso++;
                        l = 1;
                        continue;
                }
 
-               l = pg_encoding_mblen(encoding, iso);
+               l = pg_encoding_verifymb(encoding, (const char *) iso, len);
+               if (l < 0)
+                       break;
 
                if (l == 1)
                        iiso = *iso++;
@@ -431,14 +406,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                        iiso |= *iso++ << 8;
                        iiso |= *iso++;
                }
+
                p = bsearch(&iiso, map, size,
                                        sizeof(pg_local_to_utf), compare2);
                if (p == NULL)
-               {
-                       elog(WARNING, "LocalToUtf: could not convert (0x%04x) %s to UTF-8. Ignored",
-                                iiso, (&pg_enc2name_tbl[encoding])->name);
-                       continue;
-               }
+                       report_untranslatable_char(encoding, PG_UTF8,
+                                                                          (const char *) (iso - l), len);
+
                if (p->utf & 0xff000000)
                        *utf++ = p->utf >> 24;
                if (p->utf & 0x00ff0000)
@@ -448,5 +422,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                if (p->utf & 0x000000ff)
                        *utf++ = p->utf & 0x000000ff;
        }
+
+       if (len > 0)
+               report_invalid_encoding(encoding, (const char *) iso, len);
+
        *utf = '\0';
 }
index ef3a07fc215d65e59a74e7636586789e1aa90d4d..338c9a214b819501cb7f87783d87727460188584 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.5 2002/10/26 15:00:59 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.5.2.1 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -70,14 +70,14 @@ extern Datum alt_to_iso(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void koi8r2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2koi8r(unsigned char *mic, unsigned char *p, int len);
-static void iso2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2iso(unsigned char *mic, unsigned char *p, int len);
-static void win12512mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1251(unsigned char *mic, unsigned char *p, int len);
-static void alt2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2alt(unsigned char *mic, unsigned char *p, int len);
+static void koi8r2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2koi8r(const unsigned char *mic, unsigned char *p, int len);
+static void iso2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2iso(const unsigned char *mic, unsigned char *p, int len);
+static void win12512mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1251(const unsigned char *mic, unsigned char *p, int len);
+static void alt2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2alt(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 koi8r_to_mic(PG_FUNCTION_ARGS)
@@ -401,7 +401,7 @@ win1251_to_iso(PG_FUNCTION_ARGS)
 
        buf = palloc(len * ENCODING_GROWTH_RATE);
        win12512mic(src, buf, len);
-       mic2win1251(buf, dest, strlen(buf));
+       mic2iso(buf, dest, strlen(buf));
        pfree(buf);
 
        PG_RETURN_VOID();
@@ -441,7 +441,7 @@ alt_to_iso(PG_FUNCTION_ARGS)
 
        buf = palloc(len * ENCODING_GROWTH_RATE);
        alt2mic(src, buf, len);
-       mic2alt(buf, dest, strlen(buf));
+       mic2iso(buf, dest, strlen(buf));
        pfree(buf);
 
        PG_RETURN_VOID();
@@ -460,23 +460,23 @@ alt_to_iso(PG_FUNCTION_ARGS)
 
 /* koi8r2mic: KOI8-R to Mule internal code */
 static void
-koi8r2mic(unsigned char *l, unsigned char *p, int len)
+koi8r2mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_KOI8_R);
+       latin2mic(l, p, len, LC_KOI8_R, PG_KOI8R);
 }
 
 /* mic2koi8r: Mule internal code to KOI8-R */
 static void
-mic2koi8r(unsigned char *mic, unsigned char *p, int len)
+mic2koi8r(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_KOI8_R);
+       mic2latin(mic, p, len, LC_KOI8_R, PG_KOI8R);
 }
 
 /* iso2mic: ISO-8859-5 to Mule internal code */
 static void
-iso2mic(unsigned char *l, unsigned char *p, int len)
+iso2mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char iso2koi[] = {
+       static const unsigned char iso2koi[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -495,14 +495,14 @@ iso2mic(unsigned char *l, unsigned char *p, int len)
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, iso2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi);
 }
 
 /* mic2iso: Mule internal code to ISO8859-5 */
 static void
-mic2iso(unsigned char *mic, unsigned char *p, int len)
+mic2iso(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2iso[] = {
+       static const unsigned char koi2iso[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -521,14 +521,14 @@ mic2iso(unsigned char *mic, unsigned char *p, int len)
                0xcc, 0xcb, 0xb7, 0xc8, 0xcd, 0xc9, 0xc7, 0xca
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2iso);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso);
 }
 
 /* win2mic: CP1251 to Mule internal code */
 static void
-win12512mic(unsigned char *l, unsigned char *p, int len)
+win12512mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char win2koi[] = {
+       static const unsigned char win2koi[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -547,14 +547,14 @@ win12512mic(unsigned char *l, unsigned char *p, int len)
                0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, win2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN1251, win2koi);
 }
 
 /* mic2win: Mule internal code to CP1251 */
 static void
-mic2win1251(unsigned char *mic, unsigned char *p, int len)
+mic2win1251(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2win[] = {
+       static const unsigned char koi2win[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -573,14 +573,14 @@ mic2win1251(unsigned char *mic, unsigned char *p, int len)
                0xdc, 0xdb, 0xc7, 0xd8, 0xdd, 0xd9, 0xd7, 0xda
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN1251, koi2win);
 }
 
 /* alt2mic: CP866 to Mule internal code */
 static void
-alt2mic(unsigned char *l, unsigned char *p, int len)
+alt2mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char alt2koi[] = {
+       static const unsigned char alt2koi[] = {
                0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
                0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
                0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
@@ -599,14 +599,14 @@ alt2mic(unsigned char *l, unsigned char *p, int len)
                0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, alt2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ALT, alt2koi);
 }
 
 /* mic2alt: Mule internal code to CP866 */
 static void
-mic2alt(unsigned char *mic, unsigned char *p, int len)
+mic2alt(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2alt[] = {
+       static const unsigned char koi2alt[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -625,5 +625,5 @@ mic2alt(unsigned char *mic, unsigned char *p, int len)
                0x9c, 0x9b, 0x87, 0x98, 0x9d, 0x99, 0x97, 0x9a
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2alt);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ALT, koi2alt);
 }
index 2b046f49c424c86588e1bf3f32b42ff4fb854d6e..02a44581af5b663edac90be194a6cfc3b38e269a 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.5 2002/10/26 15:00:59 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.5.2.1 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_cn(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len);
+static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_cn_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,30 @@ mic_to_euc_cn(PG_FUNCTION_ARGS)
  * EUC_CN ---> MIC
  */
 static void
-euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
+euc_cn2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 & 0x80)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 2;
+                       if (len < 2 || !IS_HIGHBIT_SET(euc[1]))
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                        *p++ = LC_GB2312_80;
                        *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -97,27 +104,34 @@ euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_CN
  */
 static void
-mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
+mic2euc_cn(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_GB2312_80)
+               c1 = *mic;
+               if (IS_HIGHBIT_SET(c1))
                {
+                       if (c1 != LC_GB2312_80)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN,
+                                                                                  (const char *) mic, len);
+                       if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2]))
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       mic++;
                        *p++ = *mic++;
                        *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_CN! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       len -= 3;
                }
                else
                {                                               /* should be ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
                        *p++ = c1;
+                       mic++;
+                       len--;
                }
        }
        *p = '\0';
index e2f077ca9a89197389eede16eca49b1e86115ca5..c468c23fff4b912775d7489cb20a2048ca35d95d 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.5.2.3 2006/03/04 12:35:08 ishii Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.5.2.4 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,9 +22,6 @@
 #define PGSJISALTCODE 0x81ac
 #define PGEUCALTCODE 0xa2ae
 
-#define ISSJISHEAD(c) ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc))
-#define ISSJISTAIL(c) ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc))
-
 /*
  * conversion table between SJIS UDC (IBM kanji) and EUC_JP
  */
@@ -57,10 +54,10 @@ extern Datum mic_to_sjis(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void sjis2mic(unsigned char *sjis, unsigned char *p, int len);
-static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
-static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
+static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len);
+static void mic2sjis(const unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_jp_to_sjis(PG_FUNCTION_ARGS)
@@ -170,37 +167,34 @@ mic_to_sjis(PG_FUNCTION_ARGS)
  * SJIS ---> MIC
  */
 static void
-sjis2mic(unsigned char *sjis, unsigned char *p, int len)
+sjis2mic(const unsigned char *sjis, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
-/* Eiji Tokuya patched begin */
                                i,
                                k,
                                k2;
 
-/* Eiji Tokuya patched end */
-       while (len >= 0 && (c1 = *sjis++))
+       while (len > 0)
        {
+               c1 = *sjis;
                if (c1 >= 0xa1 && c1 <= 0xdf)
                {
                        /* JIS X0201 (1 byte kana) */
-                       len--;
                        *p++ = LC_JISX0201K;
                        *p++ = c1;
+                       sjis++;
+                       len--;
                }
-               else if (c1 > 0x7f)
+               else if (IS_HIGHBIT_SET(c1))
                {
                        /*
                         * JIS X0208, X0212, user defined extended characters
                         */
-                       c2 = *sjis++;
-                       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
-                               elog(ERROR,"invalid byte sequence for encoding \"SJIS\": 0x%02x%02x",
-                                    c1, c2);
-
+                       if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
+                       c2 = sjis[1];
                        k = (c1 << 8) + c2;
-/* Eiji Tokuya patched begin */
                        if (k >= 0xed40 && k < 0xf040)
                        {
                                /* NEC selection IBM kanji */
@@ -219,19 +213,15 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                        }
 
                        if (k < 0xeb3f)
-/* Eiji Tokuya patched end */
                        {
                                /* JIS X0208 */
-                               len -= 2;
                                *p++ = LC_JISX0208;
                                *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
                                *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
                        }
-/* Eiji Tokuya patched begin */
                        else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
                        {
                                /* NEC selection IBM kanji - Other undecided justice */
-/* Eiji Tokuya patched end */
                                *p++ = LC_JISX0208;
                                *p++ = PGEUCALTCODE >> 8;
                                *p++ = PGEUCALTCODE & 0xff;
@@ -242,7 +232,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                 * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
                                 * 0x7e7e EUC 0xf5a1 - 0xfefe
                                 */
-                               len -= 2;
                                *p++ = LC_JISX0208;
                                c1 -= 0x6f;
                                *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -254,7 +243,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                 * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
                                 * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
                                 */
-                               len -= 2;
                                *p++ = LC_JISX0212;
                                c1 -= 0x74;
                                *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -264,9 +252,7 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                        {
                                /*
                                 * mapping IBM kanji to X0208 and X0212
-                                *
                                 */
-                               len -= 2;
                                for (i = 0;; i++)
                                {
                                        k2 = ibmkanji[i].sjis;
@@ -290,11 +276,16 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                        }
                                }
                        }
+                       sjis += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
                        *p++ = c1;
+                       sjis++;
+                       len--;
                }
        }
        *p = '\0';
@@ -304,22 +295,37 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
  * MIC ---> SJIS
  */
 static void
-mic2sjis(unsigned char *mic, unsigned char *p, int len)
+mic2sjis(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
-                               k;
+                               k,
+                               l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_JISX0201K)
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                else if (c1 == LC_JISX0208)
                {
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                        k = (c1 << 8) | (c2 & 0xff);
                        if (k >= 0xf5a1)
                        {
@@ -336,8 +342,8 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                        int                     i,
                                                k2;
 
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                        k = c1 << 8 | c2;
                        if (k >= 0xf5a1)
                        {
@@ -368,16 +374,11 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                                }
                        }
                }
-               else if (c1 > 0x7f)
-               {
-                       /* cannot convert to SJIS! */
-                       *p++ = PGSJISALTCODE >> 8;
-                       *p++ = PGSJISALTCODE & 0xff;
-               }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -386,37 +387,48 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
  * EUC_JP ---> MIC
  */
 static void
-euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
+euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
+               c1 = *euc;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_JP,
+                                                                               (const char *) euc, len);
+                       *p++ = c1;
+                       euc++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_EUC_JP,
+                                                                       (const char *) euc, len);
                if (c1 == SS2)
                {                                               /* 1 byte kana? */
-                       len -= 2;
                        *p++ = LC_JISX0201K;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
                }
                else if (c1 == SS3)
                {                                               /* JIS X0212 kanji? */
-                       len -= 3;
                        *p++ = LC_JISX0212;
-                       *p++ = *euc++;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       *p++ = euc[2];
                }
-               else if (c1 & 0x80)
+               else
                {                                               /* kanji? */
-                       len -= 2;
                        *p++ = LC_JISX0208;
                        *p++ = c1;
-                       *p++ = *euc++;
-               }
-               else
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
+                       *p++ = euc[1];
                }
+               euc += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -425,39 +437,50 @@ euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_JP
  */
 static void
-mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
+mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_JISX0201K)
                {
                        *p++ = SS2;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                }
                else if (c1 == LC_JISX0212)
                {
                        *p++ = SS3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else if (c1 == LC_JISX0208)
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_JP! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
index 6c82401c20ffc325bd098430bbaa81093ed8aee6..18e732a7935f18b2c84eda50f5a5cab914ba020e 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.5 2002/10/26 15:00:59 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.5.2.1 2006/05/21 20:07:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_kr(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len);
+static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_kr_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,34 @@ mic_to_euc_kr(PG_FUNCTION_ARGS)
  * EUC_KR ---> MIC
  */
 static void
-euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
+euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 & 0x80)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 2;
+                       l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
+                       if (l != 2)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                        *p++ = LC_KS5601;
                        *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -97,28 +108,39 @@ euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_KR
  */
 static void
-mic2euc_kr(unsigned char *mic, unsigned char *p, int len)
+mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_KS5601)
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
                }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_KR! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
+               if (c1 == LC_KS5601)
+               {
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
index 72fda78f49b401846ab6ca9da4478a581fc49fb8..063dc8ec64087b4af2caadbcc0ab017966a1feff 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.5 2002/10/26 15:00:59 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,10 +42,10 @@ extern Datum mic_to_big5(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void big52mic(unsigned char *big5, unsigned char *p, int len);
-static void mic2big5(unsigned char *mic, unsigned char *p, int len);
-static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len);
+static void big52mic(const unsigned char *big5, unsigned char *p, int len);
+static void mic2big5(const unsigned char *mic, unsigned char *p, int len);
+static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_tw_to_big5(PG_FUNCTION_ARGS)
@@ -114,7 +114,7 @@ mic_to_euc_tw(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_EUC_TW);
        Assert(len >= 0);
 
-       mic2big5(src, dest, len);
+       mic2euc_tw(src, dest, len);
 
        PG_RETURN_VOID();
 }
@@ -155,39 +155,52 @@ mic_to_big5(PG_FUNCTION_ARGS)
  * EUC_TW ---> MIC
  */
 static void
-euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
+euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 == SS2)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 4;
-                       c1 = *euc++;            /* plane No. */
-                       if (c1 == 0xa1)
-                               *p++ = LC_CNS11643_1;
-                       else if (c1 == 0xa2)
-                               *p++ = LC_CNS11643_2;
-                       else
+                       l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
+                       if (l < 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
+                       if (c1 == SS2)
                        {
-                               *p++ = 0x9d;    /* LCPRV2 */
-                               *p++ = 0xa3 - c1 + LC_CNS11643_3;
+                               c1 = euc[1];            /* plane No. */
+                               if (c1 == 0xa1)
+                                       *p++ = LC_CNS11643_1;
+                               else if (c1 == 0xa2)
+                                       *p++ = LC_CNS11643_2;
+                               else
+                               {
+                                       *p++ = 0x9d;    /* LCPRV2 */
+                                       *p++ = c1 - 0xa3 + LC_CNS11643_3;
+                               }
+                               *p++ = euc[2];
+                               *p++ = euc[3];
                        }
-                       *p++ = *euc++;
-                       *p++ = *euc++;
-               }
-               else if (c1 & 0x80)
-               {                                               /* CNS11643-1 */
-                       len -= 2;
-                       *p++ = LC_CNS11643_1;
-                       *p++ = c1;
-                       *p++ = *euc++;
+                       else
+                       {                                               /* CNS11643-1 */
+                               *p++ = LC_CNS11643_1;
+                               *p++ = c1;
+                               *p++ = euc[1];
+                       }
+                       euc += l;
+                       len -= l;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -197,42 +210,54 @@ euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_TW
  */
 static void
-mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
+mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_CNS11643_1)
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else if (c1 == LC_CNS11643_2)
                {
                        *p++ = SS2;
                        *p++ = 0xa2;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
-               else if (c1 == 0x9d)
+               else if (c1 == 0x9d &&
+                                mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
                {                                               /* LCPRV2? */
                        *p++ = SS2;
-                       *p++ = *mic++ - LC_CNS11643_3 + 0xa3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_TW! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1] - LC_CNS11643_3 + 0xa3;
+                       *p++ = mic[2];
+                       *p++ = mic[3];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -241,52 +266,49 @@ mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
  * Big5 ---> MIC
  */
 static void
-big52mic(unsigned char *big5, unsigned char *p, int len)
+big52mic(const unsigned char *big5, unsigned char *p, int len)
 {
        unsigned short c1;
        unsigned short big5buf,
                                cnsBuf;
        unsigned char lc;
-       char            bogusBuf[3];
-       int                     i;
+       int                     l;
 
-       while (len >= 0 && (c1 = *big5++))
+       while (len > 0)
        {
-               if (c1 <= 0x7fU)
-               {                                               /* ASCII */
-                       len--;
+               c1 = *big5;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_BIG5,
+                                                                               (const char *) big5, len);
                        *p++ = c1;
+                       big5++;
+                       len--;
+                       continue;
                }
-               else
+               l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_BIG5,
+                                                                       (const char *) big5, len);
+               big5buf = (c1 << 8) | big5[1];
+               cnsBuf = BIG5toCNS(big5buf, &lc);
+               if (lc != 0)
                {
-                       len -= 2;
-                       big5buf = c1 << 8;
-                       c1 = *big5++;
-                       big5buf |= c1;
-                       cnsBuf = BIG5toCNS(big5buf, &lc);
-                       if (lc != 0)
+                       if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
                        {
-                               if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
-                               {
-                                       *p++ = 0x9d;    /* LCPRV2 */
-                               }
-                               *p++ = lc;              /* Plane No. */
-                               *p++ = (cnsBuf >> 8) & 0x00ff;
-                               *p++ = cnsBuf & 0x00ff;
-                       }
-                       else
-                       {                                       /* cannot convert */
-                               big5 -= 2;
-                               *p++ = '(';
-                               for (i = 0; i < 2; i++)
-                               {
-                                       sprintf(bogusBuf, "%02x", *big5++);
-                                       *p++ = bogusBuf[0];
-                                       *p++ = bogusBuf[1];
-                               }
-                               *p++ = ')';
+                               *p++ = 0x9d;    /* LCPRV2 */
                        }
+                       *p++ = lc;              /* Plane No. */
+                       *p++ = (cnsBuf >> 8) & 0x00ff;
+                       *p++ = cnsBuf & 0x00ff;
                }
+               else
+                       report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
+                                                                          (const char *) big5, len);
+               big5 += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -295,46 +317,55 @@ big52mic(unsigned char *big5, unsigned char *p, int len)
  * MIC ---> Big5
  */
 static void
-mic2big5(unsigned char *mic, unsigned char *p, int len)
+mic2big5(const unsigned char *mic, unsigned char *p, int len)
 {
-       int                     l;
        unsigned short c1;
        unsigned short big5buf,
                                cnsBuf;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               l = pg_mic_mblen(mic++);
-               len -= l;
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                /* 0x9d means LCPRV2 */
                if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d)
                {
                        if (c1 == 0x9d)
                        {
-                               c1 = *mic++;    /* get plane no. */
-                       }
-                       cnsBuf = (*mic++) << 8;
-                       cnsBuf |= (*mic++) & 0x00ff;
-                       big5buf = CNStoBIG5(cnsBuf, c1);
-                       if (big5buf == 0)
-                       {                                       /* cannot convert to Big5! */
-                               mic -= l;
-                               pg_print_bogus_char(&mic, &p);
+                               c1 = mic[1];    /* get plane no. */
+                               cnsBuf = (mic[2] << 8) | mic[3];
                        }
                        else
                        {
-                               *p++ = (big5buf >> 8) & 0x00ff;
-                               *p++ = big5buf & 0x00ff;
+                               cnsBuf = (mic[1] << 8) | mic[2];
                        }
+                       big5buf = CNStoBIG5(cnsBuf, c1);
+                       if (big5buf == 0)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                                  (const char *) mic, len);
+                       *p++ = (big5buf >> 8) & 0x00ff;
+                       *p++ = big5buf & 0x00ff;
                }
-               else if (c1 <= 0x7f)    /* ASCII */
-                       *p++ = c1;
                else
-               {                                               /* cannot convert to Big5! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
index 4e9aec470aa9d5bfcdc350fdb1a59fd91941d059..fdc5b59bcbce3493ca1eaf72de27a0083d58f816 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,10 +42,10 @@ extern Datum win1250_to_latin2(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void latin22mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin2(unsigned char *mic, unsigned char *p, int len);
-static void win12502mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1250(unsigned char *mic, unsigned char *p, int len);
+static void latin22mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin2(const unsigned char *mic, unsigned char *p, int len);
+static void win12502mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1250(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 latin2_to_mic(PG_FUNCTION_ARGS)
@@ -152,14 +152,15 @@ win1250_to_latin2(PG_FUNCTION_ARGS)
 }
 
 static void
-latin22mic(unsigned char *l, unsigned char *p, int len)
+latin22mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_2);
+       latin2mic(l, p, len, LC_ISO8859_2, PG_LATIN2);
 }
+
 static void
-mic2latin2(unsigned char *mic, unsigned char *p, int len)
+mic2latin2(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_2);
+       mic2latin(mic, p, len, LC_ISO8859_2, PG_LATIN2);
 }
 
 /*-----------------------------------------------------------------
@@ -167,9 +168,9 @@ mic2latin2(unsigned char *mic, unsigned char *p, int len)
  * Microsoft's CP1250(windows-1250)
  *-----------------------------------------------------------------*/
 static void
-win12502mic(unsigned char *l, unsigned char *p, int len)
+win12502mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char win1250_2_iso88592[] = {
+       static const unsigned char win1250_2_iso88592[] = {
                0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC,
                0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -188,12 +189,14 @@ win12502mic(unsigned char *l, unsigned char *p, int len)
                0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
        };
 
-       latin2mic_with_table(l, p, len, LC_ISO8859_2, win1250_2_iso88592);
+       latin2mic_with_table(l, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                win1250_2_iso88592);
 }
+
 static void
-mic2win1250(unsigned char *mic, unsigned char *p, int len)
+mic2win1250(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char iso88592_2_win1250[] = {
+       static const unsigned char iso88592_2_win1250[] = {
                0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00,
                0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -212,5 +215,6 @@ mic2win1250(unsigned char *mic, unsigned char *p, int len)
                0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
        };
 
-       mic2latin_with_table(mic, p, len, LC_ISO8859_2, iso88592_2_win1250);
+       mic2latin_with_table(mic, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                iso88592_2_win1250);
 }
index c3389c01921a63b6a3d03fdc96211ecf69a53ad9..af22a2107a21fad6f112b26d02f5eb044f93e369 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -40,12 +40,12 @@ extern Datum mic_to_latin4(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void latin12mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin1(unsigned char *mic, unsigned char *p, int len);
-static void latin32mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin3(unsigned char *mic, unsigned char *p, int len);
-static void latin42mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin4(unsigned char *mic, unsigned char *p, int len);
+static void latin12mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin1(const unsigned char *mic, unsigned char *p, int len);
+static void latin32mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin3(const unsigned char *mic, unsigned char *p, int len);
+static void latin42mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin4(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 latin1_to_mic(PG_FUNCTION_ARGS)
@@ -144,32 +144,37 @@ mic_to_latin4(PG_FUNCTION_ARGS)
 }
 
 static void
-latin12mic(unsigned char *l, unsigned char *p, int len)
+latin12mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_1);
+       latin2mic(l, p, len, LC_ISO8859_1, PG_LATIN1);
 }
+
 static void
-mic2latin1(unsigned char *mic, unsigned char *p, int len)
+mic2latin1(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_1);
+       mic2latin(mic, p, len, LC_ISO8859_1, PG_LATIN1);
 }
+
 static void
-latin32mic(unsigned char *l, unsigned char *p, int len)
+latin32mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_3);
+       latin2mic(l, p, len, LC_ISO8859_3, PG_LATIN3);
 }
+
 static void
-mic2latin3(unsigned char *mic, unsigned char *p, int len)
+mic2latin3(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_3);
+       mic2latin(mic, p, len, LC_ISO8859_3, PG_LATIN3);
 }
+
 static void
-latin42mic(unsigned char *l, unsigned char *p, int len)
+latin42mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_4);
+       latin2mic(l, p, len, LC_ISO8859_4, PG_LATIN4);
 }
+
 static void
-mic2latin4(unsigned char *mic, unsigned char *p, int len)
+mic2latin4(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_4);
+       mic2latin(mic, p, len, LC_ISO8859_4, PG_LATIN4);
 }
index 0de1cc2c589b945576ab57a2c17d2664734c4116..ad31a547b779991b2d0026fbe6c45d3c38e513f7 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,6 +43,7 @@ ascii_to_utf8(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_UTF8);
        Assert(len >= 0);
 
+       /* looks wrong, but in effect this just rejects high-bit-set bytes */
        pg_ascii2mic(src, dest, len);
 
        PG_RETURN_VOID();
@@ -59,6 +60,7 @@ utf8_to_ascii(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_SQL_ASCII);
        Assert(len >= 0);
 
+       /* looks wrong, but in effect this just rejects high-bit-set bytes */
        pg_mic2ascii(src, dest, len);
 
        PG_RETURN_VOID();
index d3a76821fbcc28ba53478aa3aaa66ad9dd0d8200..920514201c57f6c1d2aa9f772b84255ad8118be8 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_big5(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapBIG5,
-                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), PG_BIG5, len);
 
        PG_RETURN_VOID();
 }
index d4c4f86fb8a5320a864c656d5427082f5c5c9ab8..9c972c35b71d66a6aed4916d65c0eb34e2a457d0 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -58,7 +58,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_KOI8R,
-                          sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), PG_KOI8R, len);
 
        PG_RETURN_VOID();
 }
@@ -92,7 +92,7 @@ utf8_to_win1251(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_WIN1251,
-                          sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), PG_WIN1251, len);
 
        PG_RETURN_VOID();
 }
@@ -126,7 +126,7 @@ utf8_to_alt(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_ALT,
-                          sizeof(ULmap_ALT) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_ALT) / sizeof(pg_utf_to_local), PG_ALT, len);
 
        PG_RETURN_VOID();
 }
index 79f54565367ba2036c3d3e85667de69db256dc13..c2ac8a8736d8e22a57d0abb67e423099f8004022 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_CN,
-                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), PG_EUC_CN, len);
 
        PG_RETURN_VOID();
 }
index 9fa364c2b2719cc478ad203aa6f42a1bcde58c62..ec383d644777d0108422334914867939e92d2fff 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_JP,
-                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), PG_EUC_JP, len);
 
        PG_RETURN_VOID();
 }
index be13ee25a070eb21d9b4c5a42b6b292b5980a0fe..1708b9d8690f6d82b478dab02eeedcfb053079bf 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_KR,
-                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), PG_EUC_KR, len);
 
        PG_RETURN_VOID();
 }
index f80a8f1701b420b27499e0a60ee027765e34c100..831fa97172b8de5d9e6da2ce652130176338d2c9 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_TW,
-                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), PG_EUC_TW, len);
 
        PG_RETURN_VOID();
 }
index 65c7d540ba1dfaa129827f381908b5a959bc0f61..ab5793ddd421df8c88d8f649f6fe9a86e5c8651f 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapGB18030,
-                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), PG_GB18030, len);
 
        PG_RETURN_VOID();
 }
index 660c52d7e1ebbe5c8fa68946e8d9e202c9b4331f..dc1eccf197d192cc0949d27894af17aae7a87c72 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapGBK,
-                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), PG_GBK, len);
 
        PG_RETURN_VOID();
 }
index 7c2e68a9d97c338575061ff878527cae35e78b35..6314f60c1eb980d879728fc8562a1124aa065ce3 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.5.2.1 2002/12/09 19:42:31 petere Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.5.2.2 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -151,7 +151,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(0) == PG_UTF8);
        Assert(len >= 0);
 
-       UtfToLocal(src, dest, maps[encoding].map2, maps[encoding].size2, len);
+       UtfToLocal(src, dest, maps[encoding].map2, maps[encoding].size2, encoding, len);
 
        PG_RETURN_VOID();
 }
index 78ca64e6d86c28f4c64438de917cd3f7139f6bdf..9d56effa06800ade09e0e4da69e3947b2c03bd80 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,15 +44,20 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_UTF8);
        Assert(len >= 0);
 
-       while (len-- > 0 && (c = *src++))
+       while (len > 0)
        {
-               if (c < 0x80)
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_LATIN1, (const char *) src, len);
+               if (!IS_HIGHBIT_SET(c))
                        *dest++ = c;
                else
                {
                        *dest++ = (c >> 6) | 0xc0;
                        *dest++ = (c & 0x003f) | 0x80;
                }
+               src++;
+               len--;
        }
        *dest = '\0';
 
@@ -66,29 +71,44 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS)
        unsigned char *dest = PG_GETARG_CSTRING(3);
        int                     len = PG_GETARG_INT32(4);
        unsigned short c,
-                               c1,
-                               c2;
+                               c1;
 
        Assert(PG_GETARG_INT32(0) == PG_UTF8);
        Assert(PG_GETARG_INT32(1) == PG_LATIN1);
        Assert(len >= 0);
 
-       while (len >= 0 && (c = *src++))
+       while (len > 0)
        {
-               if ((c & 0xe0) == 0xc0)
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_UTF8, (const char *) src, len);
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(c))
                {
-                       c1 = c & 0x1f;
-                       c2 = *src++ & 0x3f;
-                       *dest = c1 << 6;
-                       *dest++ |= c2;
-                       len -= 2;
+                       *dest++ = c;
+                       src++;
+                       len--;
                }
-               else if ((c & 0xe0) == 0xe0)
-                       elog(ERROR, "Could not convert UTF-8 to ISO8859-1");
                else
                {
-                       *dest++ = c;
-                       len--;
+                       int             l = pg_utf_mblen(src);
+
+                       if (l > len || !pg_utf8_islegal(src, l))
+                               report_invalid_encoding(PG_UTF8, (const char *) src, len);
+                       if (l != 2)
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
+                       c1 = src[1] & 0x3f;
+                       c = ((c & 0x1f) << 6) | c1;
+                       if (c >= 0x80 && c <= 0xff)
+                       {
+                               *dest++ = (unsigned char) c;
+                               src += 2;
+                               len -= 2;
+                       }
+                       else
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
                }
        }
        *dest = '\0';
index c6a312cf111912fef7f43eb43b7a14f4761994b5..e5ef95ba54d4dedd75d705c2ebfb6d4236c986d7 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.5.2.1 2006/05/21 20:07:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_johab(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapJOHAB,
-                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), PG_JOHAB, len);
 
        PG_RETURN_VOID();
 }
index dd7bc251c21b6ff9c331478b4492263a682f75b2..f5dc3a89779ac60ced3c6ea5ad2e9678f150a9c2 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapSJIS,
-                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), PG_SJIS, len);
 
        PG_RETURN_VOID();
 }
index 80ffbd8a608f19710db9c1ea6be24739e83ed90a..5c44f224778da83135f4fd3d22c8595b484e319e 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/Attic/utf8_and_tcvn.c,v 1.5 2002/10/26 15:01:00 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/Attic/utf8_and_tcvn.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_tcvn(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapTCVN,
-                          sizeof(ULmapTCVN) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapTCVN) / sizeof(pg_utf_to_local), PG_TCVN, len);
 
        PG_RETURN_VOID();
 }
index aec8537805fc96167cf76e499f93d51754879703..6c02c798f1908703237f9baece7a2e1443e5cb70 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.5 2002/10/26 15:01:01 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,7 +62,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapUHC,
-                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), PG_UHC, len);
 
        PG_RETURN_VOID();
 }
index d6e066137a527d0865682a35df8c5ad41091f571..f1482f6fe73a2f2426659351d40937b3f5c9dad6 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1250/Attic/utf8_and_win1250.c,v 1.5 2002/10/26 15:01:01 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1250/Attic/utf8_and_win1250.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@ utf_to_win1250(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN1250,
-                          sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), PG_WIN1250, len);
 
        PG_RETURN_VOID();
 }
index b7fd63796b9917fa0f9ae1a028d72804bec386f9..293b878140509eb2cf5d1b01b4cf15581364fa5b 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1256/Attic/utf8_and_win1256.c,v 1.5 2002/10/26 15:01:01 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1256/Attic/utf8_and_win1256.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@ utf_to_win1256(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN1256,
-                          sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), PG_WIN1256, len);
 
        PG_RETURN_VOID();
 }
index 0b5e7b2d8d5ad348eb1f39fcf00f0c662cdec6ab..b41b9889b4c1e15ca6c7f655ea13b97d34450c16 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win874/Attic/utf8_and_win874.c,v 1.5 2002/10/26 15:01:01 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win874/Attic/utf8_and_win874.c,v 1.5.2.1 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@ utf_to_win874(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN874,
-                          sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), PG_WIN874, len);
 
        PG_RETURN_VOID();
 }
index 154d58e84d143534c4e8e31794cda5f581dc31eb..ec8955dade087a01b4b471faccdeb8610e660189 100644 (file)
@@ -4,7 +4,7 @@
  * (currently mule internal code (mic) is used)
  * Tatsuo Ishii
  *
- * $Header: /cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v 1.36.2.2 2003/02/19 14:14:58 ishii Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v 1.36.2.3 2006/05/21 20:07:11 tgl Exp $
  */
 #include "postgres.h"
 
@@ -309,8 +309,48 @@ pg_client_to_server(unsigned char *s, int len)
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
+               return s;
+
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is needed, but we must still validate the data.
+                */
+               (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
+               return s;
+       }
+
+       if (DatabaseEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is possible, but we must still validate the data,
+                * because the client-side code might have done string escaping
+                * using the selected client_encoding.  If the client encoding is
+                * ASCII-safe then we just do a straight validation under that
+                * encoding.  For an ASCII-unsafe encoding we have a problem:
+                * we dare not pass such data to the parser but we have no way
+                * to convert it.  We compromise by rejecting the data if it
+                * contains any non-ASCII characters.
+                */
+               if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
+                       (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+               else
+               {
+                       int             i;
+
+                       for (i = 0; i < len; i++)
+                       {
+                               if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
+                                       elog(ERROR,
+                                                "invalid byte value for encoding \"%s\": 0x%02x",
+                                                pg_enc2name_tbl[PG_SQL_ASCII].name,
+                                                (unsigned char) s[i]);
+                       }
+               }
                return s;
+       }
 
        return perform_default_encoding_conversion(s, len, true);
 }
@@ -324,9 +364,14 @@ pg_server_to_client(unsigned char *s, int len)
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
                return s;
 
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII ||
+               DatabaseEncoding->encoding == PG_SQL_ASCII)
+               return s;               /* assume data is valid */
+
        return perform_default_encoding_conversion(s, len, false);
 }
 
@@ -345,9 +390,6 @@ perform_default_encoding_conversion(unsigned char *src, int len, bool is_client_
                                dest_encoding;
        FmgrInfo   *flinfo;
 
-       if (len <= 0)
-               return src;
-
        if (is_client_to_server)
        {
                src_encoding = ClientEncoding->encoding;
@@ -364,12 +406,6 @@ perform_default_encoding_conversion(unsigned char *src, int len, bool is_client_
        if (flinfo == NULL)
                return src;
 
-       if (src_encoding == dest_encoding)
-               return src;
-
-       if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
-               return src;
-
        result = palloc(len * 4 + 1);
 
        FunctionCall5(flinfo,
index ff78b30a91e506accaff7665c474b13ce1d8dbaa..fc4c4cbd82460109f85dc5a3c969b7e028d96489 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * conversion functions between pg_wchar and multibyte streams.
  * Tatsuo Ishii
- * $Id: wchar.c,v 1.30.2.2 2005/12/24 12:08:10 ishii Exp $
+ * $Id: wchar.c,v 1.30.2.3 2006/05/21 20:07:11 tgl Exp $
  *
  * WIN1250 client encoding updated by Pavel Behal
  *
@@ -91,7 +91,7 @@ static int    pg_euc2wchar_with_len
        return (cnt);
 }
 
-static int
+static inline int
 pg_euc_mblen(const unsigned char *s)
 {
        int                     len;
@@ -535,42 +535,422 @@ pg_gb18030_mblen(const unsigned char *s)
        return (len);
 }
 
+/*
+ *-------------------------------------------------------------------
+ * multibyte sequence validators
+ *
+ * These functions accept "s", a pointer to the first byte of a string,
+ * and "len", the remaining length of the string.  If there is a validly
+ * encoded character beginning at *s, return its length in bytes; else
+ * return -1.
+ *
+ * The functions can assume that len > 0 and that *s != '\0', but they must
+ * test for and reject zeroes in any additional bytes of a multibyte character.
+ *
+ * Note that this definition allows the function for a single-byte
+ * encoding to be just "return 1".
+ *-------------------------------------------------------------------
+ */
+
+static int
+pg_ascii_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
+
+#define IS_EUC_RANGE_VALID(c)  ((c) >= 0xa1 && (c) <= 0xfe)
+
+static int
+pg_eucjp_verifier(const unsigned char *s, int len)
+{
+       int                     l;
+       unsigned char c1, c2;
+
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* JIS X 0201 */
+                       l = 2;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (c2 < 0xa1 || c2 > 0xdf)
+                               return -1;
+                       break;
+
+               case SS3:               /* JIS X 0212 */
+                       l = 3;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* JIS X 0208? */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               if (!IS_EUC_RANGE_VALID(c1))
+                                       return -1;
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+
+       return l;
+}
+
+static int
+pg_euckr_verifier(const unsigned char *s, int len)
+{
+       int                     l;
+       unsigned char c1, c2;
+
+       c1 = *s++;
+
+       if (IS_HIGHBIT_SET(c1))
+       {
+               l = 2;
+               if (l > len)
+                       return -1;
+               if (!IS_EUC_RANGE_VALID(c1))
+                       return -1;
+               c2 = *s++;
+               if (!IS_EUC_RANGE_VALID(c2))
+                       return -1;
+       }
+       else            /* must be ASCII */
+       {
+               l = 1;
+       }
+
+       return l;
+}
+
+/* EUC-CN byte sequences are exactly the same as EUC-KR */
+#define pg_euccn_verifier      pg_euckr_verifier
 
+static int
+pg_euctw_verifier(const unsigned char *s, int len)
+{
+       int                     l;
+       unsigned char c1, c2;
+
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* CNS 11643 Plane 1-7 */
+                       l = 4;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       if (c2 < 0xa1 || c2 > 0xa7)
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               case SS3:               /* unused */
+                       return -1;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* CNS 11643 Plane 1 */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               /* no further range check on c1? */
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+       return l;
+}
+
+static int
+pg_johab_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+       unsigned char c;
+
+       l = mbl = pg_johab_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (!IS_HIGHBIT_SET(*s))
+               return mbl;
+
+       while (--l > 0)
+       {
+               c = *++s;
+               if (!IS_EUC_RANGE_VALID(c))
+                       return -1;
+       }
+       return mbl;
+}
+
+static int
+pg_mule_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+       unsigned char c;
+
+       l = mbl = pg_mule_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               c = *++s;
+               if (!IS_HIGHBIT_SET(c))
+                       return -1;
+       }
+       return mbl;
+}
+
+static int
+pg_latin1_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
+
+static int
+pg_sjis_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+       unsigned char c1, c2;
+
+       l = mbl = pg_sjis_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (l == 1)                                     /* pg_sjis_mblen already verified it */
+               return mbl;
+
+       c1 = *s++;
+       c2 = *s;
+       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
+               return -1;
+       return mbl;
+}
+
+static int
+pg_big5_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_big5_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_gbk_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_gbk_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_uhc_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_uhc_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_gb18030_verifier(const unsigned char *s, int len)
+{
+       int l, mbl;
+
+       l = mbl = pg_gb18030_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       while (--l > 0)
+       {
+               if (*++s == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+static int
+pg_utf8_verifier(const unsigned char *s, int len)
+{
+       int l = pg_utf_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       if (!pg_utf8_islegal(s, l))
+               return -1;
+
+       return l;
+}
+
+/*
+ * Check for validity of a single UTF-8 encoded character
+ *
+ * This directly implements the rules in RFC3629, modified to restrict
+ * us to 16-bit Unicode code points (hence, at most 3 bytes in UTF8).
+ * The bizarre-looking restrictions on the second byte are meant to
+ * ensure that there isn't more than one encoding of a given Unicode
+ * code point; that is, you may not use a longer-than-necessary byte
+ * sequence with high order zero bits to represent a character that
+ * would fit in fewer bytes.  To do otherwise is to create security
+ * hazards (eg, create an apparent non-ASCII character that decodes to
+ * plain ASCII).
+ *
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
+ * caller must have checked that that many bytes are present in the buffer.
+ */
+bool
+pg_utf8_islegal(const unsigned char *source, int length)
+{
+       unsigned char a;
+
+       switch (length)
+       {
+               default:
+                       /* reject lengths 4, 5 and 6 for now */
+                       return false;
+               case 3:
+                       a = source[2];
+                       if (a < 0x80 || a > 0xBF)
+                               return false;
+                       /* FALL THRU */
+               case 2:
+                       a = source[1];
+                       switch (*source)
+                       {
+                               case 0xE0:
+                                       if (a < 0xA0 || a > 0xBF)
+                                               return false;
+                                       break;
+                               case 0xED:
+                                       if (a < 0x80 || a > 0x9F)
+                                               return false;
+                                       break;
+                               default:
+                                       if (a < 0x80 || a > 0xBF)
+                                               return false;
+                                       break;
+                       }
+                       /* FALL THRU */
+               case 1:
+                       a = *source;
+                       if (a >= 0x80 && a < 0xC2)
+                               return false;
+                       if (a > 0xEF)
+                               return false;
+                       break;
+       }
+       return true;
+}
+
+/*
+ *-------------------------------------------------------------------
+ * encoding info table
+ *-------------------------------------------------------------------
+ */
 pg_wchar_tbl pg_wchar_table[] = {
-       {pg_ascii2wchar_with_len, pg_ascii_mblen, 1},           /* 0; PG_SQL_ASCII      */
-       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3},           /* 1; PG_EUC_JP */
-       {pg_euccn2wchar_with_len, pg_euccn_mblen, 3},           /* 2; PG_EUC_CN */
-       {pg_euckr2wchar_with_len, pg_euckr_mblen, 3},           /* 3; PG_EUC_KR */
-       {pg_euctw2wchar_with_len, pg_euctw_mblen, 3},           /* 4; PG_EUC_TW */
-       {pg_johab2wchar_with_len, pg_johab_mblen, 3},           /* 5; PG_JOHAB */
-       {pg_utf2wchar_with_len, pg_utf_mblen, 3},       /* 6; PG_UNICODE */
-       {pg_mule2wchar_with_len, pg_mule_mblen, 3}, /* 7; PG_MULE_INTERNAL */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 8; PG_LATIN1 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 9; PG_LATIN2 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 10; PG_LATIN3 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 11; PG_LATIN4 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 12; PG_LATIN5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 13; PG_LATIN6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 14; PG_LATIN7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 15; PG_LATIN8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 16; PG_LATIN9 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 17; PG_LATIN10 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 18; PG_WIN1256 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 19; PG_TCVN */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 20; PG_WIN874 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 21; PG_KOI8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 22; PG_WIN1251 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 23; PG_ALT */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 24; ISO-8859-5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 25; ISO-8859-6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 26; ISO-8859-7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 27; ISO-8859-8 */
-       {0, pg_sjis_mblen, 2},          /* 28; PG_SJIS */
-       {0, pg_big5_mblen, 2},          /* 29; PG_BIG5 */
-       {0, pg_gbk_mblen, 2},           /* 30; PG_GBK */
-       {0, pg_uhc_mblen, 2},           /* 31; PG_UHC */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, 1},         /* 32; PG_WIN1250 */
-       {0, pg_gb18030_mblen, 2}        /* 33; PG_GB18030 */
+       {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_verifier, 1},                /* 0; PG_SQL_ASCII      */
+       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_verifier, 3},                /* 1; PG_EUC_JP */
+       {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_verifier, 3},                /* 2; PG_EUC_CN */
+       {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_verifier, 3},                /* 3; PG_EUC_KR */
+       {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_verifier, 3},                /* 4; PG_EUC_TW */
+       {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_verifier, 3},                /* 5; PG_JOHAB */
+       {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf8_verifier, 3},     /* 6; PG_UNICODE */
+       {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 8; PG_LATIN1 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 9; PG_LATIN2 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 10; PG_LATIN3 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 11; PG_LATIN4 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 12; PG_LATIN5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 13; PG_LATIN6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 14; PG_LATIN7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 15; PG_LATIN8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 16; PG_LATIN9 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 17; PG_LATIN10 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 18; PG_WIN1256 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 19; PG_TCVN */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 20; PG_WIN874 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 21; PG_KOI8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 22; PG_WIN1251 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 23; PG_ALT */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 24; ISO-8859-5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 25; ISO-8859-6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 26; ISO-8859-7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 27; ISO-8859-8 */
+       {0, pg_sjis_mblen, pg_sjis_verifier, 2},                /* 28; PG_SJIS */
+       {0, pg_big5_mblen, pg_big5_verifier, 2},                /* 29; PG_BIG5 */
+       {0, pg_gbk_mblen, pg_gbk_verifier, 2},          /* 30; PG_GBK */
+       {0, pg_uhc_mblen, pg_uhc_verifier, 2},          /* 31; PG_UHC */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_verifier, 1},             /* 32; PG_WIN1250 */
+       {0, pg_gb18030_mblen, pg_gb18030_verifier, 2}   /* 33; PG_GB18030 */
 };
 
 /* returns the byte length of a word for mule internal code */
@@ -594,6 +974,22 @@ pg_encoding_mblen(int encoding, const unsigned char *mbstr)
                        ((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr)));
 }
 
+/*
+ * Verify the first multibyte character of the given string.
+ * Return its byte length if good, -1 if bad.  (See comments above for
+ * full details of the mbverify API.)
+ */
+int
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+{
+       Assert(PG_VALID_ENCODING(encoding));
+
+       return ((encoding >= 0 &&
+                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+               ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) :
+       ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));
+}
+
 /*
  * fetch maximum length of a char encoding
  */
@@ -606,87 +1002,144 @@ pg_encoding_max_length(int encoding)
 }
 
 #ifndef FRONTEND
+
 /*
- * Verify mbstr to make sure that it has a valid character sequence.
- * mbstr is not necessarily NULL terminated. length of mbstr is
- * specified by len. If an error was found, returns an error message.
- * Note that the message is kept in a static buffer, the next invocation
- * might break the message.
- * If no error was found, this function returns NULL.
+ * fetch maximum length of the encoding for the current database
  */
-char *
-pg_verifymbstr(const unsigned char *mbstr, int len)
+int
+pg_database_encoding_max_length(void)
 {
-       int                     l;
-       int                     i,
-                               j;
-       static char buf[256];
-       int                     slen = 0;
+       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
 
-       /* we do not check single byte encodings */
-       if (pg_database_encoding_max_length() <= 1)
-               return NULL;
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding.  Otherwise same as pg_verify_mbstr().
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+       return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
+ * specified by len.
+ *
+ * Returns true if the string is valid.  If a problem is found, returns false
+ * when noError is true; when noError is false, raises an error via elog(ERROR).
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+       mbverifier      mbverify;
+
+       Assert(PG_VALID_ENCODING(encoding));
 
-       while (len > 0 && *mbstr)
+       /*
+        * In single-byte encodings, we need only reject nulls (\0).
+        */
+       if (pg_encoding_max_length(encoding) <= 1)
        {
-               /* special UTF-8 check */
-               if (GetDatabaseEncoding() == PG_UTF8 &&
-                       (*mbstr & 0xf8) == 0xf0)
-               {
-                       snprintf(buf, sizeof(buf), "Unicode >= 0x10000 is not supported");
-                       return (buf);
-               }
+               const char *nullpos = memchr(mbstr, 0, len);
 
-               l = pg_mblen(mbstr);
+               if (nullpos == NULL)
+                       return true;
+               if (noError)
+                       return false;
+               report_invalid_encoding(encoding, nullpos, 1);
+       }
+
+       /* fetch function pointer just once */
+       mbverify = pg_wchar_table[encoding].mbverify;
+
+       while (len > 0)
+       {
+               int                     l;
 
-               /* multibyte letter? */
-               if (l > 1)
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(*mbstr))
                {
-                       for (i = 1; i < l; i++)
+                       if (*mbstr != '\0')
                        {
-                               if (i > len || *(mbstr + i) == '\0' ||
-
-                               /*
-                                * we assume that every multibyte letter consists of bytes
-                                * being the 8th bit set
-                                */
-                                       ((*(mbstr + i) & 0x80) == 0))
-                               {
-                                       int                     remains = sizeof(buf);
-                                       char       *p = buf;
-
-                                       slen = snprintf(p, remains, "Invalid %s character sequence found (0x",
-                                                                       GetDatabaseEncodingName());
-                                       p += slen;
-                                       remains -= slen;
-
-                                       i = ((*(mbstr + i) & 0x80) == 0) ? l : i;
-
-                                       for (j = 0; j < i; j++)
-                                       {
-                                               slen = snprintf(p, remains, "%02x",
-                                                                               *(mbstr + j));
-                                               p += slen;
-                                               remains -= slen;
-                                       }
-                                       snprintf(p, remains, ")");
-                                       return (buf);
-                               }
+                               mbstr++;
+                               len--;
+                               continue;
                        }
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
                }
-               len -= l;
+
+               l = (*mbverify) ((const unsigned char *) mbstr, len);
+
+               if (l < 0)
+               {
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
+               }
+
                mbstr += l;
+               len -= l;
        }
-       return NULL;
+       return true;
+}
+
+/*
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
+{
+       int                     l = pg_encoding_mblen(encoding, mbstr);
+       char            buf[8 * 2 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       elog(ERROR, "invalid byte sequence for encoding \"%s\": 0x%s",
+                pg_enc2name_tbl[encoding].name, buf);
 }
 
 /*
- * fetch maximum length of a char encoding for the current database
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
  */
-int
-pg_database_encoding_max_length(void)
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                  const char *mbstr, int len)
 {
-       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+       int                     l = pg_encoding_mblen(src_encoding, mbstr);
+       char            buf[8 * 2 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       for (j = 0; j < jlimit; j++)
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       elog(ERROR, "character 0x%s of encoding \"%s\" has no equivalent in \"%s\"",
+                buf,
+                pg_enc2name_tbl[src_encoding].name,
+                pg_enc2name_tbl[dest_encoding].name);
 }
 
 #endif
index f8e145ea96c680e4ff06936c59802778070f9418..586e88e177d4361799c687286ba0735f786c9c4e 100644 (file)
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: c.h,v 1.130.2.2 2006/01/05 00:51:52 tgl Exp $
+ * $Id: c.h,v 1.130.2.3 2006/05/21 20:07:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -610,6 +610,8 @@ typedef NameData *Name;
 
 /* msb for char */
 #define CSIGNBIT (0x80)
+#define HIGHBIT                                        (0x80)
+#define IS_HIGHBIT_SET(ch)             ((unsigned char)(ch) & HIGHBIT)
 
 #define STATUS_OK                              (0)
 #define STATUS_ERROR                   (-1)
index d5fd497b6a60936c1d25b4ae75afb89a95071333..341be74d8ecf1cf467c5c03d00660ecdecc6e868 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: pg_wchar.h,v 1.44.2.1 2003/02/19 14:14:58 ishii Exp $ */
+/* $Id: pg_wchar.h,v 1.44.2.2 2006/05/21 20:07:13 tgl Exp $ */
 
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
@@ -23,11 +23,17 @@ typedef unsigned int pg_wchar;
 #define SS2 0x8e                               /* single shift 2 (JIS0201) */
 #define SS3 0x8f                               /* single shift 3 (JIS0212) */
 
+/*
+ * SJIS validation macros
+ */
+#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
+#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
+
 /*
  * Leading byte types or leading prefix byte for MULE internal code.
  * See http://www.xemacs.org for more details. (there is a doc titled
  * "XEmacs Internals Manual", "MULE Character Sets and Encodings"
- * section.
+ * section.)
  */
 /*
  * Is a leading byte for "official" single byte encodings?
@@ -64,7 +70,7 @@ typedef unsigned int pg_wchar;
 #define LC_ISO8859_8   0x88    /* Hebrew (not supported yet) */
 #define LC_JISX0201K   0x89    /* Japanese 1 byte kana */
 #define LC_JISX0201R   0x8a    /* Japanese 1 byte Roman */
-/* Note that 0x8b seems to be unused in as of Emacs 20.7.
+/* Note that 0x8b seems to be unused as of Emacs 20.7.
  * However, there might be a chance that 0x8b could be used
  * in later version of Emacs.
  */
@@ -137,13 +143,13 @@ typedef unsigned int pg_wchar;
 /* #define FREE                0xff    free (unused) */
 
 /*
- * Encoding numeral identificators
+ * PostgreSQL encoding identifiers
  *
  * WARNING: the order of this table must be same as order
  *                     in the pg_enc2name[] (mb/encnames.c) array!
  *
- *                     If you add some encoding don'y forget check
- *                     PG_ENCODING_[BE|FE]_LAST macros.
+ *                     If you add an encoding, don't forget to check the
+ *                     PG_ENCODING_BE_LAST macro.
  *
  *             The PG_SQL_ASCII is default encoding and must be = 0.
  */
@@ -190,7 +196,7 @@ typedef enum pg_enc
 } pg_enc;
 
 #define PG_ENCODING_BE_LAST PG_ISO_8859_8
-#define PG_ENCODING_FE_LAST PG_WIN1256
+#define PG_ENCODING_FE_LAST PG_GB18030
 
 /*
  * Please use these tests before access to pg_encconv_tbl[]
@@ -199,14 +205,13 @@ typedef enum pg_enc
 #define PG_VALID_BE_ENCODING(_enc) \
                ((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST)
 
-#define PG_ENCODING_IS_CLIEN_ONLY(_enc) \
-               (((_enc) > PG_ENCODING_BE_LAST && (_enc) <= PG_ENCODING_FE_LAST)
+#define PG_ENCODING_IS_CLIENT_ONLY(_enc) \
+               ((_enc) > PG_ENCODING_BE_LAST && (_enc) <= PG_ENCODING_FE_LAST)
 
 #define PG_VALID_ENCODING(_enc) \
                ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_)
 
-/* On FE are possible all encodings
- */
+/* On the FE side, all encodings are possible */
 #define PG_VALID_FE_ENCODING(_enc)     PG_VALID_ENCODING(_enc)
 
 /*
@@ -246,14 +251,18 @@ extern const char *pg_encoding_to_char(int encoding);
 typedef int (*mb2wchar_with_len_converter) (const unsigned char *from,
                                                                                                                pg_wchar *to,
                                                                                                                int len);
+
 typedef int (*mblen_converter) (const unsigned char *mbstr);
 
+typedef int (*mbverifier) (const unsigned char *mbstr, int len);
+
 typedef struct
 {
        mb2wchar_with_len_converter mb2wchar_with_len;          /* convert a multibyte
                                                                                                                 * string to a wchar */
-       mblen_converter mblen;          /* returns the length of a multibyte char */
-       int                     maxmblen;               /* max bytes for a char in this charset */
+       mblen_converter mblen;          /* get byte length of a char */
+       mbverifier      mbverify;               /* verify multibyte sequence */
+       int                     maxmblen;               /* max bytes for a char in this encoding */
 } pg_wchar_tbl;
 
 extern pg_wchar_tbl pg_wchar_table[];
@@ -284,6 +293,7 @@ extern int  pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t
 extern size_t pg_wchar_strlen(const pg_wchar *wstr);
 extern int     pg_mblen(const unsigned char *mbstr);
 extern int     pg_encoding_mblen(int encoding, const unsigned char *mbstr);
+extern int     pg_encoding_verifymb(int encoding, const char *mbstr, int len);
 extern int     pg_mule_mblen(const unsigned char *mbstr);
 extern int     pg_mic_mblen(const unsigned char *mbstr);
 extern int     pg_mbstrlen(const unsigned char *mbstr);
@@ -317,20 +327,33 @@ extern unsigned char *pg_server_to_client(unsigned char *s, int len);
 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
 
-extern void LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len);
-
-extern void UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len);
-
-extern char *pg_verifymbstr(const unsigned char *mbstr, int len);
-
-extern void pg_ascii2mic(unsigned char *src, unsigned char *dest, int len);
-extern void pg_mic2ascii(unsigned char *src, unsigned char *dest, int len);
-extern void pg_print_bogus_char(unsigned char **mic, unsigned char **p);
-extern void latin2mic(unsigned char *l, unsigned char *p, int len, int lc);
-extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
-extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
-extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
+extern void LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len);
+
+extern void UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len);
+
+extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
+extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,
+                                                       bool noError);
+
+extern void report_invalid_encoding(int encoding, const char *mbstr, int len);
+extern void report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                                          const char *mbstr, int len);
+
+extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len);
+extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len);
+extern void latin2mic(const unsigned char *l, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void latin2mic_with_table(const unsigned char *l, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
+extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
+
+extern bool pg_utf8_islegal(const unsigned char *source, int length);
 
 #endif   /* PG_WCHAR_H */