]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix usage of char2wchar/wchar2char. Changes:
authorTeodor Sigaev <teodor@sigaev.ru>
Mon, 2 Mar 2009 15:13:17 +0000 (15:13 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Mon, 2 Mar 2009 15:13:17 +0000 (15:13 +0000)
- pg_wchar and wchar_t could have different sizes, so char2wchar
  no longer calls pg_mb2wchar_with_len, to prevent an out-of-bounds
  memory bug
- make char2wchar/wchar2char symmetric; now they should not be
  called with C-locale because mbstowcs/wcstombs often don't
  work correctly with C-locale.
- Text parser uses pg_mb2wchar_with_len directly in case of
  C-locale and multibyte encoding

Per bug report by Hiroshi Inoue <inoue@tpf.co.jp> and
following discussion.

Backpatch back to 8.2, when multibyte support was implemented in tsearch.

contrib/tsearch2/ts_locale.c
contrib/tsearch2/wordparser/parser.c
contrib/tsearch2/wordparser/parser.h

index cb022d7e2a46458b2d0def4819cfb6a5f8e23fcc..46e6a1ac00752055821a18d6baa7dc941cf75d5c 100644 (file)
@@ -64,15 +64,8 @@ char2wchar(wchar_t *to, const char *from, size_t len)
        }
        else 
 #endif /* WIN32 */
-       if ( lc_ctype_is_c() )
-       {
-               /*
-                * pg_mb2wchar_with_len always adds trailing '\0', so 
-                * 'to' should be allocated with sufficient space 
-                */
-               return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
-       }
 
+       Assert( !lc_ctype_is_c() );
        return mbstowcs(to, from, len);
 }
 
index af58f59a9946b1d7693b3a7fbb8c9bfec18fe16b..19b2cfd228e63bf073733db0c77ebdde66b520e7 100644 (file)
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11.2.2 2007/03/22 15:59:09 teodor Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11.2.3 2009/03/02 15:13:17 teodor Exp $ */
 
 #include "postgres.h"
 
@@ -46,12 +46,24 @@ TParserInit(char *str, int len)
        if (prs->charmaxlen > 1)
        {
                prs->usewide = true;
-               prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
-               prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
+               if ( lc_ctype_is_c() )
+               {
+                       /*
+                        * char2wchar doesn't work for C-locale and
+                        * sizeof(pg_wchar) could be not equal to sizeof(wchar_t)
+                        */
+                       prs->pgwstr = (pg_wchar*) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+                       pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
+               }
+               else
+               {
+                       prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
+                       prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
+               }
        }
        else
-#endif
                prs->usewide = false;
+#endif
 
        prs->state = newTParserPosition(NULL);
        prs->state->state = TPS_Base;
@@ -73,17 +85,21 @@ TParserClose(TParser * prs)
 #ifdef TS_USE_WIDE
        if (prs->wstr)
                pfree(prs->wstr);
+       if (prs->pgwstr)
+               pfree(prs->pgwstr);
 #endif
 
        pfree(prs);
 }
 
 /*
- * defining support function, equvalent is* macroses, but
- * working with any possible encodings and locales. Note,
- * that with multibyte encoding and C-locale isw* function may fail
- * or give wrong result. Note 2: multibyte encoding and C-locale 
- * often are used for Asian languages.
+ * Character-type support functions, equivalent to is* macros, but
+ * working with any possible encodings and locales. Notes:
+ *  - with multibyte encoding and C-locale isw* function may fail
+ *    or give wrong result. 
+ *  - multibyte encoding and C-locale often are used for 
+ *    Asian languages.
+ *  - if locale is C then we use pgwstr instead of wstr
  */
 
 #ifdef TS_USE_WIDE
@@ -94,8 +110,8 @@ p_is##type(TParser *prs) {                                                                                                   \
        Assert( prs->state );                                                                                                   \
        if ( prs->usewide )                                                                                                             \
        {                                                                                                                                               \
-               if ( lc_ctype_is_c() )                                                                                          \
-                       return is##type( 0xff & *( prs->wstr + prs->state->poschar) );  \
+               if ( prs->pgwstr )                                                                                                      \
+                       return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\
                                                                                                                                                        \
                return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );      \
        }                                                                                                                                               \
@@ -115,9 +131,9 @@ p_isalnum(TParser *prs)
 
        if (prs->usewide)
        {
-               if (lc_ctype_is_c())
+               if (prs->pgwstr)
                {
-                       unsigned int c = *(prs->wstr + prs->state->poschar);
+                       unsigned int c = *(prs->pgwstr + prs->state->poschar);
 
                        /*
                         * any non-ascii symbol with multibyte encoding
@@ -148,9 +164,9 @@ p_isalpha(TParser *prs)
 
        if (prs->usewide)
        {
-               if (lc_ctype_is_c())
+               if (prs->pgwstr)
                {
-                       unsigned int c = *(prs->wstr + prs->state->poschar);
+                       unsigned int c = *(prs->pgwstr + prs->state->poschar);
 
                        /*
                         * any non-ascii symbol with multibyte encoding
index c40717a80f82de1f723e8b1a3d0dc9bc676dd6c3..67c30904952d2bfdfec00519b053e8df0b63da6f 100644 (file)
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.h,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.h,v 1.11.2.1 2009/03/02 15:13:17 teodor Exp $ */
 
 #ifndef __PARSER_H__
 #define __PARSER_H__
@@ -138,12 +138,13 @@ typedef struct TParser
        int                     lenstr;                 /* length of mbstring */
 #ifdef TS_USE_WIDE
        wchar_t    *wstr;                       /* wide character string */
+       pg_wchar   *pgwstr;                     /* wide character string for C-locale */
        int                     lenwstr;                /* length of wsting */
+       bool            usewide;
 #endif
 
        /* State of parse */
        int                     charmaxlen;
-       bool            usewide;
        TParserPosition *state;
        bool            ignore;
        bool            wanthost;