]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/mbchar.c
Oops, missed ChangeLog in last checkin...
[thirdparty/gcc.git] / gcc / mbchar.c
CommitLineData
56f48ce9
DB
1/* Multibyte Character Functions.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3
4This file is part of GNU CC.
5
6GNU CC is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU CC is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with GNU CC; see the file COPYING. If not, write to
18the Free Software Foundation, 59 Temple Place - Suite 330,
19Boston, MA 02111-1307, USA. */
20
56f48ce9
DB
21/* Note regarding cross compilation:
22
c5c76735 23 In general, translation of multibyte characters to wide characters can
56f48ce9
DB
24 only work in a native compiler since the translation function (mbtowc)
25 needs to know about both the source and target character encoding. However,
26 this particular implementation for JIS, SJIS and EUCJP source characters
27 will work for any compiler with a newlib target. Other targets may also
28 work provided that their wchar_t implementation is 2 bytes and the encoding
29 leaves the source character values unchanged (except for removing the
30 state shifting markers). */
31
32#ifdef MULTIBYTE_CHARS
33#include "config.h"
34#include "system.h"
56f48ce9
DB
35#include "mbchar.h"
36#include <locale.h>
37
c5c76735
JL
/* Class assigned to each input byte while decoding "C-JIS" text.  The
   class is the column index into JIS_state_table and JIS_action_table.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte for which ISJIS is true */
  OTHER,        /* any remaining byte */
  JIS_C_NUM     /* number of classes; used only as a table dimension */
} JIS_CHAR_TYPE;
56f48ce9 40
c5c76735
JL
/* Decoder states for "C-JIS" text.  The state is the row index into
   JIS_state_table and JIS_action_table.  */
typedef enum
{
  ASCII,        /* single-byte mode; the initial state */
  A_ESC,        /* single-byte mode, escape character seen */
  A_ESC_DL,     /* single-byte mode, escape then '$' seen */
  JIS,          /* two-byte mode, no byte of the next character seen */
  JIS_1,        /* two-byte mode, first byte seen */
  JIS_2,        /* two-byte mode, both bytes seen */
  J_ESC,        /* two-byte mode, escape character seen */
  J_ESC_BR,     /* two-byte mode, escape then '(' seen */
  J2_ESC,       /* both bytes seen, then escape character */
  J2_ESC_BR,    /* both bytes seen, then escape and '(' */
  INV,          /* invalid input was encountered */
  JIS_S_NUM     /* number of states; used only as a table dimension */
} JIS_STATE;
43
/* What local_mbtowc does after classifying one byte of "C-JIS" text.  */
typedef enum
{
  COPYA,        /* store one byte as the wide character and return */
  COPYJ,        /* store a two-byte character; return the bytes scanned */
  COPYJ2,       /* store a two-byte character; return up to its 2nd byte */
  MAKE_A,       /* handled exactly like MAKE_J */
  MAKE_J,       /* remember that the character starts after this byte */
  NOOP,         /* nothing to do; continue with the next byte */
  EMPTY,        /* store a zero wide character and return */
  ERROR         /* invalid sequence; return -1 */
} JIS_ACTION;
46
47/* State/action tables for processing JIS encoding:
48
49 Where possible, switches to JIS are grouped with the following JIS characters
50 and switches to ASCII are grouped with the preceding JIS characters.
51 Thus, the maximum returned length is:
52 3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8. */
56f48ce9 53
56f48ce9 54static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
c5c76735 55/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/
56f48ce9
DB
56/*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
57/*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
58/*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII},
59/*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV },
60/*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV },
61/*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS },
62/*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
63/*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
64/*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV },
65/*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
66};
67
68static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
c5c76735 69/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */
56f48ce9
DB
70/*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA},
71/*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA},
72/*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
c5c76735
JL
73/*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
74/*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
56f48ce9 75/*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
c5c76735
JL
76/*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
77/*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR},
78/*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
79/*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR},
56f48ce9
DB
80};
81
82
/* Name of the codeset consulted by local_mbtowc and local_mb_cur_max.
   When it is NULL or at most one character long, those functions take
   their default path; the names "C-SJIS", "C-EUCJP" and "C-JIS" select
   the decoders implemented in this file.  */
5da1ecf2 83const char *literal_codeset = NULL;
56f48ce9 84
c5c76735
JL
85/* Store into *PWC (if PWC is not null) the wide character
86 corresponding to the multibyte character at the start of the
87 buffer S of size N. Return the number of bytes in the multibyte
88 character. Return -1 if the bytes do not form a valid character,
89 or 0 if S is null or points to a null byte.
90
91 This function behaves like the Standard C function mbtowc, except
92 it treats locale names of the form "C-..." specially. */
93
56f48ce9
DB
94int
95local_mbtowc (pwc, s, n)
c5c76735 96 wchar_t *pwc;
5da1ecf2 97 const char *s;
c5c76735 98 size_t n;
56f48ce9
DB
99{
100 static JIS_STATE save_state = ASCII;
101 JIS_STATE curr_state = save_state;
5da1ecf2 102 const unsigned char *t = (const unsigned char *) s;
56f48ce9
DB
103
104 if (s != NULL && n == 0)
105 return -1;
106
107 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
c5c76735
JL
108 /* This must be the "C" locale or unknown locale -- fall thru */
109 ;
56f48ce9
DB
110 else if (! strcmp (literal_codeset, "C-SJIS"))
111 {
112 int char1;
113 if (s == NULL)
c5c76735
JL
114 /* Not state-dependent. */
115 return 0;
116
56f48ce9
DB
117 char1 = *t;
118 if (ISSJIS1 (char1))
119 {
120 int char2 = t[1];
c5c76735 121
56f48ce9
DB
122 if (n <= 1)
123 return -1;
c5c76735 124
56f48ce9
DB
125 if (ISSJIS2 (char2))
126 {
127 if (pwc != NULL)
c5c76735 128 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
56f48ce9
DB
129 return 2;
130 }
c5c76735 131
56f48ce9
DB
132 return -1;
133 }
c5c76735 134
56f48ce9 135 if (pwc != NULL)
c5c76735
JL
136 *pwc = (wchar_t) *t;
137
56f48ce9
DB
138 if (*t == '\0')
139 return 0;
c5c76735 140
56f48ce9
DB
141 return 1;
142 }
143 else if (! strcmp (literal_codeset, "C-EUCJP"))
144 {
145 int char1;
c5c76735 146
56f48ce9 147 if (s == NULL)
c5c76735
JL
148 /* Not state-dependent. */
149 return 0;
150
56f48ce9
DB
151 char1 = *t;
152 if (ISEUCJP (char1))
153 {
154 int char2 = t[1];
c5c76735 155
56f48ce9
DB
156 if (n <= 1)
157 return -1;
c5c76735 158
56f48ce9
DB
159 if (ISEUCJP (char2))
160 {
161 if (pwc != NULL)
c5c76735 162 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
56f48ce9
DB
163 return 2;
164 }
c5c76735 165
56f48ce9
DB
166 return -1;
167 }
c5c76735 168
56f48ce9 169 if (pwc != NULL)
c5c76735
JL
170 *pwc = (wchar_t) *t;
171
56f48ce9
DB
172 if (*t == '\0')
173 return 0;
c5c76735 174
56f48ce9
DB
175 return 1;
176 }
177 else if (! strcmp (literal_codeset, "C-JIS"))
178 {
179 JIS_ACTION action;
180 JIS_CHAR_TYPE ch;
5da1ecf2
KG
181 const unsigned char *ptr;
182 size_t i, curr_ch;
56f48ce9
DB
183
184 if (s == NULL)
185 {
186 save_state = ASCII;
c5c76735
JL
187 /* State-dependent. */
188 return 1;
56f48ce9
DB
189 }
190
191 ptr = t;
192
c5c76735 193 for (i = 0; i < n; i++)
56f48ce9
DB
194 {
195 curr_ch = t[i];
196 switch (curr_ch)
197 {
198 case JIS_ESC_CHAR:
199 ch = ESCAPE;
200 break;
201 case '$':
202 ch = DOLLAR;
203 break;
204 case '@':
205 ch = AT;
206 break;
207 case '(':
208 ch = BRACKET;
209 break;
210 case 'B':
211 ch = B;
212 break;
213 case 'J':
214 ch = J;
215 break;
216 case '\0':
217 ch = NUL;
218 break;
219 default:
220 if (ISJIS (curr_ch))
221 ch = JIS_CHAR;
222 else
223 ch = OTHER;
224 }
225
226 action = JIS_action_table[curr_state][ch];
227 curr_state = JIS_state_table[curr_state][ch];
228
229 switch (action)
230 {
231 case NOOP:
232 break;
c5c76735 233
56f48ce9
DB
234 case EMPTY:
235 if (pwc != NULL)
c5c76735
JL
236 *pwc = (wchar_t) 0;
237
56f48ce9
DB
238 save_state = curr_state;
239 return i;
c5c76735 240
56f48ce9
DB
241 case COPYA:
242 if (pwc != NULL)
c5c76735 243 *pwc = (wchar_t) *ptr;
56f48ce9 244 save_state = curr_state;
c5c76735
JL
245 return i + 1;
246
56f48ce9
DB
247 case COPYJ:
248 if (pwc != NULL)
c5c76735
JL
249 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
250
56f48ce9 251 save_state = curr_state;
c5c76735
JL
252 return i + 1;
253
56f48ce9
DB
254 case COPYJ2:
255 if (pwc != NULL)
c5c76735
JL
256 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
257
56f48ce9 258 save_state = curr_state;
c5c76735
JL
259 return ptr - t + 2;
260
56f48ce9
DB
261 case MAKE_A:
262 case MAKE_J:
5da1ecf2 263 ptr = (const unsigned char *) (t + i + 1);
56f48ce9 264 break;
c5c76735 265
56f48ce9
DB
266 case ERROR:
267 default:
268 return -1;
269 }
270 }
271
c5c76735
JL
272 /* More than n bytes needed. */
273 return -1;
56f48ce9
DB
274 }
275
276#ifdef CROSS_COMPILE
277 if (s == NULL)
c5c76735
JL
278 /* Not state-dependent. */
279 return 0;
280
56f48ce9
DB
281 if (pwc != NULL)
282 *pwc = *s;
283 return 1;
284#else
c5c76735 285
56f48ce9
DB
286 /* This must be the "C" locale or unknown locale. */
287 return mbtowc (pwc, s, n);
288#endif
289}
290
c5c76735
JL
/* Measure the multibyte character at the start of the buffer S of
   size N without converting it.  The result is exactly what
   local_mbtowc returns for the same S and N when it is given no place
   to store the wide character.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  int length;

  /* A null destination makes local_mbtowc skip the store.  */
  length = local_mbtowc (NULL, s, n);
  return length;
}
305
c5c76735
JL
306/* Return the maximum mumber of bytes in a multibyte character.
307
308 This function returns the same value as the Standard C macro MB_CUR_MAX,
309 except it treats locale names of the form "C-..." specially. */
310
56f48ce9
DB
311int
312local_mb_cur_max ()
313{
314 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
315 ;
316 else if (! strcmp (literal_codeset, "C-SJIS"))
317 return 2;
318 else if (! strcmp (literal_codeset, "C-EUCJP"))
319 return 2;
320 else if (! strcmp (literal_codeset, "C-JIS"))
321 return 8; /* 3 + 2 + 3 */
322
323#ifdef CROSS_COMPILE
324 return 1;
325#else
4d2a3f76
DB
326 if (MB_CUR_MAX > 0)
327 return MB_CUR_MAX;
328
329 return 1; /* default */
56f48ce9
DB
330#endif
331}
332#endif /* MULTIBYTE_CHARS */