]>
Commit | Line | Data |
---|---|---|
56f48ce9 DB |
1 | /* Multibyte Character Functions. |
2 | Copyright (C) 1998 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of GNU CC. | |
5 | ||
6 | GNU CC is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | GNU CC is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GNU CC; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
56f48ce9 DB |
21 | /* Note regarding cross compilation: |
22 | ||
c5c76735 | 23 | In general, translation of multibyte characters to wide characters can |
56f48ce9 DB |
24 | only work in a native compiler since the translation function (mbtowc) |
25 | needs to know about both the source and target character encoding. However, | |
26 | this particular implementation for JIS, SJIS and EUCJP source characters | |
27 | will work for any compiler with a newlib target. Other targets may also | |
28 | work provided that their wchar_t implementation is 2 bytes and the encoding | |
29 | leaves the source character values unchanged (except for removing the | |
30 | state shifting markers). */ | |
31 | ||
56f48ce9 | 32 | #include "config.h" |
f0225099 | 33 | #ifdef MULTIBYTE_CHARS |
56f48ce9 | 34 | #include "system.h" |
56f48ce9 DB |
35 | #include "mbchar.h" |
36 | #include <locale.h> | |
37 | ||
c5c76735 JL |
/* Input classes fed to the JIS state machine.  Each byte of the source
   is mapped to exactly one of these and used as the column index of the
   state and action tables.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* Any other byte accepted by ISJIS.  */
  OTHER,        /* Everything else.  */
  JIS_C_NUM     /* Number of input classes (table width).  */
} JIS_CHAR_TYPE;
56f48ce9 | 40 | |
c5c76735 JL |
/* States of the JIS decoder.  The value is used as the row index of the
   state and action tables.  */
typedef enum
{
  ASCII,        /* Single-byte mode; also the initial state.  */
  A_ESC,        /* Single-byte mode, ESC seen.  */
  A_ESC_DL,     /* Single-byte mode, ESC '$' seen.  */
  JIS,          /* Two-byte mode, no byte of the character seen yet.  */
  JIS_1,        /* Two-byte mode, first byte seen.  */
  JIS_2,        /* Two-byte mode, both bytes seen.  */
  J_ESC,        /* Two-byte mode, ESC seen before any character byte.  */
  J_ESC_BR,     /* As J_ESC, followed by '('.  */
  J2_ESC,       /* Two-byte mode, ESC seen after a complete character.  */
  J2_ESC_BR,    /* As J2_ESC, followed by '('.  */
  INV,          /* Invalid sequence.  */
  JIS_S_NUM     /* Number of states (table height).  */
} JIS_STATE;
43 | ||
/* Actions the JIS decoder performs after consuming one input byte.  */
typedef enum
{
  COPYA,        /* Deliver the single byte at the character start.  */
  COPYJ,        /* Deliver a two-byte character; the length includes the
                   byte just consumed.  */
  COPYJ2,       /* Deliver a two-byte character; the byte just consumed
                   is not part of the returned length.  */
  MAKE_A,       /* Character data starts at the next byte.  */
  MAKE_J,       /* Character data starts at the next byte.  */
  NOOP,         /* Nothing to do; keep scanning.  */
  EMPTY,        /* Deliver a null wide character.  */
  ERROR         /* Invalid input; fail.  */
} JIS_ACTION;
46 | ||
47 | /* State/action tables for processing JIS encoding: | |
48 | ||
49 | Where possible, switches to JIS are grouped with the following JIS characters | |
50 | and switches to ASCII are grouped with the preceding JIS characters. | |
51 | Thus, maximum returned length is: | |
52 | 3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8. */ | |
56f48ce9 | 53 | |
56f48ce9 | 54 | static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = { |
c5c76735 | 55 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/ |
56f48ce9 DB |
56 | /*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, |
57 | /*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, | |
58 | /*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII}, | |
59 | /*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV }, | |
60 | /*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV }, | |
61 | /*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS }, | |
62 | /*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV }, | |
63 | /*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
64 | /*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV }, | |
65 | /*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
66 | }; | |
67 | ||
68 | static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = { | |
c5c76735 | 69 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */ |
56f48ce9 DB |
70 | /*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA}, |
71 | /*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA}, | |
72 | /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA}, | |
c5c76735 JL |
73 | /*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR}, |
74 | /*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR}, | |
56f48ce9 | 75 | /*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2}, |
c5c76735 JL |
76 | /*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR}, |
77 | /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR}, | |
78 | /*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR}, | |
79 | /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR}, | |
56f48ce9 DB |
80 | }; |
81 | ||
82 | ||
/* Name of the encoding used by local_mbtowc and local_mb_cur_max.
   "C-SJIS", "C-EUCJP" and "C-JIS" are handled in this file; a null
   pointer or a name of at most one character selects the default
   behavior.  Never assigned in this file -- presumably set by the
   code that processes the locale; verify against the callers.  */
5da1ecf2 | 83 | const char *literal_codeset = NULL;
56f48ce9 | 84 | |
c5c76735 JL |
85 | /* Store into *PWC (if PWC is not null) the wide character |
86 | corresponding to the multibyte character at the start of the | |
87 | buffer S of size N. Return the number of bytes in the multibyte | |
88 | character. Return -1 if the bytes do not form a valid character, | |
89 | or 0 if S is null or points to a null byte. | |
90 | ||
91 | This function behaves like the Standard C function mbtowc, except | |
92 | it treats locale names of the form "C-..." specially. */ | |
93 | ||
56f48ce9 DB |
94 | int |
95 | local_mbtowc (pwc, s, n) | |
c5c76735 | 96 | wchar_t *pwc; |
5da1ecf2 | 97 | const char *s; |
c5c76735 | 98 | size_t n; |
56f48ce9 DB |
99 | { |
100 | static JIS_STATE save_state = ASCII; | |
101 | JIS_STATE curr_state = save_state; | |
5da1ecf2 | 102 | const unsigned char *t = (const unsigned char *) s; |
56f48ce9 DB |
103 | |
104 | if (s != NULL && n == 0) | |
105 | return -1; | |
106 | ||
107 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
c5c76735 JL |
108 | /* This must be the "C" locale or unknown locale -- fall thru */ |
109 | ; | |
56f48ce9 DB |
110 | else if (! strcmp (literal_codeset, "C-SJIS")) |
111 | { | |
112 | int char1; | |
113 | if (s == NULL) | |
c5c76735 JL |
114 | /* Not state-dependent. */ |
115 | return 0; | |
116 | ||
56f48ce9 DB |
117 | char1 = *t; |
118 | if (ISSJIS1 (char1)) | |
119 | { | |
120 | int char2 = t[1]; | |
c5c76735 | 121 | |
56f48ce9 DB |
122 | if (n <= 1) |
123 | return -1; | |
c5c76735 | 124 | |
56f48ce9 DB |
125 | if (ISSJIS2 (char2)) |
126 | { | |
127 | if (pwc != NULL) | |
c5c76735 | 128 | *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1)); |
56f48ce9 DB |
129 | return 2; |
130 | } | |
c5c76735 | 131 | |
56f48ce9 DB |
132 | return -1; |
133 | } | |
c5c76735 | 134 | |
56f48ce9 | 135 | if (pwc != NULL) |
c5c76735 JL |
136 | *pwc = (wchar_t) *t; |
137 | ||
56f48ce9 DB |
138 | if (*t == '\0') |
139 | return 0; | |
c5c76735 | 140 | |
56f48ce9 DB |
141 | return 1; |
142 | } | |
143 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
144 | { | |
145 | int char1; | |
c5c76735 | 146 | |
56f48ce9 | 147 | if (s == NULL) |
c5c76735 JL |
148 | /* Not state-dependent. */ |
149 | return 0; | |
150 | ||
56f48ce9 DB |
151 | char1 = *t; |
152 | if (ISEUCJP (char1)) | |
153 | { | |
154 | int char2 = t[1]; | |
c5c76735 | 155 | |
56f48ce9 DB |
156 | if (n <= 1) |
157 | return -1; | |
c5c76735 | 158 | |
56f48ce9 DB |
159 | if (ISEUCJP (char2)) |
160 | { | |
161 | if (pwc != NULL) | |
c5c76735 | 162 | *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1)); |
56f48ce9 DB |
163 | return 2; |
164 | } | |
c5c76735 | 165 | |
56f48ce9 DB |
166 | return -1; |
167 | } | |
c5c76735 | 168 | |
56f48ce9 | 169 | if (pwc != NULL) |
c5c76735 JL |
170 | *pwc = (wchar_t) *t; |
171 | ||
56f48ce9 DB |
172 | if (*t == '\0') |
173 | return 0; | |
c5c76735 | 174 | |
56f48ce9 DB |
175 | return 1; |
176 | } | |
177 | else if (! strcmp (literal_codeset, "C-JIS")) | |
178 | { | |
179 | JIS_ACTION action; | |
180 | JIS_CHAR_TYPE ch; | |
5da1ecf2 KG |
181 | const unsigned char *ptr; |
182 | size_t i, curr_ch; | |
56f48ce9 DB |
183 | |
184 | if (s == NULL) | |
185 | { | |
186 | save_state = ASCII; | |
dc297297 | 187 | /* State-dependent. */ |
c5c76735 | 188 | return 1; |
56f48ce9 DB |
189 | } |
190 | ||
191 | ptr = t; | |
192 | ||
c5c76735 | 193 | for (i = 0; i < n; i++) |
56f48ce9 DB |
194 | { |
195 | curr_ch = t[i]; | |
196 | switch (curr_ch) | |
197 | { | |
198 | case JIS_ESC_CHAR: | |
199 | ch = ESCAPE; | |
200 | break; | |
201 | case '$': | |
202 | ch = DOLLAR; | |
203 | break; | |
204 | case '@': | |
205 | ch = AT; | |
206 | break; | |
207 | case '(': | |
208 | ch = BRACKET; | |
209 | break; | |
210 | case 'B': | |
211 | ch = B; | |
212 | break; | |
213 | case 'J': | |
214 | ch = J; | |
215 | break; | |
216 | case '\0': | |
217 | ch = NUL; | |
218 | break; | |
219 | default: | |
220 | if (ISJIS (curr_ch)) | |
221 | ch = JIS_CHAR; | |
222 | else | |
223 | ch = OTHER; | |
224 | } | |
225 | ||
226 | action = JIS_action_table[curr_state][ch]; | |
227 | curr_state = JIS_state_table[curr_state][ch]; | |
228 | ||
229 | switch (action) | |
230 | { | |
231 | case NOOP: | |
232 | break; | |
c5c76735 | 233 | |
56f48ce9 DB |
234 | case EMPTY: |
235 | if (pwc != NULL) | |
c5c76735 JL |
236 | *pwc = (wchar_t) 0; |
237 | ||
56f48ce9 DB |
238 | save_state = curr_state; |
239 | return i; | |
c5c76735 | 240 | |
56f48ce9 DB |
241 | case COPYA: |
242 | if (pwc != NULL) | |
c5c76735 | 243 | *pwc = (wchar_t) *ptr; |
56f48ce9 | 244 | save_state = curr_state; |
c5c76735 JL |
245 | return i + 1; |
246 | ||
56f48ce9 DB |
247 | case COPYJ: |
248 | if (pwc != NULL) | |
c5c76735 JL |
249 | *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1)); |
250 | ||
56f48ce9 | 251 | save_state = curr_state; |
c5c76735 JL |
252 | return i + 1; |
253 | ||
56f48ce9 DB |
254 | case COPYJ2: |
255 | if (pwc != NULL) | |
c5c76735 JL |
256 | *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1)); |
257 | ||
56f48ce9 | 258 | save_state = curr_state; |
c5c76735 JL |
259 | return ptr - t + 2; |
260 | ||
56f48ce9 DB |
261 | case MAKE_A: |
262 | case MAKE_J: | |
5da1ecf2 | 263 | ptr = (const unsigned char *) (t + i + 1); |
56f48ce9 | 264 | break; |
c5c76735 | 265 | |
56f48ce9 DB |
266 | case ERROR: |
267 | default: | |
268 | return -1; | |
269 | } | |
270 | } | |
271 | ||
c5c76735 JL |
272 | /* More than n bytes needed. */ |
273 | return -1; | |
56f48ce9 DB |
274 | } |
275 | ||
276 | #ifdef CROSS_COMPILE | |
277 | if (s == NULL) | |
c5c76735 JL |
278 | /* Not state-dependent. */ |
279 | return 0; | |
280 | ||
56f48ce9 DB |
281 | if (pwc != NULL) |
282 | *pwc = *s; | |
283 | return 1; | |
284 | #else | |
c5c76735 | 285 | |
dc297297 | 286 | /* This must be the "C" locale or unknown locale. */ |
56f48ce9 DB |
287 | return mbtowc (pwc, s, n); |
288 | #endif | |
289 | } | |
290 | ||
c5c76735 JL |
/* Measure the multibyte character at the start of the buffer S of
   size N.  The result is the character's length in bytes, -1 when the
   bytes are not a valid character, or 0 when S is null or points to a
   null byte.

   This is the counterpart of the Standard C function mblen, with the
   same special handling of "C-..." locale names as local_mbtowc.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* With no destination, local_mbtowc only reports the length.  */
  int length = local_mbtowc ((wchar_t *) NULL, s, n);

  return length;
}
305 | ||
c5c76735 JL |
306 | /* Return the maximum mumber of bytes in a multibyte character. |
307 | ||
308 | This function returns the same value as the Standard C macro MB_CUR_MAX, | |
309 | except it treats locale names of the form "C-..." specially. */ | |
310 | ||
56f48ce9 DB |
311 | int |
312 | local_mb_cur_max () | |
313 | { | |
314 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
315 | ; | |
316 | else if (! strcmp (literal_codeset, "C-SJIS")) | |
317 | return 2; | |
318 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
319 | return 2; | |
320 | else if (! strcmp (literal_codeset, "C-JIS")) | |
321 | return 8; /* 3 + 2 + 3 */ | |
322 | ||
323 | #ifdef CROSS_COMPILE | |
324 | return 1; | |
325 | #else | |
4d2a3f76 DB |
326 | if (MB_CUR_MAX > 0) |
327 | return MB_CUR_MAX; | |
328 | ||
329 | return 1; /* default */ | |
56f48ce9 DB |
330 | #endif |
331 | } | |
12cf91fe ZW |
332 | #else /* MULTIBYTE_CHARS */ |
333 | extern int dummy; /* silence 'ANSI C forbids an empty source file' warning */ | |
56f48ce9 | 334 | #endif /* MULTIBYTE_CHARS */ |