]>
Commit | Line | Data |
---|---|---|
15a2315c | 1 | /* Conversion module for ISO-2022-CN. |
77e1d15a | 2 | Copyright (C) 1999, 2000 Free Software Foundation, Inc. |
15a2315c UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Library General Public License as | |
8 | published by the Free Software Foundation; either version 2 of the | |
9 | License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Library General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Library General Public | |
17 | License along with the GNU C Library; see the file COPYING.LIB. If not, | |
18 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
55985355 | 21 | #include <dlfcn.h> |
15a2315c UD |
22 | #include <gconv.h> |
23 | #include <stdint.h> | |
24 | #include <string.h> | |
25 | #include "gb2312.h" | |
26 | #include "cns11643l1.h" | |
27 | #include "cns11643l2.h" | |
28 | ||
29 | #include <assert.h> | |
30 | ||
31 | /* This makes obvious what everybody knows: 0x1b is the Esc character. */ | |
32 | #define ESC 0x1b | |
33 | ||
34 | /* We have single-byte shift-in and shift-out sequences, and the single | |
35 | shift sequence SS2 which replaces the SS2 designation for the next | |
36 | two bytes. */ | |
37 | #define SI 0x0f | |
38 | #define SO 0x0e | |
39 | #define SS2_0 ESC | |
40 | #define SS2_1 0x4e | |
41 | ||
42 | /* Definitions used in the body of the `gconv' function. */ | |
43 | #define CHARSET_NAME "ISO-2022-CN//" | |
44 | #define DEFINE_INIT 1 | |
45 | #define DEFINE_FINI 1 | |
46 | #define FROM_LOOP from_iso2022cn_loop | |
47 | #define TO_LOOP to_iso2022cn_loop | |
48 | #define MIN_NEEDED_FROM 1 | |
49 | #define MAX_NEEDED_FROM 4 | |
50 | #define MIN_NEEDED_TO 4 | |
51 | #define MAX_NEEDED_TO 4 | |
52 | #define PREPARE_LOOP \ | |
53 | int save_set; \ | |
aa831d6d | 54 | int *setp = &data->__statep->__count; |
15a2315c UD |
55 | #define EXTRA_LOOP_ARGS , setp |
56 | ||
57 | ||
58 | /* The COUNT element of the state keeps track of the currently selected | |
59 | character set. The possible values are: */ | |
60 | enum | |
61 | { | |
62 | ASCII_set = 0, | |
fd1b5c0f UD |
63 | GB2312_set = 8, |
64 | CNS11643_1_set = 16, | |
65 | CNS11643_2_set = 24, | |
66 | CURRENT_SEL_MASK = 24, | |
67 | GB2312_ann = 32, | |
68 | CNS11643_1_ann = 64, | |
69 | CNS11643_2_ann = 128, | |
70 | CURRENT_ANN_MASK = 224 | |
15a2315c UD |
71 | }; |
72 | ||
73 | ||
74 | /* Since this is a stateful encoding we have to provide code which resets | |
75 | the output state to the initial state. This has to be done during the | |
76 | flushing. */ | |
77 | #define EMIT_SHIFT_TO_INIT \ | |
aa831d6d | 78 | if (data->__statep->__count != ASCII_set) \ |
15a2315c UD |
79 | { \ |
80 | if (FROM_DIRECTION) \ | |
81 | /* It's easy, we don't have to emit anything, we just reset the \ | |
82 | state for the input. */ \ | |
aa831d6d | 83 | data->__statep->__count = ASCII_set; \ |
15a2315c UD |
84 | else \ |
85 | { \ | |
86 | unsigned char *outbuf = data->__outbuf; \ | |
87 | \ | |
88 | /* We are not in the initial state. To switch back we have \ | |
89 | to emit `SI'. */ \ | |
89301d68 | 90 | if (__builtin_expect (outbuf == data->__outbufend, 0)) \ |
15a2315c UD |
91 | /* We don't have enough room in the output buffer. */ \ |
92 | status = __GCONV_FULL_OUTPUT; \ | |
93 | else \ | |
94 | { \ | |
95 | /* Write out the shift sequence. */ \ | |
96 | *outbuf++ = SI; \ | |
15a2315c | 97 | data->__outbuf = outbuf; \ |
aa831d6d | 98 | data->__statep->__count = ASCII_set; \ |
15a2315c UD |
99 | } \ |
100 | } \ | |
101 | } | |
102 | ||
103 | ||
104 | /* Since we might have to reset the input pointer we must be able to save | |
105 | and restore the state. */ | |
106 | #define SAVE_RESET_STATE(Save) \ | |
107 | if (Save) \ | |
108 | save_set = *setp; \ | |
109 | else \ | |
110 | *setp = save_set | |
111 | ||
112 | ||
113 | /* First define the conversion function from ISO-2022-CN to UCS4. */ | |
114 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
115 | #define MAX_NEEDED_INPUT MAX_NEEDED_FROM | |
116 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
117 | #define LOOPFCT FROM_LOOP | |
118 | #define BODY \ | |
119 | { \ | |
120 | uint32_t ch = *inptr; \ | |
121 | \ | |
122 | /* This is a 7bit character set, disallow all 8bit characters. */ \ | |
89301d68 | 123 | if (__builtin_expect (ch, 0) > 0x7f) \ |
15a2315c | 124 | { \ |
85830c4c UD |
125 | if (! ignore_errors_p ()) \ |
126 | { \ | |
127 | result = __GCONV_ILLEGAL_INPUT; \ | |
128 | break; \ | |
129 | } \ | |
130 | \ | |
131 | ++inptr; \ | |
38677ace | 132 | ++*irreversible; \ |
85830c4c | 133 | continue; \ |
15a2315c UD |
134 | } \ |
135 | \ | |
136 | /* Recognize escape sequences. */ \ | |
89301d68 | 137 | if (__builtin_expect (ch, 0) == ESC) \ |
15a2315c UD |
138 | { \ |
139 | /* There are two kinds of escape sequences we have to handle: \ | |
140 | - those announcing the use of GB and CNS characters on the \ | |
141 | line; we remember the SO designation and skip them \ |
142 | - the initial byte of the SS2 sequence. \ | |
143 | */ \ | |
755104ed | 144 | if (__builtin_expect (inptr + 2 > inend, 0) \ |
55985355 | 145 | || (inptr[1] == '$' \ |
755104ed | 146 | && (__builtin_expect (inptr + 3 > inend, 0) \ |
55985355 | 147 | || (inptr[2] == ')' \ |
755104ed | 148 | && __builtin_expect (inptr + 4 > inend, 0)) \ |
55985355 | 149 | || (inptr[2] == '*' \ |
755104ed | 150 | && __builtin_expect (inptr + 4 > inend, 0)))) \ |
55985355 | 151 | || (inptr[1] == SS2_1 \ |
755104ed | 152 | && __builtin_expect (inptr + 4 > inend, 0))) \ |
15a2315c | 153 | { \ |
c7c3b0e9 | 154 | result = __GCONV_INCOMPLETE_INPUT; \ |
15a2315c UD |
155 | break; \ |
156 | } \ | |
157 | if (inptr[1] == '$' \ | |
158 | && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \ | |
159 | || (inptr[2] == '*' && inptr[3] == 'H'))) \ | |
160 | { \ | |
161 | /* OK, we accept those character sets. */ \ | |
162 | if (inptr[3] == 'A') \ | |
163 | ann = GB2312_ann; \ | |
164 | else if (inptr[3] == 'G') \ | |
165 | ann = CNS11643_1_ann; \ | |
166 | inptr += 4; \ | |
167 | continue; \ | |
168 | } \ | |
169 | } \ | |
89301d68 | 170 | else if (__builtin_expect (ch, 0) == SO) \ |
15a2315c UD |
171 | { \ |
172 | /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \ | |
173 | SO designation came last. The only problem is what to do with \ |
174 | faulty input files where no designator came. \ | |
175 | XXX For now I'll default to use GB2312. If this is not the \ | |
176 | best behaviour (e.g., we should flag an error) let me know. */ \ | |
177 | ++inptr; \ | |
178 | set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \ | |
179 | continue; \ | |
180 | } \ | |
89301d68 | 181 | else if (__builtin_expect (ch, 0) == SI) \ |
15a2315c UD |
182 | { \ |
183 | /* Switch to use ASCII. */ \ | |
184 | ++inptr; \ | |
185 | set = ASCII_set; \ | |
186 | continue; \ | |
187 | } \ | |
188 | \ | |
89301d68 | 189 | if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \ |
15a2315c UD |
190 | { \ |
191 | /* This is a character from CNS 11643 plane 2. \ | |
192 | XXX We could test here whether the use of this character \ | |
193 | set was announced. */ \ | |
194 | inptr += 2; \ | |
195 | ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \ | |
89301d68 | 196 | if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \ |
15a2315c | 197 | { \ |
85830c4c UD |
198 | if (! ignore_errors_p ()) \ |
199 | { \ | |
200 | /* This is an illegal character. */ \ | |
201 | inptr -= 2; \ | |
202 | result = __GCONV_ILLEGAL_INPUT; \ | |
203 | break; \ | |
204 | } \ | |
205 | \ | |
38677ace | 206 | ++*irreversible; \ |
85830c4c | 207 | continue; \ |
15a2315c UD |
208 | } \ |
209 | } \ | |
210 | else if (set == ASCII_set) \ | |
211 | { \ | |
212 | /* Almost done, just advance the input pointer. */ \ | |
213 | ++inptr; \ | |
214 | } \ | |
215 | else \ | |
216 | { \ | |
217 | /* That's pretty easy, we have dedicated functions for this. */ \ |
218 | if (set == GB2312_set) \ | |
55985355 | 219 | ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \ |
15a2315c UD |
220 | else \ |
221 | { \ | |
222 | assert (set == CNS11643_1_set); \ | |
55985355 | 223 | ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \ |
15a2315c UD |
224 | } \ |
225 | \ | |
55985355 | 226 | if (__builtin_expect (ch, 1) == 0) \ |
15a2315c | 227 | { \ |
c7c3b0e9 | 228 | result = __GCONV_INCOMPLETE_INPUT; \ |
15a2315c UD |
229 | break; \ |
230 | } \ | |
89301d68 | 231 | else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \ |
15a2315c | 232 | { \ |
85830c4c UD |
233 | if (! ignore_errors_p ()) \ |
234 | { \ | |
235 | /* This is an illegal character. */ \ | |
236 | result = __GCONV_ILLEGAL_INPUT; \ | |
237 | break; \ | |
238 | } \ | |
239 | \ | |
240 | ++inptr; \ | |
38677ace | 241 | ++*irreversible; \ |
85830c4c | 242 | continue; \ |
15a2315c UD |
243 | } \ |
244 | } \ | |
245 | \ | |
77e1d15a UD |
246 | put32 (outptr, ch); \ |
247 | outptr += 4; \ | |
15a2315c | 248 | } |
55985355 | 249 | #define LOOP_NEED_FLAGS |
15a2315c | 250 | #define EXTRA_LOOP_DECLS , int *setp |
fd1b5c0f UD |
251 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
252 | int ann = *setp & CURRENT_ANN_MASK | |
15a2315c UD |
253 | #define UPDATE_PARAMS *setp = set | ann |
254 | #include <iconv/loop.c> | |
255 | ||
256 | ||
257 | /* Next, define the other direction. */ | |
258 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO | |
259 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM | |
260 | #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM | |
261 | #define LOOPFCT TO_LOOP | |
262 | #define BODY \ | |
263 | { \ | |
89301d68 | 264 | uint32_t ch = get32 (inptr); \ |
15a2315c UD |
265 | \ |
266 | /* First see whether we can write the character using the currently \ | |
267 | selected character set. */ \ | |
268 | if (ch < 0x80) \ | |
269 | { \ | |
270 | if (set != ASCII_set) \ | |
271 | { \ | |
272 | *outptr++ = SI; \ | |
273 | set = ASCII_set; \ | |
55985355 | 274 | if (__builtin_expect (outptr == outend, 0)) \ |
15a2315c UD |
275 | { \ |
276 | result = __GCONV_FULL_OUTPUT; \ | |
277 | break; \ | |
278 | } \ | |
279 | } \ | |
280 | \ | |
281 | *outptr++ = ch; \ | |
15a2315c UD |
282 | \ |
283 | /* At the end of the line we have to clear the `ann' flags since \ | |
284 | every line must contain this information again. */ \ | |
285 | if (ch == L'\n') \ | |
286 | ann = 0; \ | |
287 | } \ | |
288 | else \ | |
289 | { \ | |
290 | char buf[2]; \ | |
291 | int used; \ | |
89301d68 | 292 | size_t written = 0; \ |
15a2315c UD |
293 | \ |
294 | if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \ | |
295 | { \ | |
296 | written = ucs4_to_gb2312 (ch, buf, 2); \ | |
297 | used = GB2312_set; \ | |
298 | } \ | |
299 | else \ | |
300 | { \ | |
301 | written = ucs4_to_cns11643l1 (ch, buf, 2); \ | |
302 | used = CNS11643_1_set; \ | |
303 | } \ | |
304 | \ | |
305 | if (written == __UNKNOWN_10646_CHAR) \ | |
306 | { \ | |
307 | /* Cannot convert it using the currently selected SO set. \ | |
308 | Next try the SS2 set. */ \ | |
309 | written = ucs4_to_cns11643l2 (ch, buf, 2); \ | |
310 | if (written != __UNKNOWN_10646_CHAR) \ | |
311 | /* Yep, that worked. */ \ | |
312 | used = CNS11643_2_set; \ | |
313 | else \ | |
314 | { \ | |
315 | /* Well, see whether we have to change the SO set. */ \ | |
755104ed | 316 | if (used == GB2312_set) \ |
15a2315c UD |
317 | written = ucs4_to_cns11643l1 (ch, buf, 2); \ |
318 | else \ | |
319 | written = ucs4_to_gb2312 (ch, buf, 2); \ | |
320 | \ | |
1f20e04b | 321 | if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \ |
15a2315c | 322 | /* Oh well, then switch SO. */ \ |
755104ed | 323 | used = GB2312_set + CNS11643_1_set - used; \ |
15a2315c UD |
324 | else \ |
325 | { \ | |
326 | /* Even this does not work. Error. */ \ | |
d6204268 | 327 | STANDARD_ERR_HANDLER (4); \ |
15a2315c UD |
328 | } \ |
329 | } \ | |
330 | } \ | |
331 | assert (written == 2); \ | |
332 | \ | |
333 | /* See whether we have to emit an escape sequence. */ \ | |
334 | if (set != used) \ | |
335 | { \ | |
336 | /* First see whether we announced that we use this \ | |
337 | character set. */ \ | |
755104ed | 338 | if ((ann & (16 << (used >> 3))) == 0) \ |
15a2315c UD |
339 | { \ |
340 | const char *escseq; \ | |
341 | \ | |
55985355 | 342 | if (__builtin_expect (outptr + 4 > outend, 0)) \ |
15a2315c UD |
343 | { \ |
344 | result = __GCONV_FULL_OUTPUT; \ | |
345 | break; \ | |
346 | } \ | |
347 | \ | |
755104ed UD |
348 | assert ((used >> 3) >= 1 && (used >> 3) <= 3); \ |
349 | escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \ | |
350 | *outptr++ = ESC; \ | |
351 | *outptr++ = '$'; \ | |
15a2315c UD |
352 | *outptr++ = *escseq++; \ |
353 | *outptr++ = *escseq++; \ | |
354 | \ | |
355 | if (used == GB2312_set) \ | |
356 | ann = (ann & CNS11643_2_ann) | GB2312_ann; \ | |
357 | else if (used == CNS11643_1_set) \ | |
358 | ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \ | |
359 | else \ | |
360 | ann |= CNS11643_2_ann; \ | |
361 | } \ | |
362 | \ | |
363 | if (used == CNS11643_2_set) \ | |
364 | { \ | |
89301d68 | 365 | if (__builtin_expect (outptr + 2 > outend, 0)) \ |
15a2315c UD |
366 | { \ |
367 | result = __GCONV_FULL_OUTPUT; \ | |
368 | break; \ | |
369 | } \ | |
370 | *outptr++ = SS2_0; \ | |
371 | *outptr++ = SS2_1; \ | |
372 | } \ | |
373 | else \ | |
374 | { \ | |
375 | /* We only have to emit something if ASCII is currently \ |
376 | selected. Otherwise we are switching within the \ | |
377 | SO charset. */ \ | |
378 | if (set == ASCII_set) \ | |
379 | { \ | |
89301d68 | 380 | if (__builtin_expect (outptr + 1 > outend, 0)) \ |
15a2315c UD |
381 | { \ |
382 | result = __GCONV_FULL_OUTPUT; \ | |
383 | break; \ | |
384 | } \ | |
385 | *outptr++ = SO; \ | |
386 | } \ | |
387 | } \ | |
388 | \ | |
389 | /* Always test the length here since we have used up all the \ | |
390 | guaranteed output buffer slots. */ \ | |
89301d68 | 391 | if (__builtin_expect (outptr + 2 > outend, 0)) \ |
15a2315c UD |
392 | { \ |
393 | result = __GCONV_FULL_OUTPUT; \ | |
394 | break; \ | |
395 | } \ | |
396 | } \ | |
55985355 | 397 | else if (__builtin_expect (outptr + 2 > outend, 0)) \ |
15a2315c UD |
398 | { \ |
399 | result = __GCONV_FULL_OUTPUT; \ | |
400 | break; \ | |
401 | } \ | |
402 | \ | |
403 | *outptr++ = buf[0]; \ | |
404 | *outptr++ = buf[1]; \ | |
755104ed | 405 | set = used; \ |
15a2315c UD |
406 | } \ |
407 | \ | |
408 | /* Now that we wrote the output increment the input pointer. */ \ | |
409 | inptr += 4; \ | |
410 | } | |
55985355 | 411 | #define LOOP_NEED_FLAGS |
15a2315c | 412 | #define EXTRA_LOOP_DECLS , int *setp |
fd1b5c0f UD |
413 | #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ |
414 | int ann = *setp & CURRENT_ANN_MASK | |
15a2315c UD |
415 | #define UPDATE_PARAMS *setp = set | ann |
416 | #include <iconv/loop.c> | |
417 | ||
418 | ||
419 | /* Now define the toplevel functions. */ | |
420 | #include <iconv/skeleton.c> |