]> git.ipfire.org Git - thirdparty/glibc.git/blame - iconvdata/iso-2022-cn.c
Update.
[thirdparty/glibc.git] / iconvdata / iso-2022-cn.c
CommitLineData
15a2315c 1/* Conversion module for ISO-2022-CN.
77e1d15a 2 Copyright (C) 1999, 2000 Free Software Foundation, Inc.
15a2315c
UD
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
55985355 21#include <dlfcn.h>
15a2315c
UD
22#include <gconv.h>
23#include <stdint.h>
24#include <string.h>
25#include "gb2312.h"
26#include "cns11643l1.h"
27#include "cns11643l2.h"
28
29#include <assert.h>
30
/* This makes obvious what everybody knows: 0x1b is the Esc character.  */
#define ESC   0x1b

/* We have single-byte shift-in (SI) and shift-out (SO) characters, and
   the two-byte single shift sequence SS2 (ESC followed by 0x4e) which
   selects the SS2-designated character set for the next two bytes only.  */
#define SI    0x0f
#define SO    0x0e
#define SS2_0 ESC
#define SS2_1 0x4e
/* Definitions used in the body of the `gconv' function.  */
#define CHARSET_NAME    "ISO-2022-CN//"
#define DEFINE_INIT     1
#define DEFINE_FINI     1
#define FROM_LOOP       from_iso2022cn_loop
#define TO_LOOP         to_iso2022cn_loop
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 4
#define MIN_NEEDED_TO   4
#define MAX_NEEDED_TO   4
/* Locals for the conversion function: SAVE_SET is the scratch copy of the
   state used by SAVE_RESET_STATE, SETP points at the state word and is
   handed to both loops through EXTRA_LOOP_ARGS.  */
#define PREPARE_LOOP \
  int save_set; \
  int *setp = &data->__statep->__count;
#define EXTRA_LOOP_ARGS , setp
56
57
/* The COUNT element of the state keeps track of the currently selected
   character set (the *_set values, selected with CURRENT_SEL_MASK) and
   of the character sets announced so far (the *_ann bits, selected with
   CURRENT_ANN_MASK).  The encoder derives an announcement bit from a set
   value as `16 << (set >> 3)', so the two groups must stay in step.  */
enum
{
  ASCII_set = 0,
  GB2312_set = 8,
  CNS11643_1_set = 16,
  CNS11643_2_set = 24,
  CURRENT_SEL_MASK = 24,
  GB2312_ann = 32,
  CNS11643_1_ann = 64,
  CNS11643_2_ann = 128,
  CURRENT_ANN_MASK = 224
};
72
73
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   The state word holds announcement bits besides the selected character
   set.  `SI' is needed only if an SO character set is still selected; if
   just announcement bits are left the output already is in ASCII and the
   state is cleared without emitting anything.  If `SI' is needed but the
   output buffer is full, STATUS is set to __GCONV_FULL_OUTPUT and the
   state is left untouched.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != ASCII_set) \
    { \
      if (FROM_DIRECTION \
          || (data->__statep->__count & CURRENT_SEL_MASK) == ASCII_set) \
        /* It's easy, we don't have to emit anything, we just reset the \
           state.  */ \
        data->__statep->__count = ASCII_set; \
      else \
        { \
          unsigned char *outbuf = data->__outbuf; \
 \
          /* We are not in the initial state.  To switch back we have \
             to emit `SI'.  */ \
          if (__builtin_expect (outbuf == data->__outbufend, 0)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = SI; \
              data->__outbuf = outbuf; \
              data->__statep->__count = ASCII_set; \
            } \
        } \
    }
102
103
/* Since we might have to reset input pointer we must be able to save
   and restore the state.  A non-zero SAVE copies the state word *SETP
   into SAVE_SET, zero copies SAVE_SET back into *SETP.  The expansion
   has no trailing semicolon; the user supplies it.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    save_set = *setp; \
  else \
    *setp = save_set
111
112
/* First define the conversion function from ISO-2022-CN to UCS4.

   BODY converts one input unit per execution.  It reads from INPTR (up to
   INEND), stores one UCS4 value at OUTPTR with put32, and keeps the
   shift state in SET and the last SO designation in ANN.  Errors are
   reported through RESULT followed by `break'; shift and designation
   sequences are consumed with `continue' without producing output.  */
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT  MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT           FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
 \
    /* This is a 7bit character set, disallow all 8bit characters.  */ \
    if (__builtin_expect (ch, 0) > 0x7f) \
      { \
        if (! ignore_errors_p ()) \
          { \
            result = __GCONV_ILLEGAL_INPUT; \
            break; \
          } \
 \
        /* Errors are ignored: skip the byte and count it.  */ \
        ++inptr; \
        ++*irreversible; \
        continue; \
      } \
 \
    /* Recognize escape sequences.  */ \
    if (__builtin_expect (ch, 0) == ESC) \
      { \
        /* There are two kinds of escape sequences we have to handle: \
           - those announcing the use of GB and CNS characters on the \
             line; we can simply ignore them \
           - the initial byte of the SS2 sequence. \
           First make sure the whole sequence is available; every index \
           used below is covered by one of these length tests.  */ \
        if (__builtin_expect (inptr + 2 > inend, 0) \
            || (inptr[1] == '$' \
                && (__builtin_expect (inptr + 3 > inend, 0) \
                    || (inptr[2] == ')' \
                        && __builtin_expect (inptr + 4 > inend, 0)) \
                    || (inptr[2] == '*' \
                        && __builtin_expect (inptr + 4 > inend, 0)))) \
            || (inptr[1] == SS2_1 \
                && __builtin_expect (inptr + 4 > inend, 0))) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        if (inptr[1] == '$' \
            && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
                || (inptr[2] == '*' && inptr[3] == 'H'))) \
          { \
            /* OK, we accept those character sets.  Only the SO \
               designations (`A', `G') are remembered; the SS2 \
               designation (`H') is just skipped.  */ \
            if (inptr[3] == 'A') \
              ann = GB2312_ann; \
            else if (inptr[3] == 'G') \
              ann = CNS11643_1_ann; \
            inptr += 4; \
            continue; \
          } \
      } \
    else if (__builtin_expect (ch, 0) == SO) \
      { \
        /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
           SO designation came last.  The only problem is what to do with \
           faulty input files where no designator came. \
           XXX For now I'll default to use GB2312.  If this is not the \
           best behaviour (e.g., we should flag an error) let me know.  */ \
        ++inptr; \
        set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
        continue; \
      } \
    else if (__builtin_expect (ch, 0) == SI) \
      { \
        /* Switch to use ASCII.  */ \
        ++inptr; \
        set = ASCII_set; \
        continue; \
      } \
 \
    if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \
      { \
        /* This is a character from CNS 11643 plane 2. \
           XXX We could test here whether the use of this character \
           set was announced.  */ \
        inptr += 2; \
        ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
        if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  Point back at the \
                   start of the SS2 sequence.  */ \
                inptr -= 2; \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
 \
            ++*irreversible; \
            continue; \
          } \
      } \
    else if (set == ASCII_set) \
      { \
        /* Almost done, just advance the input pointer.  */ \
        ++inptr; \
      } \
    else \
      { \
        /* That's pretty easy, we have a dedicated functions for this.  */ \
        if (set == GB2312_set) \
          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
        else \
          { \
            assert (set == CNS11643_1_set); \
            ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
          } \
 \
        if (__builtin_expect (ch, 1) == 0) \
          { \
            /* Zero is treated as "not enough input bytes".  */ \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  */ \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
 \
            ++inptr; \
            ++*irreversible; \
            continue; \
          } \
      } \
 \
    put32 (outptr, ch); \
    outptr += 4; \
  }
/* Parameters for the loop template included next: split the state word
   *SETP into the selected set and the announcement bits on entry, and
   merge them back when the loop is left.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS , int *setp
#define INIT_PARAMS      int set = *setp & CURRENT_SEL_MASK; \
                         int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS    *setp = set | ann
254#include <iconv/loop.c>
255
256
/* Next, define the other direction.

   BODY converts one UCS4 value read from INPTR into ISO-2022-CN bytes at
   OUTPTR (limit OUTEND).  SET is the character set selected by the
   locking shifts SI/SO (ASCII, GB2312 or CNS 11643 plane 1); ANN holds
   the character sets announced on the current line.  CNS 11643 plane 2
   is reached through the single shift SS2, which covers one character
   only: it is written in front of every plane 2 character and never
   changes SET.

   Nothing is written ahead of the space check that covers it, unless the
   state records it (announcement in ANN, SO in SET), so the conversion
   can be resumed after __GCONV_FULL_OUTPUT without repeating or losing
   bytes.  */
#define MIN_NEEDED_INPUT  MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT           TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
 \
    /* First see whether we can write the character using the currently \
       selected character set.  */ \
    if (ch < 0x80) \
      { \
        if (set != ASCII_set) \
          { \
            /* NOTE(review): writing SI unchecked presumably relies on \
               the loop providing MIN_NEEDED_OUTPUT (one) free byte; \
               confirm in the loop template.  */ \
            *outptr++ = SI; \
            set = ASCII_set; \
            if (__builtin_expect (outptr == outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
 \
        *outptr++ = ch; \
 \
        /* At the end of the line we have to clear the `ann' flags since \
           every line must contain this information again.  */ \
        if (ch == L'\n') \
          ann = 0; \
      } \
    else \
      { \
        char buf[2]; \
        int used; \
        size_t written = 0; \
 \
        /* Prefer the SO set which needs no new designation.  */ \
        if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
          { \
            written = ucs4_to_gb2312 (ch, buf, 2); \
            used = GB2312_set; \
          } \
        else \
          { \
            written = ucs4_to_cns11643l1 (ch, buf, 2); \
            used = CNS11643_1_set; \
          } \
 \
        if (written == __UNKNOWN_10646_CHAR) \
          { \
            /* Cannot convert it using the currently selected SO set. \
               Next try the SS2 set.  */ \
            written = ucs4_to_cns11643l2 (ch, buf, 2); \
            if (written != __UNKNOWN_10646_CHAR) \
              /* Yep, that worked.  */ \
              used = CNS11643_2_set; \
            else \
              { \
                /* Well, see whether we have to change the SO set.  */ \
                if (used == GB2312_set) \
                  written = ucs4_to_cns11643l1 (ch, buf, 2); \
                else \
                  written = ucs4_to_gb2312 (ch, buf, 2); \
 \
                if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \
                  /* Oh well, then switch SO.  */ \
                  used = GB2312_set + CNS11643_1_set - used; \
                else \
                  { \
                    /* Even this does not work.  Error.  */ \
                    STANDARD_ERR_HANDLER (4); \
                  } \
              } \
          } \
        assert (written == 2); \
 \
        /* First see whether we announced that we use this character \
           set.  The announcement bit is 16 << (used >> 3).  */ \
        if ((ann & (16 << (used >> 3))) == 0) \
          { \
            const char *escseq; \
 \
            if (__builtin_expect (outend - outptr < 4, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
 \
            assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
            escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \
            *outptr++ = ESC; \
            *outptr++ = '$'; \
            *outptr++ = *escseq++; \
            *outptr++ = *escseq++; \
 \
            /* The two SO sets share one designation, so announcing one \
               withdraws the other; the SS2 set is independent.  */ \
            if (used == GB2312_set) \
              ann = (ann & CNS11643_2_ann) | GB2312_ann; \
            else if (used == CNS11643_1_set) \
              ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
            else \
              ann |= CNS11643_2_ann; \
          } \
 \
        if (used == CNS11643_2_set) \
          { \
            /* The single shift and the character it applies to must go \
               out together, otherwise a retry after a full buffer would \
               repeat the shift.  SET is not touched.  */ \
            if (__builtin_expect (outend - outptr < 4, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            *outptr++ = SS2_0; \
            *outptr++ = SS2_1; \
          } \
        else \
          { \
            /* We only have to emit something if currently ASCII is \
               selected.  Otherwise we are switching within the \
               SO charset.  */ \
            if (set == ASCII_set) \
              { \
                if (__builtin_expect (outend - outptr < 1, 0)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SO; \
              } \
            /* Record the locking shift right away so that it is not \
               emitted a second time if we have to stop below.  */ \
            set = used; \
 \
            if (__builtin_expect (outend - outptr < 2, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
 \
        *outptr++ = buf[0]; \
        *outptr++ = buf[1]; \
      } \
 \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
/* Parameters for the loop template included next: split the state word
   *SETP into the selected set and the announcement bits on entry, and
   merge them back when the loop is left.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS , int *setp
#define INIT_PARAMS      int set = *setp & CURRENT_SEL_MASK; \
                         int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS    *setp = set | ann
416#include <iconv/loop.c>
417
418
419/* Now define the toplevel functions. */
420#include <iconv/skeleton.c>