]> git.ipfire.org Git - thirdparty/glibc.git/blame - iconvdata/iso-2022-cn.c
Update copyright notices with scripts/update-copyrights
[thirdparty/glibc.git] / iconvdata / iso-2022-cn.c
CommitLineData
15a2315c 1/* Conversion module for ISO-2022-CN.
d4697bc9 2 Copyright (C) 1999-2014 Free Software Foundation, Inc.
15a2315c
UD
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6 The GNU C Library is free software; you can redistribute it and/or
41bdb6e2
AJ
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
15a2315c
UD
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2 14 Lesser General Public License for more details.
15a2315c 15
41bdb6e2 16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
15a2315c 19
55985355 20#include <dlfcn.h>
15a2315c
UD
21#include <gconv.h>
22#include <stdint.h>
23#include <string.h>
24#include "gb2312.h"
25#include "cns11643l1.h"
26#include "cns11643l2.h"
27
28#include <assert.h>
29
30/* This makes obvious what everybody knows: 0x1b is the Esc character. */
31#define ESC 0x1b
32
33/* We have the single-byte shift-in (SI) and shift-out (SO) characters, and
34 the two-byte single shift sequence SS2 (ESC followed by 0x4e) which
35 applies to the next two bytes only. */
36#define SI 0x0f
37#define SO 0x0e
38#define SS2_0 ESC
39#define SS2_1 0x4e
40
41/* Definitions used in the body of the `gconv' function. */
42#define CHARSET_NAME "ISO-2022-CN//"
43#define DEFINE_INIT 1
44#define DEFINE_FINI 1
45#define FROM_LOOP from_iso2022cn_loop
46#define TO_LOOP to_iso2022cn_loop
faaa6f62
UD
47#define FROM_LOOP_MIN_NEEDED_FROM 1
48#define FROM_LOOP_MAX_NEEDED_FROM 4
49#define FROM_LOOP_MIN_NEEDED_TO 4
50#define FROM_LOOP_MAX_NEEDED_TO 4
51#define TO_LOOP_MIN_NEEDED_FROM 4
52#define TO_LOOP_MAX_NEEDED_FROM 4
53#define TO_LOOP_MIN_NEEDED_TO 1
54#define TO_LOOP_MAX_NEEDED_TO 6
15a2315c
UD
/* Extra locals: SAVE_SET is the slot used by SAVE_RESET_STATE, SETP points
 at the COUNT element of the conversion state and is handed to the loop
 functions through EXTRA_LOOP_ARGS. */
55#define PREPARE_LOOP \
56 int save_set; \
aa831d6d 57 int *setp = &data->__statep->__count;
15a2315c
UD
58#define EXTRA_LOOP_ARGS , setp
59
60
61/* The COUNT element of the state keeps track of the currently selected
62 character set (one of the *_set values, extracted with
 CURRENT_SEL_MASK) together with the announcements seen or written so
 far (the *_ann bits, extracted with CURRENT_ANN_MASK). The possible
 values are: */
63enum
64{
65 ASCII_set = 0,
fd1b5c0f
UD
66 GB2312_set = 8,
67 CNS11643_1_set = 16,
68 CNS11643_2_set = 24,
69 CURRENT_SEL_MASK = 24,
70 GB2312_ann = 32,
71 CNS11643_1_ann = 64,
72 CNS11643_2_ann = 128,
73 CURRENT_ANN_MASK = 224
15a2315c
UD
74};
75
76
77/* Since this is a stateful encoding we have to provide code which resets
78 the output state to the initial state. This has to be done during the
79 flushing. Nothing happens if the state already is ASCII_set. In the
 FROM direction only the stored state is cleared. In the other direction
 an SI byte is written first; if the output buffer has no room left,
 STATUS becomes __GCONV_FULL_OUTPUT and the state is left unchanged. */
80#define EMIT_SHIFT_TO_INIT \
aa831d6d 81 if (data->__statep->__count != ASCII_set) \
15a2315c
UD
82 { \
83 if (FROM_DIRECTION) \
84 /* It's easy, we don't have to emit anything, we just reset the \
85 state for the input. */ \
aa831d6d 86 data->__statep->__count = ASCII_set; \
15a2315c
UD
87 else \
88 { \
15a2315c
UD
89 /* We are not in the initial state. To switch back we have \
90 to emit `SI'. */ \
2373b30e 91 if (__builtin_expect (outbuf == outend, 0)) \
15a2315c
UD
92 /* We don't have enough room in the output buffer. */ \
93 status = __GCONV_FULL_OUTPUT; \
94 else \
95 { \
96 /* Write out the shift sequence. */ \
97 *outbuf++ = SI; \
aa831d6d 98 data->__statep->__count = ASCII_set; \
15a2315c
UD
99 } \
100 } \
101 }
102
103
104/* Since we might have to reset the input pointer we must be able to save
105 and restore the state. A nonzero SAVE copies *SETP into SAVE_SET, zero
 copies SAVE_SET back into *SETP. */
106#define SAVE_RESET_STATE(Save) \
107 if (Save) \
108 save_set = *setp; \
109 else \
110 *setp = save_set
111
112
113/* First define the conversion function from ISO-2022-CN to UCS4. */
faaa6f62
UD
114#define MIN_NEEDED_INPUT FROM_LOOP_MIN_NEEDED_FROM
115#define MAX_NEEDED_INPUT FROM_LOOP_MAX_NEEDED_FROM
116#define MIN_NEEDED_OUTPUT FROM_LOOP_MIN_NEEDED_TO
117#define MAX_NEEDED_OUTPUT FROM_LOOP_MAX_NEEDED_TO
15a2315c
UD
118#define LOOPFCT FROM_LOOP
119#define BODY \
120 { \
121 uint32_t ch = *inptr; \
122 \
123 /* This is a 7bit character set, disallow all 8bit characters. Note \
 that the test below rejects 0x7f as well. */ \
db2d05f9 124 if (__builtin_expect (ch >= 0x7f, 0)) \
e438a468 125 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
15a2315c
UD
126 \
127 /* Recognize escape sequences. */ \
89301d68 128 if (__builtin_expect (ch, 0) == ESC) \
15a2315c
UD
129 { \
130 /* There are two kinds of escape sequences we have to handle: \
131 - those announcing the use of GB and CNS characters on the \
132 line; the SO designation ('A' or 'G') is recorded in ANN and \
 the four bytes are skipped \
133 - the initial byte of the SS2 sequence. \
134 */ \
755104ed 135 if (__builtin_expect (inptr + 2 > inend, 0) \
55985355 136 || (inptr[1] == '$' \
755104ed 137 && (__builtin_expect (inptr + 3 > inend, 0) \
55985355 138 || (inptr[2] == ')' \
755104ed 139 && __builtin_expect (inptr + 4 > inend, 0)) \
55985355 140 || (inptr[2] == '*' \
755104ed 141 && __builtin_expect (inptr + 4 > inend, 0)))) \
55985355 142 || (inptr[1] == SS2_1 \
755104ed 143 && __builtin_expect (inptr + 4 > inend, 0))) \
15a2315c 144 { \
c7c3b0e9 145 result = __GCONV_INCOMPLETE_INPUT; \
15a2315c
UD
146 break; \
147 } \
148 if (inptr[1] == '$' \
149 && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
150 || (inptr[2] == '*' && inptr[3] == 'H'))) \
151 { \
152 /* OK, we accept those character sets. */ \
153 if (inptr[3] == 'A') \
154 ann = GB2312_ann; \
155 else if (inptr[3] == 'G') \
156 ann = CNS11643_1_ann; \
157 inptr += 4; \
158 continue; \
159 } \
160 } \
89301d68 161 else if (__builtin_expect (ch, 0) == SO) \
15a2315c
UD
162 { \
163 /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
164 SO designation came last. The only problem is what to do with \
165 faulty input files where no designator came. \
166 XXX For now I'll default to use GB2312. If this is not the \
167 best behaviour (e.g., we should flag an error) let me know. */ \
168 ++inptr; \
169 set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
170 continue; \
171 } \
89301d68 172 else if (__builtin_expect (ch, 0) == SI) \
15a2315c
UD
173 { \
174 /* Switch to use ASCII. */ \
175 ++inptr; \
176 set = ASCII_set; \
177 continue; \
178 } \
179 \
89301d68 180 if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \
15a2315c
UD
181 { \
182 /* This is a character from CNS 11643 plane 2. \
183 XXX We could test here whether the use of this character \
184 set was announced. */ \
185 inptr += 2; \
186 ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
89301d68 187 if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \
15a2315c 188 { \
e438a468
UD
189 inptr -= 2; \
190 STANDARD_FROM_LOOP_ERR_HANDLER (2); \
15a2315c
UD
191 } \
192 } \
193 else if (set == ASCII_set) \
194 { \
195 /* Almost done, just advance the input pointer. */ \
196 ++inptr; \
197 } \
198 else \
199 { \
200 /* That's pretty easy, we have dedicated functions for this. */ \
201 if (set == GB2312_set) \
55985355 202 ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
15a2315c
UD
203 else \
204 { \
205 assert (set == CNS11643_1_set); \
55985355 206 ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
15a2315c
UD
207 } \
208 \
55985355 209 if (__builtin_expect (ch, 1) == 0) \
15a2315c 210 { \
c7c3b0e9 211 result = __GCONV_INCOMPLETE_INPUT; \
15a2315c
UD
212 break; \
213 } \
89301d68 214 else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \
15a2315c 215 { \
e438a468 216 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
15a2315c
UD
217 } \
218 } \
219 \
77e1d15a
UD
220 put32 (outptr, ch); \
221 outptr += 4; \
15a2315c 222 }
55985355 223#define LOOP_NEED_FLAGS
15a2315c 224#define EXTRA_LOOP_DECLS , int *setp
fd1b5c0f
UD
225#define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
226 int ann = *setp & CURRENT_ANN_MASK
15a2315c
UD
227#define UPDATE_PARAMS *setp = set | ann
228#include <iconv/loop.c>
229
230
231/* Next, define the other direction. */
faaa6f62
UD
232#define MIN_NEEDED_INPUT TO_LOOP_MIN_NEEDED_FROM
233#define MAX_NEEDED_INPUT TO_LOOP_MAX_NEEDED_FROM
234#define MIN_NEEDED_OUTPUT TO_LOOP_MIN_NEEDED_TO
235#define MAX_NEEDED_OUTPUT TO_LOOP_MAX_NEEDED_TO
15a2315c
UD
236#define LOOPFCT TO_LOOP
237#define BODY \
238 { \
89301d68 239 uint32_t ch = get32 (inptr); \
15a2315c
UD
240 \
241 /* First see whether we can write the character using the currently \
242 selected character set. */ \
243 if (ch < 0x80) \
244 { \
245 if (set != ASCII_set) \
246 { \
247 *outptr++ = SI; \
248 set = ASCII_set; \
55985355 249 if (__builtin_expect (outptr == outend, 0)) \
15a2315c
UD
250 { \
251 result = __GCONV_FULL_OUTPUT; \
252 break; \
253 } \
254 } \
255 \
256 *outptr++ = ch; \
15a2315c
UD
257 \
258 /* At the end of the line we have to clear the `ann' flags since \
259 every line must contain this information again. */ \
260 if (ch == L'\n') \
261 ann = 0; \
262 } \
263 else \
264 { \
085a4412 265 unsigned char buf[2]; \
278bfa00
UD
266 /* Fake initialization to keep gcc quiet. */ \
267 asm ("" : "=m" (buf)); \
268 \
15a2315c 269 int used; \
89301d68 270 size_t written = 0; \
15a2315c
UD
271 \
272 if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
273 { \
274 written = ucs4_to_gb2312 (ch, buf, 2); \
275 used = GB2312_set; \
276 } \
277 else \
278 { \
279 written = ucs4_to_cns11643l1 (ch, buf, 2); \
280 used = CNS11643_1_set; \
281 } \
282 \
283 if (written == __UNKNOWN_10646_CHAR) \
284 { \
285 /* Cannot convert it using the currently selected SO set. \
286 Next try the SS2 set. */ \
287 written = ucs4_to_cns11643l2 (ch, buf, 2); \
288 if (written != __UNKNOWN_10646_CHAR) \
289 /* Yep, that worked. */ \
290 used = CNS11643_2_set; \
291 else \
292 { \
293 /* Well, see whether we have to change the SO set. */ \
755104ed 294 if (used == GB2312_set) \
15a2315c
UD
295 written = ucs4_to_cns11643l1 (ch, buf, 2); \
296 else \
297 written = ucs4_to_gb2312 (ch, buf, 2); \
298 \
1f20e04b 299 if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \
15a2315c 300 /* Oh well, then switch SO. */ \
755104ed 301 used = GB2312_set + CNS11643_1_set - used; \
15a2315c
UD
302 else \
303 { \
601d2942
UD
304 UNICODE_TAG_HANDLER (ch, 4); \
305 \
15a2315c 306 /* Even this does not work. Error. */ \
e438a468 307 STANDARD_TO_LOOP_ERR_HANDLER (4); \
15a2315c
UD
308 } \
309 } \
310 } \
311 assert (written == 2); \
312 \
313 /* See whether we have to emit an escape sequence. */ \
314 if (set != used) \
315 { \
316 /* First see whether we announced that we use this \
317 character set. */ \
755104ed 318 if ((ann & (16 << (used >> 3))) == 0) \
15a2315c
UD
319 { \
320 const char *escseq; \
321 \
55985355 322 if (__builtin_expect (outptr + 4 > outend, 0)) \
15a2315c
UD
323 { \
324 result = __GCONV_FULL_OUTPUT; \
325 break; \
326 } \
327 \
755104ed
UD
328 assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
329 escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \
330 *outptr++ = ESC; \
331 *outptr++ = '$'; \
15a2315c
UD
332 *outptr++ = *escseq++; \
333 *outptr++ = *escseq++; \
334 \
335 if (used == GB2312_set) \
336 ann = (ann & CNS11643_2_ann) | GB2312_ann; \
337 else if (used == CNS11643_1_set) \
338 ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
339 else \
340 ann |= CNS11643_2_ann; \
341 } \
342 \
343 if (used == CNS11643_2_set) \
344 { \
89301d68 345 if (__builtin_expect (outptr + 2 > outend, 0)) \
15a2315c
UD
346 { \
347 result = __GCONV_FULL_OUTPUT; \
348 break; \
349 } \
350 *outptr++ = SS2_0; \
351 *outptr++ = SS2_1; \
352 } \
353 else \
354 { \
355 /* We only have to emit something if ASCII is currently \
356 selected. Otherwise we are switching within the \
357 SO charset. */ \
358 if (set == ASCII_set) \
359 { \
89301d68 360 if (__builtin_expect (outptr + 1 > outend, 0)) \
15a2315c
UD
361 { \
362 result = __GCONV_FULL_OUTPUT; \
363 break; \
364 } \
365 *outptr++ = SO; \
366 } \
367 } \
368 \
369 /* Always test the length here since we have used up all the \
370 guaranteed output buffer slots. \
 NOTE(review): when this test fails after SS2 or SO was \
 already written above, SET is left unchanged and the input \
 is not consumed, so the retry looks like it emits that \
 shift sequence a second time -- confirm. */ \
89301d68 371 if (__builtin_expect (outptr + 2 > outend, 0)) \
15a2315c
UD
372 { \
373 result = __GCONV_FULL_OUTPUT; \
374 break; \
375 } \
376 } \
55985355 377 else if (__builtin_expect (outptr + 2 > outend, 0)) \
15a2315c
UD
378 { \
379 result = __GCONV_FULL_OUTPUT; \
380 break; \
381 } \
382 \
383 *outptr++ = buf[0]; \
384 *outptr++ = buf[1]; \
755104ed 385 set = used; \
15a2315c
UD
386 } \
387 \
388 /* Now that we wrote the output increment the input pointer. */ \
389 inptr += 4; \
390 }
55985355 391#define LOOP_NEED_FLAGS
15a2315c 392#define EXTRA_LOOP_DECLS , int *setp
fd1b5c0f
UD
393#define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
394 int ann = *setp & CURRENT_ANN_MASK
4b1b449d
UD
395#define REINIT_PARAMS do \
396 { \
397 set = *setp & CURRENT_SEL_MASK; \
398 ann = *setp & CURRENT_ANN_MASK; \
399 } \
400 while (0)
15a2315c
UD
401#define UPDATE_PARAMS *setp = set | ann
402#include <iconv/loop.c>
403
404
405/* Now define the toplevel functions. */
406#include <iconv/skeleton.c>