]> git.ipfire.org Git - thirdparty/glibc.git/blame - iconvdata/iso-2022-cn.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / iconvdata / iso-2022-cn.c
CommitLineData
/* Conversion module for ISO-2022-CN.
   Copyright (C) 1999-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

55985355 20#include <dlfcn.h>
15a2315c
UD
21#include <gconv.h>
22#include <stdint.h>
23#include <string.h>
24#include "gb2312.h"
25#include "cns11643l1.h"
26#include "cns11643l2.h"
27
28#include <assert.h>
29
/* This makes obvious what everybody knows: 0x1b is the Esc character.  */
#define ESC	0x1b

/* We have the single-byte shift-in (SI) and shift-out (SO) controls, and
   the two-byte single shift sequence SS2 (ESC followed by 0x4e, i.e.
   `ESC N'), which selects the SS2-designated set for the next two bytes
   only.  */
#define SI	0x0f
#define SO	0x0e
#define SS2_0	ESC
#define SS2_1	0x4e
40
/* Definitions used in the body of the `gconv' function.  */
#define CHARSET_NAME		"ISO-2022-CN//"
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define FROM_LOOP		from_iso2022cn_loop
#define TO_LOOP			to_iso2022cn_loop
#define ONE_DIRECTION		0

/* Byte counts per conversion step.  The ISO-2022-CN side is consumed in
   units of 1 to 4 bytes (the decoder looks at up to four bytes for a
   designation or an SS2-prefixed character); the UCS4 side is always
   4 bytes.  */
#define FROM_LOOP_MIN_NEEDED_FROM	1
#define FROM_LOOP_MAX_NEEDED_FROM	4
#define FROM_LOOP_MIN_NEEDED_TO		4
#define FROM_LOOP_MAX_NEEDED_TO		4
#define TO_LOOP_MIN_NEEDED_FROM		4
#define TO_LOOP_MAX_NEEDED_FROM		4
#define TO_LOOP_MIN_NEEDED_TO		1
#define TO_LOOP_MAX_NEEDED_TO		6

/* Locals set up before the conversion loops run: SAVE_SET is the slot
   used by SAVE_RESET_STATE, SETP points at the COUNT element of the
   conversion state and is handed to both loop functions.  */
#define PREPARE_LOOP \
  int save_set; \
  int *setp = &data->__statep->__count;
#define EXTRA_LOOP_ARGS		, setp
60
61
/* The COUNT element of the state keeps track of the currently selected
   character set and of the designations announced so far.  The
   `*_set' values occupy the bits covered by CURRENT_SEL_MASK, the
   `*_ann' flags the bits covered by CURRENT_ANN_MASK.  The values are
   laid out so that the announcement flag of a set equals
   16 << (set >> 3).  */
enum
{
  ASCII_set = 0,
  GB2312_set = 8,
  CNS11643_1_set = 16,
  CNS11643_2_set = 24,
  CURRENT_SEL_MASK = 24,
  GB2312_ann = 32,
  CNS11643_1_ann = 64,
  CNS11643_2_ann = 128,
  CURRENT_ANN_MASK = 224
};
76
77
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   Uses DATA, OUTBUF, OUTEND and STATUS from the expansion context.  When
   decoding (FROM_DIRECTION) only the recorded state is cleared.  When
   encoding, a single SI byte is written and the state cleared; if there
   is no room for that byte STATUS becomes __GCONV_FULL_OUTPUT and the
   state is left untouched so the flush can be retried.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != ASCII_set) \
    { \
      if (FROM_DIRECTION) \
        /* It's easy, we don't have to emit anything, we just reset the \
           state for the input.  */ \
        data->__statep->__count = ASCII_set; \
      else \
        { \
          /* We are not in the initial state.  To switch back we have \
             to emit `SI'.  */ \
          if (__glibc_unlikely (outbuf == outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = SI; \
              data->__statep->__count = ASCII_set; \
            } \
        } \
    }
103
104
/* Since we might have to reset the input pointer we must be able to save
   and restore the state.  With a nonzero SAVE the current value of *SETP
   is stored in SAVE_SET; with a zero SAVE it is written back.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    save_set = *setp; \
  else \
    *setp = save_set
112
113
/* First define the conversion function from ISO-2022-CN to UCS4.

   SET holds the currently selected set (ASCII after SI, otherwise the SO
   set), ANN the last SO designation seen.  Each iteration either consumes
   a control/escape sequence and continues, or stores exactly one UCS4
   character.  */
#define MIN_NEEDED_INPUT	FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    \
    /* This is a 7bit character set, disallow all 8bit characters.  Note \
       that the test rejects 0x7f itself as well.  */ \
    if (__glibc_unlikely (ch >= 0x7f)) \
      STANDARD_FROM_LOOP_ERR_HANDLER (1); \
    \
    /* Recognize escape sequences.  */ \
    if (__builtin_expect (ch, 0) == ESC) \
      { \
        /* There are two kinds of escape sequences we have to handle: \
           - those announcing the use of GB and CNS characters on the \
             line; the SO designation is remembered in ANN \
           - the initial byte of the SS2 sequence. \
           First make sure the whole sequence is available.  */ \
        if (__builtin_expect (inptr + 2 > inend, 0) \
            || (inptr[1] == '$' \
                && (__builtin_expect (inptr + 3 > inend, 0) \
                    || (inptr[2] == ')' \
                        && __builtin_expect (inptr + 4 > inend, 0)) \
                    || (inptr[2] == '*' \
                        && __builtin_expect (inptr + 4 > inend, 0)))) \
            || (inptr[1] == SS2_1 \
                && __builtin_expect (inptr + 4 > inend, 0))) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        if (inptr[1] == '$' \
            && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
                || (inptr[2] == '*' && inptr[3] == 'H'))) \
          { \
            /* OK, we accept those character sets.  The `*H' designation \
               is accepted but leaves ANN unchanged.  */ \
            if (inptr[3] == 'A') \
              ann = GB2312_ann; \
            else if (inptr[3] == 'G') \
              ann = CNS11643_1_ann; \
            inptr += 4; \
            continue; \
          } \
      } \
    else if (__builtin_expect (ch, 0) == SO) \
      { \
        /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
           SO designation came last.  The only problem is what to do with \
           faulty input files where no designator came. \
           XXX For now I'll default to use GB2312.  If this is not the \
           best behaviour (e.g., we should flag an error) let me know.  */ \
        ++inptr; \
        set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
        continue; \
      } \
    else if (__builtin_expect (ch, 0) == SI) \
      { \
        /* Switch to use ASCII.  */ \
        ++inptr; \
        set = ASCII_set; \
        continue; \
      } \
    \
    if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \
      { \
        /* This is a character from CNS 11643 plane 2. \
           XXX We could test here whether the use of this character \
           set was announced.  */ \
        inptr += 2; \
        ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
        if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \
          { \
            /* Step back onto the SS2 sequence before reporting.  */ \
            inptr -= 2; \
            STANDARD_FROM_LOOP_ERR_HANDLER (2); \
          } \
      } \
    else if (set == ASCII_set) \
      { \
        /* Almost done, just advance the input pointer.  */ \
        ++inptr; \
      } \
    else \
      { \
        /* That's pretty easy, we have dedicated functions for this.  */ \
        if (set == GB2312_set) \
          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
        else \
          { \
            assert (set == CNS11643_1_set); \
            ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
          } \
        \
        if (__builtin_expect (ch, 1) == 0) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \
          { \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    \
    put32 (outptr, ch); \
    outptr += 4; \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, int *setp
#define INIT_PARAMS		int set = *setp & CURRENT_SEL_MASK; \
				int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS		*setp = set | ann
229#include <iconv/loop.c>
230
231
/* Next, define the other direction (UCS4 to ISO-2022-CN).

   SET tracks what the output stream currently has selected through
   SI/SO (ASCII_set, GB2312_set or CNS11643_1_set); ANN tracks which
   designations were already announced on the current line.  CNS 11643
   plane 2 is reached through the single shift SS2, which only affects
   the next two bytes: it therefore has to be written in front of every
   plane 2 character and must not change SET.  */
#define MIN_NEEDED_INPUT	TO_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	TO_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	TO_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	TO_LOOP_MAX_NEEDED_TO
#define LOOPFCT			TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    \
    /* First see whether we can write the character using the currently \
       selected character set.  */ \
    if (ch < 0x80) \
      { \
        if (set != ASCII_set) \
          { \
            /* One output byte is guaranteed, so SI always fits.  If the \
               character itself does not, SI stays written and SET \
               already reflects it.  */ \
            *outptr++ = SI; \
            set = ASCII_set; \
            if (__glibc_unlikely (outptr == outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
        \
        *outptr++ = ch; \
        \
        /* At the end of the line we have to clear the `ann' flags since \
           every line must contain this information again.  */ \
        if (ch == L'\n') \
          ann = 0; \
      } \
    else \
      { \
        unsigned char buf[2]; \
        /* Fake initialization to keep gcc quiet.  */ \
        asm ("" : "=m" (buf)); \
        \
        int used; \
        size_t written = 0; \
        \
        /* Try the SO set which is (or would by default be) in use.  */ \
        if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
          { \
            written = ucs4_to_gb2312 (ch, buf, 2); \
            used = GB2312_set; \
          } \
        else \
          { \
            written = ucs4_to_cns11643l1 (ch, buf, 2); \
            used = CNS11643_1_set; \
          } \
        \
        if (written == __UNKNOWN_10646_CHAR) \
          { \
            /* Cannot convert it using the currently selected SO set. \
               Next try the SS2 set.  */ \
            written = ucs4_to_cns11643l2 (ch, buf, 2); \
            if (written != __UNKNOWN_10646_CHAR) \
              /* Yep, that worked.  */ \
              used = CNS11643_2_set; \
            else \
              { \
                /* Well, see whether we have to change the SO set.  */ \
                if (used == GB2312_set) \
                  written = ucs4_to_cns11643l1 (ch, buf, 2); \
                else \
                  written = ucs4_to_gb2312 (ch, buf, 2); \
                \
                if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \
                  /* Oh well, then switch SO.  */ \
                  used = GB2312_set + CNS11643_1_set - used; \
                else \
                  { \
                    UNICODE_TAG_HANDLER (ch, 4); \
                    \
                    /* Even this does not work.  Error.  */ \
                    STANDARD_TO_LOOP_ERR_HANDLER (4); \
                  } \
              } \
          } \
        assert (written == 2); \
        \
        /* See whether we have to emit an escape sequence.  A plane 2 \
           character always needs its SS2 prefix, even when the previous \
           character came from plane 2 as well.  */ \
        if (used == CNS11643_2_set || set != used) \
          { \
            /* First see whether we announced that we use this \
               character set.  */ \
            if ((ann & (16 << (used >> 3))) == 0) \
              { \
                const char *escseq; \
                \
                if (__glibc_unlikely (outptr + 4 > outend)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                \
                assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
                escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \
                *outptr++ = ESC; \
                *outptr++ = '$'; \
                *outptr++ = *escseq++; \
                *outptr++ = *escseq++; \
                \
                /* GB2312 and CNS plane 1 share the SO designation, so \
                   announcing one replaces the other.  */ \
                if (used == GB2312_set) \
                  ann = (ann & CNS11643_2_ann) | GB2312_ann; \
                else if (used == CNS11643_1_set) \
                  ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
                else \
                  ann |= CNS11643_2_ann; \
              } \
            \
            /* From here on check for the prefix together with the two \
               bytes of the character.  Otherwise a full output buffer \
               would leave a dangling SS2 behind which is emitted a \
               second time when the character is retried.  */ \
            if (used == CNS11643_2_set) \
              { \
                if (__glibc_unlikely (outptr + 4 > outend)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SS2_0; \
                *outptr++ = SS2_1; \
              } \
            else if (set == ASCII_set) \
              { \
                /* We only have to emit something if currently ASCII is \
                   selected.  Otherwise we are switching within the \
                   SO charset.  */ \
                if (__glibc_unlikely (outptr + 3 > outend)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SO; \
              } \
            else if (__glibc_unlikely (outptr + 2 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
        else if (__glibc_unlikely (outptr + 2 > outend)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        *outptr++ = buf[0]; \
        *outptr++ = buf[1]; \
        \
        /* SS2 is a single shift and leaves the SI/SO selection of the \
           output stream untouched, so only record real SO sets.  */ \
        if (used != CNS11643_2_set) \
          set = used; \
      } \
    \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, int *setp
#define INIT_PARAMS		int set = *setp & CURRENT_SEL_MASK; \
				int ann = *setp & CURRENT_ANN_MASK
#define REINIT_PARAMS		do \
				  { \
				    set = *setp & CURRENT_SEL_MASK; \
				    ann = *setp & CURRENT_ANN_MASK; \
				  } \
				while (0)
#define UPDATE_PARAMS		*setp = set | ann
403#include <iconv/loop.c>
404
405
406/* Now define the toplevel functions. */
407#include <iconv/skeleton.c>