]> git.ipfire.org Git - thirdparty/glibc.git/blame - iconvdata/johab.c
(UNICODE_TAG_HANDLER): New macro.
[thirdparty/glibc.git] / iconvdata / johab.c
CommitLineData
/* Mapping tables for JOHAB handling.
   Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jungshik Shin <jshin@pantheon.yale.edu>
   and Ulrich Drepper <drepper@cygnus.com>, 1998.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

55985355 22#include <dlfcn.h>
2aea1d79 23#include <stdint.h>
a44d2393
UD
24#include <ksc5601.h>
25
/* Tables mapping JOHAB bit patterns to Hangul Jamo indices.

   A two-byte Hangul code carries three 5-bit fields which encode the
   leading consonant (19 + 1 filler), the medial vowel (21 + 1 filler)
   and the trailing consonant (27 + 1 filler).

   cf. KS C 5601-1992 Annex 3 Table 2

   Entry values:  0 : filler,  -1 : invalid,  >= 1 : valid jamo index.  */

/* Leading consonant, indexed by the 5-bit field (code & 0x7c00) >> 10.
   All 32 slots are spelled out so that no slot is implicitly
   zero-initialized and thereby mistaken for the filler.  */
static const int init[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial vowel, indexed by the 5-bit field (code & 0x03e0) >> 5.
   0 is the filler, 1..21 are the vowels and -1 marks a bit pattern
   that is not assigned.  */
static const int mid[32] =
{
  -1, -1,  0,  1,  2,  3,  4,  5,
  -1, -1,  6,  7,  8,  9, 10, 11,
  -1, -1, 12, 13, 14, 15, 16, 17,
  -1, -1, 18, 19, 20, 21, -1, -1
};
/* Trailing consonant, indexed by the 5-bit field (code & 0x001f).
   0 is the filler, 1..27 are the consonants and -1 marks a bit pattern
   that is not assigned (note the hole at pattern 0x12).  */
static const int final[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, -1, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, -1, -1
};

/*
  Hangul Jamo in Johab to Unicode 2.0 : Unicode 2.0 defines 51 Hangul
  Compatibility Jamos in the block [0x3131,0x3163] (the consonants
  occupy [0x3131,0x314e], the vowels [0x314f,0x3163]).

  It's to be considered later which Jamo block to use, the Compatibility
  block [0x3131,0x3163] or the Hangul Conjoining Jamo block [0x1100,0x11ff].

 */

/* UCS4 compatibility jamo for each of the 19 leading consonants;
   indexed by the leading consonant index minus one.  */
static const uint32_t init_to_ucs[19] =
{
  0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
  0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
  0x314c, 0x314d, 0x314e
};

/* UCS4 compatibility jamo for a stand-alone trailing consonant; indexed
   by the trailing consonant index minus one.  Only the consonant
   clusters have an entry here.  A zero entry means "no mapping"; the
   JOHAB -> UCS4 loop rejects a resulting character value of zero as
   illegal input.  */
static const uint32_t final_to_ucs[31] =
{
  0,      0,      0x3133, 0,      0x3135, 0x3136, 0,      0,
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, 0,      0,      0x3144, 0,      0,      0,      0,
  0,      0,      0,      0,      0,      0,      0,      0,      0
};

/* The following three arrays are used to convert
   precomposed Hangul syllables in [0xac00,0xd7a3]
   to Jamo bit patterns for Johab encoding

   cf. : KS C 5601-1992, Annex3 Table 2

   Arrays are used to speed up things although it's possible
   to get the same result arithmetically.

 */

/* Leading consonant bits (including the always-set top bit 0x8000),
   indexed by the zero-based leading consonant number.  */
static const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
  0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
  0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
  0xd000
};

/* Medial vowel bits (already shifted into bits 5..9), indexed by the
   zero-based vowel number.  */
static const int mid_to_bit[21] =
{
  0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  0x0340, 0x0360, 0x0380, 0x03a0
};

/* Trailing consonant bits, indexed by the trailing consonant number
   where index 0 means "no trailing consonant" (filler pattern 1).
   Pattern 0x12 is skipped.  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
  0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};

/* The conversion table from
   UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
   to Johab

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual

   The table is indexed by (UCS4 value - 0x3131) and therefore has
   0x3163 - 0x3131 + 1 = 51 entries: 30 consonants followed by
   21 vowels.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* Consonants, U+3131 .. U+314E.  */
  0x8841, 0x8c41,
  0x8444,
  0x9041,
  0x8446, 0x8447,
  0x9441, 0x9841, 0x9c41,
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  0xa041, 0xa441, 0xa841,
  0x8454,
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  /* Vowels, U+314F .. U+3163.  */
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};


8619129f
UD
135static inline uint32_t
136johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2)
a44d2393
UD
137{
138 if (idx <= 0xdefe)
8619129f
UD
139 return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
140 - (c2 > 0x90 ? 0x43 : 0x31)];
a44d2393 141 else
8619129f
UD
142 return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
143 - (c2 > 0x90 ? 0x43 : 0x31)];
a44d2393 144}

/* Definitions used in the body of the `gconv' function.  */
#define CHARSET_NAME		"JOHAB//"
#define FROM_LOOP		from_johab
#define TO_LOOP			to_johab
#define DEFINE_INIT		1
#define DEFINE_FINI		1
/* A JOHAB character takes one or two bytes ...  */
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		2
/* ... while the other side always uses four bytes per character.  */
#define MIN_NEEDED_TO		4


156/* First define the conversion function from JOHAB to UCS4. */
157#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
158#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
159#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
160#define LOOPFCT FROM_LOOP
161#define BODY \
162 { \
163 uint32_t ch = *inptr; \
164 \
079e46f0
UD
165 if (ch <= 0x7f) \
166 { \
167 /* Plain ISO646-KR. */ \
168 if (ch == 0x5c) \
169 ch = 0x20a9; /* half-width Korean Currency WON sign */ \
170 ++inptr; \
171 } \
8619129f
UD
172 /* Johab : 1. Hangul \
173 1st byte : 0x84-0xd3 \
174 2nd byte : 0x41-0x7e, 0x81-0xfe \
175 2. Hanja & Symbol : \
176 1st byte : 0xd8-0xde, 0xe0-0xf9 \
177 2nd byte : 0x31-0x7e, 0x91-0xfe \
178 0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */ \
179 else \
180 { \
019357d2
UD
181 if (__builtin_expect (ch, 0) > 0xf9 \
182 || __builtin_expect (ch, 0) == 0xdf \
183 || (__builtin_expect (ch, 0) > 0x7e && ch < 0x84) \
184 || (__builtin_expect (ch, 0) > 0xd3 && ch < 0xd9)) \
8619129f
UD
185 { \
186 /* These are illegal. */ \
85830c4c
UD
187 if (! ignore_errors_p ()) \
188 { \
189 /* This is an illegal character. */ \
190 result = __GCONV_ILLEGAL_INPUT; \
191 break; \
192 } \
193 \
194 ++inptr; \
38677ace 195 ++*irreversible; \
85830c4c 196 continue; \
8619129f
UD
197 } \
198 else \
199 { \
200 /* Two-byte character. First test whether the next \
201 character is also available. */ \
202 uint32_t ch2; \
203 uint_fast32_t idx; \
204 \
55985355 205 if (__builtin_expect (inptr + 1 >= inend, 0)) \
8619129f
UD
206 { \
207 /* The second character is not available. Store the \
208 intermediate result. */ \
d64b6ad0 209 result = __GCONV_INCOMPLETE_INPUT; \
8619129f
UD
210 break; \
211 } \
212 \
213 ch2 = inptr[1]; \
214 idx = ch * 256 + ch2; \
019357d2 215 if (__builtin_expect (ch, 0) <= 0xd3) \
8619129f
UD
216 { \
217 /* Hangul */ \
218 uint_fast32_t i, m, f; \
219 \
220 i = init[(idx & 0x7c00) >> 10]; \
221 m = mid[(idx & 0x03e0) >> 5]; \
222 f = final[idx & 0x001f]; \
223 \
019357d2
UD
224 if (__builtin_expect (i, 0) == -1 \
225 || __builtin_expect (m, 0) == -1 \
226 || __builtin_expect (f, 0) == -1) \
8619129f
UD
227 { \
228 /* This is illegal. */ \
85830c4c
UD
229 if (! ignore_errors_p ()) \
230 { \
231 /* This is an illegal character. */ \
232 result = __GCONV_ILLEGAL_INPUT; \
233 break; \
234 } \
235 \
236 ++inptr; \
38677ace 237 ++*irreversible; \
85830c4c 238 continue; \
8619129f
UD
239 } \
240 else if (i > 0 && m > 0) \
241 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
6796bc80 242 else if (i > 0 && m == 0 && f == 0) \
8619129f 243 ch = init_to_ucs[i - 1]; \
6796bc80 244 else if (i == 0 && m > 0 && f == 0) \
8619129f 245 ch = 0x314e + m; /* 0x314f + m - 1 */ \
019357d2
UD
246 else if (__builtin_expect (i | m, 0) == 0 \
247 && __builtin_expect (f, 1) > 0) \
8619129f
UD
248 ch = final_to_ucs[f - 1]; /* round trip?? */ \
249 else \
250 { \
251 /* This is illegal. */ \
85830c4c
UD
252 if (! ignore_errors_p ()) \
253 { \
254 /* This is an illegal character. */ \
255 result = __GCONV_ILLEGAL_INPUT; \
256 break; \
257 } \
258 \
259 ++inptr; \
38677ace 260 ++*irreversible; \
85830c4c 261 continue; \
8619129f
UD
262 } \
263 } \
264 else \
265 { \
019357d2
UD
266 if (__builtin_expect (ch2, 0x31) < 0x31 \
267 || (__builtin_expect (ch2, 0x7e) > 0x7e && ch2 < 0x91) \
268 || __builtin_expect (ch2, 0) == 0xff \
079e46f0 269 || (__builtin_expect (ch, 0) == 0xd9 && ch2 > 0xe5) \
019357d2 270 || (__builtin_expect (ch, 0) == 0xda \
079e46f0
UD
271 && ch2 > 0xa0 && ch2 < 0xd4) \
272 || (__builtin_expect (ch, 0) == 0xde && ch2 > 0xf1)) \
8619129f
UD
273 { \
274 /* This is illegal. */ \
85830c4c
UD
275 if (! ignore_errors_p ()) \
276 { \
277 /* This is an illegal character. */ \
278 result = __GCONV_ILLEGAL_INPUT; \
279 break; \
280 } \
281 \
282 ++inptr; \
38677ace 283 ++*irreversible; \
85830c4c 284 continue; \
8619129f 285 } \
8619129f
UD
286 else \
287 { \
288 ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
289 /* if (idx <= 0xdefe) \
290 ch = __ksc5601_sym_to_ucs[(ch - 0xd9) * 192 \
291 + ch2 - (ch2 > 0x90 \
292 ? 0x43 : 0x31)]; \
293 else \
294 ch = __ksc5601_hanja_to_ucs[(ch - 0xe0) *192 \
295 + ch2 - (ch2 > 0x90 \
296 ?0x43 : 0x31)];\
297 */ \
298 } \
299 } \
300 } \
301 \
019357d2 302 if (__builtin_expect (ch, 1) == 0) \
8619129f
UD
303 { \
304 /* This is an illegal character. */ \
85830c4c
UD
305 if (! ignore_errors_p ()) \
306 { \
307 /* This is an illegal character. */ \
308 result = __GCONV_ILLEGAL_INPUT; \
309 break; \
310 } \
311 \
312 inptr += 2; \
38677ace 313 ++*irreversible; \
85830c4c 314 continue; \
8619129f
UD
315 } \
316 \
317 inptr += 2; \
318 } \
319 \
77e1d15a
UD
320 put32 (outptr, ch); \
321 outptr += 4; \
8619129f 322 }
55985355 323#define LOOP_NEED_FLAGS
8619129f
UD
324#include <iconv/loop.c>
325
326
327/* Next, define the other direction. */
328#define MIN_NEEDED_INPUT MIN_NEEDED_TO
329#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
330#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
331#define LOOPFCT TO_LOOP
332#define BODY \
333 { \
77e1d15a 334 uint32_t ch = get32 (inptr); \
8619129f
UD
335 /* \
336 if (ch >= (sizeof (from_ucs4_lat1) / sizeof (from_ucs4_lat1[0]))) \
337 { \
338 if (ch >= 0x0391 && ch <= 0x0451) \
339 cp = from_ucs4_greek[ch - 0x391]; \
340 else if (ch >= 0x2010 && ch <= 0x9fa0) \
341 cp = from_ucs4_cjk[ch - 0x02010]; \
342 else \
343 break; \
344 } \
345 else \
346 cp = from_ucs4_lat1[ch]; \
347 */ \
8619129f 348 \
079e46f0 349 if (ch <= 0x7f && ch != 0x5c) \
bc900b11
UD
350 *outptr++ = ch; \
351 else \
8619129f 352 { \
bc900b11
UD
353 if (ch >= 0xac00 && ch <= 0xd7a3) \
354 { \
55985355 355 if (__builtin_expect (outptr + 2 > outend, 0)) \
bc900b11 356 { \
d64b6ad0 357 result = __GCONV_FULL_OUTPUT; \
bc900b11
UD
358 break; \
359 } \
360 \
c63598bf
UD
361 ch -= 0xac00; \
362 \
363 ch = (init_to_bit[ch / 588] /* 21 * 28 = 588 */ \
364 + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */ \
365 + final_to_bit[ch % 28]); /* (ch % (21 * 28)) % 28 */ \
366 \
bc900b11
UD
367 *outptr++ = ch / 256; \
368 *outptr++ = ch % 256; \
369 } \
370 /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164) \
371 as symbol */ \
372 else if (ch >= 0x3131 && ch <= 0x3163) \
8619129f 373 { \
bc900b11
UD
374 ch = jamo_from_ucs_table[ch - 0x3131]; \
375 \
55985355 376 if (__builtin_expect (outptr + 2 > outend, 0)) \
bc900b11 377 { \
d64b6ad0 378 result = __GCONV_FULL_OUTPUT; \
bc900b11
UD
379 break; \
380 } \
381 \
382 *outptr++ = ch / 256; \
383 *outptr++ = ch % 256; \
384 } \
c63598bf
UD
385 else if ((ch >= 0x4e00 && ch <= 0x9fa5) \
386 || (ch >= 0xf900 && ch <= 0xfa0b)) \
bc900b11
UD
387 { \
388 size_t written; \
c63598bf 389 uint32_t temp; \
bc900b11 390 \
55985355
UD
391 written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr); \
392 if (__builtin_expect (written, 1) == 0) \
bc900b11 393 { \
d64b6ad0 394 result = __GCONV_FULL_OUTPUT; \
bc900b11
UD
395 break; \
396 } \
019357d2 397 if (__builtin_expect (written, 0) == __UNKNOWN_10646_CHAR) \
bc900b11 398 { \
d6204268 399 STANDARD_ERR_HANDLER (4); \
bc900b11
UD
400 } \
401 \
402 outptr[0] -= 0x4a; \
c63598bf
UD
403 outptr[1] -= 0x21; \
404 \
405 temp = outptr[0] * 94 + outptr[1]; \
bc900b11 406 \
c63598bf
UD
407 outptr[0] = 0xe0 + temp / 188; \
408 outptr[1] = temp % 188; \
409 outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31; \
bc900b11
UD
410 \
411 outptr += 2; \
412 } \
079e46f0
UD
413 else if (ch == 0x20a9) \
414 *outptr++ = 0x5c; \
bc900b11
UD
415 else \
416 { \
417 size_t written; \
5134584a 418 uint32_t temp; \
bc900b11 419 \
55985355
UD
420 written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr); \
421 if (__builtin_expect (written, 1) == 0) \
bc900b11 422 { \
d64b6ad0 423 result = __GCONV_FULL_OUTPUT; \
bc900b11
UD
424 break; \
425 } \
079e46f0
UD
426 if (__builtin_expect (written, 1) == __UNKNOWN_10646_CHAR \
427 || (outptr[0] == 0x22 && outptr[1] > 0x65)) \
bc900b11 428 { \
d6204268 429 STANDARD_ERR_HANDLER (4); \
bc900b11
UD
430 } \
431 \
5134584a
UD
432 temp = (outptr[0] < 0x4a ? outptr[0] + 0x191 : outptr[0] + 0x176);\
433 outptr[1] += (temp % 2 ? 0x5e : 0); \
434 outptr[1] += (outptr[1] < 0x6f ? 0x10 : 0x22); \
435 outptr[0] = temp / 2; \
bc900b11
UD
436 \
437 outptr += 2; \
8619129f 438 } \
8619129f
UD
439 } \
440 \
441 inptr += 4; \
442 }
55985355 443#define LOOP_NEED_FLAGS
8619129f
UD
444#include <iconv/loop.c>
445
446
447/* Now define the toplevel functions. */
448#include <iconv/skeleton.c>