]>
Commit | Line | Data |
---|---|---|
a44d2393 UD |
1 | /* Mapping tables for JOHAB handling. |
2 | Copyright (C) 1998 Free Software Foundation, Inc. | |
3 | This file is part of the GNU C Library. | |
8619129f UD |
4 | Contributed by Jungshik Shin <jshin@pantheon.yale.edu> |
5 | and Ulrich Drepper <drepper@cygnus.com>, 1998. | |
a44d2393 UD |
6 | |
7 | The GNU C Library is free software; you can redistribute it and/or | |
8 | modify it under the terms of the GNU Library General Public License as | |
9 | published by the Free Software Foundation; either version 2 of the | |
10 | License, or (at your option) any later version. | |
11 | ||
12 | The GNU C Library is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | Library General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU Library General Public | |
18 | License along with the GNU C Library; see the file COPYING.LIB. If not, | |
19 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
20 | Boston, MA 02111-1307, USA. */ | |
21 | ||
2aea1d79 | 22 | #include <stdint.h> |
a44d2393 UD |
23 | #include <ksc5601.h> |
24 | ||
a44d2393 UD |
25 | /* The table for Bit pattern to Hangul Jamo |
26 | 5 bits each are used to encode | |
27 | leading consonants(19 + 1 filler), medial vowels(21 + 1 filler) | |
28 | and trailing consonants(27 + 1 filler). | |
29 | ||
30 | KS C 5601-1992 Annex 3 Table 2 | |
31 | 0 : Filler, -1: invalid, >= 1 : valid | |
32 | ||
33 | */ | |
/* Maps the 5-bit initial-consonant field of a Johab Hangul code to a
   consonant number: 0 is the filler, 1..19 are the consonants in the
   order used by init_to_ucs and init_to_bit, and -1 marks an invalid
   bit pattern.  All 32 slots are written out explicitly so that no
   unused pattern silently defaults to 0 (filler).  */
static const int init[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1
};
/* Maps the 5-bit medial-vowel field of a Johab Hangul code to a vowel
   number: 0 is the filler, 1..21 are the vowels, -1 marks an invalid
   bit pattern.  The first two patterns of every group of eight are
   unused; the leading comment on each row is the pattern of its first
   entry.  */
static const int mid[32] =
{
  /* 0x00 */ -1, -1,
  /* 0x02 */  0,  1,  2,  3,  4,  5,
  /* 0x08 */ -1, -1,
  /* 0x0a */  6,  7,  8,  9, 10, 11,
  /* 0x10 */ -1, -1,
  /* 0x12 */ 12, 13, 14, 15, 16, 17,
  /* 0x18 */ -1, -1,
  /* 0x1a */ 18, 19, 20, 21,
  /* 0x1e */ -1, -1
};
/* Maps the 5-bit trailing-consonant field of a Johab Hangul code to a
   consonant number: 0 is the filler, 1..27 are the trailing
   consonants, -1 marks an invalid bit pattern.  The leading comment on
   each row is the pattern of its first entry.  */
static const int final[32] =
{
  /* 0x00 */ -1,
  /* 0x01 */  0,
  /* 0x02 */  1,  2,  3,  4,  5,  6,  7,  8,
  /* 0x0a */  9, 10, 11, 12, 13, 14, 15, 16,
  /* 0x12 */ -1,
  /* 0x13 */ 17, 18, 19, 20, 21, 22, 23, 24,
  /* 0x1b */ 25, 26, 27,
  /* 0x1e */ -1, -1
};
51 | ||
/* Hangul Jamo in Johab to Unicode 2.0: Unicode 2.0 defines 51 Hangul
   Compatibility Jamo in the block [0x3131,0x3163] (30 consonants in
   [0x3131,0x314e] followed by 21 vowels in [0x314f,0x3163]).

   It is still to be decided which Jamo block to use: the Compatibility
   block [0x3131,0x3163] or the Hangul Conjoining Jamo block
   [0x1100,0x11ff].  */
/* UCS value of each of the 19 initial consonants when it stands alone,
   indexed by consonant number minus one.  The values are Hangul
   Compatibility Jamo.  */
static const uint32_t init_to_ucs[19] =
{
  0x3131,	/* KIYEOK */
  0x3132,	/* SSANGKIYEOK */
  0x3134,	/* NIEUN */
  0x3137,	/* TIKEUT */
  0x3138,	/* SSANGTIKEUT */
  0x3139,	/* RIEUL */
  0x3141,	/* MIEUM */
  0x3142,	/* PIEUP */
  0x3143,	/* SSANGPIEUP */
  0x3145,	/* SIOS */
  0x3146,	/* SSANGSIOS */
  0x3147,	/* IEUNG */
  0x3148,	/* CIEUC */
  0x3149,	/* SSANGCIEUC */
  0x314a,	/* CHIEUCH */
  0x314b,	/* KHIEUKH */
  0x314c,	/* THIEUTH */
  0x314d,	/* PHIEUPH */
  0x314e	/* HIEUH */
};
66 | ||
/* UCS value of a trailing consonant that stands alone, indexed by
   trailing-consonant number minus one.  Only the consonant clusters
   that cannot also be written as an initial consonant have an entry;
   every other slot is 0, which the caller treats as "no mapping".

   Entry 10 is U+313C; it previously read 0x314c, which is an initial
   consonant already present in init_to_ucs and broke the otherwise
   consecutive run 0x313a..0x3140.  */
static const uint32_t final_to_ucs[27] =
{
  0,      0,      0x3133, 0,      0x3135, 0x3136, 0,      0,
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, 0,      0,      0x3144, 0,      0,      0,
  0,      0,      0,      0,      0,      0
};
74 | ||
75 | /* The following three arrays are used to convert | |
76 | precomposed Hangul syllables in [0xac00,0xd???] | |
77 | to Jamo bit patterns for Johab encoding | |
78 | ||
79 | cf. : KS C 5601-1992, Annex3 Table 2 | |
80 | ||
81 | Arrays are used to speed up things although it's possible | |
82 | to get the same result arithmetically. | |
83 | ||
84 | */ | |
/* Johab bit pattern contributed by each of the 19 initial consonants
   of a precomposed syllable: bit 15 set, plus the 5-bit consonant code
   (2..20) placed in bits 10-14.  */
static const int init_to_bit[19] =
{
  0x8000 | ( 2 << 10), 0x8000 | ( 3 << 10), 0x8000 | ( 4 << 10),
  0x8000 | ( 5 << 10), 0x8000 | ( 6 << 10), 0x8000 | ( 7 << 10),
  0x8000 | ( 8 << 10), 0x8000 | ( 9 << 10), 0x8000 | (10 << 10),
  0x8000 | (11 << 10), 0x8000 | (12 << 10), 0x8000 | (13 << 10),
  0x8000 | (14 << 10), 0x8000 | (15 << 10), 0x8000 | (16 << 10),
  0x8000 | (17 << 10), 0x8000 | (18 << 10), 0x8000 | (19 << 10),
  0x8000 | (20 << 10)
};
92 | ||
/* Johab bit pattern contributed by each of the 21 medial vowels of a
   precomposed syllable: the 5-bit vowel code placed in bits 5-9.  The
   codes skip the first two values of every group of eight.  */
static const int mid_to_bit[21] =
{
   3 << 5,  4 << 5,  5 << 5,  6 << 5,  7 << 5,
  10 << 5, 11 << 5, 12 << 5, 13 << 5, 14 << 5, 15 << 5,
  18 << 5, 19 << 5, 20 << 5, 21 << 5, 22 << 5, 23 << 5,
  26 << 5, 27 << 5, 28 << 5, 29 << 5
};
100 | ||
/* Johab bit pattern (bits 0-4) contributed by the trailing consonant
   of a precomposed syllable.  Index 0 means "no trailing consonant"
   and yields the filler code 0x01; code 0x12 is skipped.  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16,
  0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
106 | ||
107 | /* The conversion table from | |
108 | UCS4 Hangul Compatibility Jamo in [0x3131,0x3163] | |
109 | to Johab | |
110 | ||
111 | cf. 1. KS C 5601-1992 Annex 3 Table 2 | |
112 | 2. Unicode 2.0 manual | |
113 | ||
114 | */ | |
/* Johab code of each UCS4 Hangul Compatibility Jamo, indexed by
   (code point - 0x3131).  An initial consonant is encoded as
   init_to_bit[n] | 0x41 (filler vowel, filler trailing consonant); a
   trailing-only cluster as 0x8440 | trailing code; a vowel as
   0x8401 | vowel code << 5.

   Three entries were previously mistyped and are corrected here:
     U+313E  0x884e -> 0x844e   (trailing code 0x0e with filler initial)
     U+313F  0x884f -> 0x844f   (trailing code 0x0f with filler initial)
     U+314D  0xca41 -> 0xcc41   (init_to_bit[17] | 0x41)  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* U+3131..U+3138 */
  0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, 0x9841,
  /* U+3139..U+3140 */
  0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  /* U+3141..U+3148 */
  0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, 0xb841,
  /* U+3149..U+314E */
  0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  /* U+314F..U+3153 */
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  /* U+3154..U+3159 */
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  /* U+315A..U+315F */
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  /* U+3160..U+3163 */
  0x8741, 0x8761, 0x8781, 0x87a1
};
132 | ||
133 | ||
8619129f UD |
134 | static inline uint32_t |
135 | johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2) | |
a44d2393 UD |
136 | { |
137 | if (idx <= 0xdefe) | |
8619129f UD |
138 | return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2 |
139 | - (c2 > 0x90 ? 0x43 : 0x31)]; | |
a44d2393 | 140 | else |
8619129f UD |
141 | return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2 |
142 | - (c2 > 0x90 ? 0x43 : 0x31)]; | |
a44d2393 UD |
143 | } |
144 | ||
/* Convert the UCS4 Hanja CH to its two-byte Johab code.  Returns 0 if
   ucs4_to_ksc5601_hanja reports that CH has no KS C 5601 mapping.  */
static uint16_t
johab_hanja_from_ucs (uint32_t ch)
{
  uint16_t ksc;
  int row, col;

  if (!ucs4_to_ksc5601_hanja (ch, &ksc))
    return 0;

  /* Hanja begin at the 42nd row.  42 = 0x2a, and 0x2a + 0x20 = 0x4a.  */
  row = ksc / 256 - 0x4a;
  col = ksc % 256 + 0x80;

  /* Two KS C 5601 rows share one Johab lead byte.  An odd row keeps
     its trail byte as is; an even row is moved down so that its trail
     bytes start at 0x31, or at 0x91 once COL exceeds 0xee.  */
  if (row % 2 == 0)
    col += (col > 0xee ? 0x43 : 0x31) - 0xa1;

  return (row / 2) * 256 + 0xe000 + col;
}
162 | ||
/* Convert the UCS4 symbol CH to its two-byte Johab code.  Returns 0 if
   ucs4_to_ksc5601_sym reports that CH has no KS C 5601 mapping.  */
static uint16_t
johab_sym_from_ucs (uint32_t ch)
{
  uint16_t ksc;
  int row, col;

  if (!ucs4_to_ksc5601_sym (ch, &ksc))
    return 0;

  /* Rows are counted from lead byte 0x21 (presumably row 1 + 0x20,
     by analogy with the Hanja case -- TODO confirm).  */
  row = ksc / 256 - 0x21;
  col = ksc % 256 + 0x80;

  /* Two KS C 5601 rows share one Johab lead byte.  An odd row keeps
     its trail byte as is; an even row is moved down so that its trail
     bytes start at 0x31, or at 0x91 once COL exceeds 0xee.  */
  if (row % 2 == 0)
    col += (col > 0xee ? 0x43 : 0x31) - 0xa1;

  return (row / 2) * 256 + 0xd900 + col;
}
180 | ||
181 | ||
a44d2393 | 182 | static inline void |
8619129f | 183 | johab_from_ucs4 (uint32_t ch, unsigned char *cp) |
a44d2393 UD |
184 | { |
185 | if (ch >= 0x7f) | |
186 | { | |
187 | int idx; | |
188 | ||
189 | if (ch >= 0xac00 && ch <= 0xd7a3) | |
190 | { | |
191 | ch -= 0xac00; | |
192 | idx = init_to_bit[ch / 588]; /* 21*28 = 588 */ | |
193 | idx += mid_to_bit[(ch / 28) % 21]; /* (ch % (21 * 28)) / 28 */ | |
194 | idx += final_to_bit[ch % 28]; /* (ch % (21 * 28)) % 28 */ | |
195 | } | |
196 | /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164) | |
197 | as symbol */ | |
198 | else if (ch >= 0x3131 && ch <= 0x3163) | |
199 | idx = jamo_from_ucs_table[ch - 0x3131]; | |
200 | else if (ch >= 0x4e00 && ch <= 0x9fa5 | |
201 | || ch >= 0xf900 && ch <= 0xfa0b) | |
202 | idx = johab_hanja_from_ucs (ch); | |
203 | /* Half-width Korean Currency Won Sign | |
204 | else if ( ch == 0x20a9 ) | |
205 | idx = 0x5c00; | |
206 | */ | |
207 | else | |
208 | idx = johab_sym_from_ucs (ch); | |
209 | ||
8619129f UD |
210 | cp[0] = (unsigned char) (idx / 256); |
211 | cp[1] = (unsigned char) (idx & 0xff); | |
a44d2393 UD |
212 | |
213 | } | |
214 | else | |
215 | { | |
8619129f UD |
216 | cp[0] = (unsigned char) ch; |
217 | cp[1] = 0; | |
a44d2393 | 218 | } |
a44d2393 UD |
219 | } |
220 | ||
221 | ||
/* Parameters for the body of the `gconv' function.  The byte counts
   are per character: a Johab character occupies one or two bytes, a
   UCS4 character four.  */
#define CHARSET_NAME		"JOHAB//"
#define FROM_LOOP		from_johab
#define TO_LOOP			to_johab
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		2
#define MIN_NEEDED_TO		4
231 | ||
232 | ||
/* First define the conversion function from JOHAB to UCS4.

   Johab layout handled by BODY:
     1. Hangul
	  1st byte : 0x84-0xd3
	  2nd byte : 0x41-0x7e, 0x81-0xfe
     2. Hanja & Symbol
	  1st byte : 0xd8-0xde, 0xe0-0xf9
	  2nd byte : 0x31-0x7e, 0x91-0xfe
	0xd831-0xd87e and 0xd891-0xd8fe are the user-defined area and
	are rejected as illegal input.

   Bytes below 0x7f are passed through unchanged; in particular 0x5c is
   not mapped to the half-width WON SIGN U+20A9.

   BODY reads from `inptr' (bounded by `inend'), writes one UCS4 value
   through `outptr', and reports problems by setting `result' and
   leaving the loop with `break'.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    \
    if (ch < 0x7f) \
      /* Plain ASCII.  */ \
      ++inptr; \
    else \
      { \
        if (ch > 0xf9 || ch == 0xdf || (ch > 0x7e && ch < 0x84) \
            || (ch > 0xd3 && ch < 0xd9)) \
          { \
            /* These lead bytes are illegal.  */ \
            result = GCONV_ILLEGAL_INPUT; \
            break; \
          } \
        else \
          { \
            /* Two-byte character.  First test whether the next \
               byte is also available.  */ \
            uint32_t ch2; \
            uint_fast32_t idx; \
            \
            if (NEED_LENGTH_TEST && inptr + 1 >= inend) \
              { \
                /* The second byte is not available.  */ \
                result = GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            ch2 = inptr[1]; \
            idx = ch * 256 + ch2; \
            if (ch <= 0xd3) \
              { \
                /* Hangul.  The lookup tables hold -1 for invalid bit \
                   patterns, so the fields are kept in signed ints.  */ \
                int i, m, f; \
                \
                i = init[(idx & 0x7c00) >> 10]; \
                m = mid[(idx & 0x03e0) >> 5]; \
                f = final[idx & 0x001f]; \
                \
                if (i == -1 || m == -1 || f == -1) \
                  { \
                    /* This is illegal.  */ \
                    result = GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
                else if (i > 0 && m > 0) \
                  /* Complete syllable, with or without final.  */ \
                  ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
                else if (i > 0 && m == 0 && f == 0) \
                  /* Lone initial consonant.  */ \
                  ch = init_to_ucs[i - 1]; \
                else if (i == 0 && m > 0 && f == 0) \
                  /* Lone vowel: 0x314f + m - 1.  */ \
                  ch = 0x314e + m; \
                else if (i == 0 && m == 0 && f > 0) \
                  /* Lone final consonant; 0 if it has no mapping.  */ \
                  ch = final_to_ucs[f - 1]; \
                else \
                  { \
                    /* This is illegal.  */ \
                    result = GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
              } \
            else \
              { \
                if (ch2 < 0x31 || (ch2 > 0x7e && ch2 < 0x91) \
                    || ch2 == 0xff) \
                  { \
                    /* This is illegal.  */ \
                    result = GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
                else if (ch == 0xda && ch2 > 0xa0 && ch2 < 0xd4) \
                  { \
                    /* This is illegal.  Modern Hangul Jaso is defined \
                       elsewhere in Johab.  */ \
                    result = GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
                else \
                  ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
              } \
          } \
        \
        if (ch == 0) \
          { \
            /* No UCS4 mapping: this is an illegal character.  */ \
            result = GCONV_ILLEGAL_INPUT; \
            break; \
          } \
        \
        inptr += 2; \
      } \
    \
    /* Store the result without relying on the cast-as-lvalue \
       extension.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += 4; \
  }
357 | #include <iconv/loop.c> | |
358 | ||
359 | ||
/* Next, define the other direction: UCS4 to JOHAB.

   BODY reads one UCS4 value from `inptr', writes one or two bytes
   through `outptr' (bounded by `outend'), and reports problems by
   setting `result' and leaving the loop with `break'.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  { \
    uint32_t ucs = *((uint32_t *) inptr); \
    unsigned char johab[2]; \
    \
    johab_from_ucs4 (ucs, johab); \
    \
    /* A zero lead byte is only legitimate for U+0000 itself.  */ \
    if (ucs != 0 && johab[0] == '\0') \
      { \
        /* Illegal character.  */ \
        result = GCONV_ILLEGAL_INPUT; \
        break; \
      } \
    \
    *outptr++ = johab[0]; \
    \
    /* A non-zero trail byte means a two-byte character; write it \
       if there is room.  */ \
    if (johab[1] != '\0') \
      { \
        if (NEED_LENGTH_TEST && outptr >= outend) \
          { \
            /* The result does not fit into the buffer: take the \
               lead byte back.  */ \
            --outptr; \
            result = GCONV_FULL_OUTPUT; \
            break; \
          } \
        *outptr++ = johab[1]; \
      } \
    \
    inptr += 4; \
  }
407 | #include <iconv/loop.c> | |
408 | ||
409 | ||
410 | /* Now define the toplevel functions. */ | |
411 | #include <iconv/skeleton.c> |