]> git.ipfire.org Git - thirdparty/glibc.git/blob - iconv/gconv_simple.c
Update.
[thirdparty/glibc.git] / iconv / gconv_simple.c
1 /* Simple transformations functions.
2 Copyright (C) 1997, 1998 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21 #include <byteswap.h>
22 #include <endian.h>
23 #include <errno.h>
24 #include <gconv.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <wchar.h>
29 #include <sys/param.h>
30
31 #ifndef EILSEQ
32 # define EILSEQ EINVAL
33 #endif
34
35
/* Lookup tables shared by the UTF-8 converters below.

   encoding_mask[N - 2] has exactly those bits set which must be clear
   in a character for it to fit into an N-byte UTF-8 sequence
   (N = 2 .. 5).  */
static const uint32_t encoding_mask[] =
{
  ~UINT32_C (0x7ff),
  ~UINT32_C (0xffff),
  ~UINT32_C (0x1fffff),
  ~UINT32_C (0x3ffffff)
};

/* encoding_byte[N - 2] is the marker stored in the first byte of an
   N-byte UTF-8 sequence (N = 2 .. 6).  */
static const unsigned char encoding_byte[] =
{
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};
47
48
49
50 int
51 __gconv_transform_dummy (struct gconv_step *step, struct gconv_step_data *data,
52 const char **inbuf, const char *inbufend,
53 size_t *written, int do_flush)
54 {
55 size_t do_write;
56
57 /* We have no stateful encoding. So we don't have to do anything
58 special. */
59 if (do_flush)
60 do_write = 0;
61 else
62 {
63 do_write = MIN (inbufend - *inbuf, data->outbufend - data->outbuf);
64
65 memcpy (data->outbuf, inbuf, do_write);
66
67 *inbuf -= do_write;
68 *data->outbuf += do_write;
69 }
70
71 /* ### TODO Actually, this number must be devided according to the
72 size of the input charset. I.e., if the input is in UCS4 the
73 number of copied bytes must be divided by 4. */
74 if (written != NULL)
75 *written = do_write;
76
77 return GCONV_OK;
78 }
79
80
81 /* Transform from the internal, UCS4-like format, to UCS4. The
82 difference between the internal ucs4 format and the real UCS4
83 format is, if any, the endianess. The Unicode/ISO 10646 says that
84 unless some higher protocol specifies it differently, the byte
85 order is big endian.*/
86 #define DEFINE_INIT 0
87 #define DEFINE_FINI 0
88 #define MIN_NEEDED_FROM 4
89 #define MIN_NEEDED_TO 4
90 #define FROM_DIRECTION 1
91 #define FROM_LOOP internal_ucs4_loop
92 #define TO_LOOP internal_ucs4_loop /* This is not used. */
93 #define FUNCTION_NAME __gconv_transform_internal_ucs4
94
95
96 static inline int
97 internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
98 unsigned char **outptrp, unsigned char *outend,
99 mbstate_t *state, void *data, size_t *converted)
100 {
101 const unsigned char *inptr = *inptrp;
102 unsigned char *outptr = *outptrp;
103 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
104 int result;
105
106 #if __BYTE_ORDER == __LITTLE_ENDIAN
107 /* Sigh, we have to do some real work. */
108 size_t cnt;
109
110 for (cnt = 0; cnt < n_convert; ++cnt)
111 *((uint32_t *) outptr)++ = bswap_32 (*((uint32_t *) inptr)++);
112
113 *inptrp = inptr;
114 *outptrp = outptr;
115 #elif __BYTE_ORDER == __BIG_ENDIAN
116 /* Simply copy the data. */
117 *inptrp = inptr + n_convert * 4;
118 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
119 #else
120 # error "This endianess is not supported."
121 #endif
122
123 /* Determine the status. */
124 if (*outptrp == outend)
125 result = GCONV_FULL_OUTPUT;
126 else if (*inptrp == inend)
127 result = GCONV_EMPTY_INPUT;
128 else
129 result = GCONV_INCOMPLETE_INPUT;
130
131 if (converted != NULL)
132 converted += n_convert;
133
134 return result;
135 }
136
137 #include <iconv/skeleton.c>
138
139
140 /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */
141 #define DEFINE_INIT 0
142 #define DEFINE_FINI 0
143 #define MIN_NEEDED_FROM 1
144 #define MIN_NEEDED_TO 4
145 #define FROM_DIRECTION 1
146 #define FROM_LOOP ascii_internal_loop
147 #define TO_LOOP ascii_internal_loop /* This is not used. */
148 #define FUNCTION_NAME __gconv_transform_ascii_internal
149
150 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
151 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
152 #define LOOPFCT FROM_LOOP
153 #define BODY \
154 { \
155 if (*inptr > '\x7f') \
156 { \
157 /* This is no correct ANSI_X3.4-1968 character. */ \
158 result = GCONV_ILLEGAL_INPUT; \
159 break; \
160 } \
161 \
162 /* It's an one byte sequence. */ \
163 *((uint32_t *) outptr)++ = *inptr++; \
164 }
165 #include <iconv/loop.c>
166 #include <iconv/skeleton.c>
167
168
169 /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */
170 #define DEFINE_INIT 0
171 #define DEFINE_FINI 0
172 #define MIN_NEEDED_FROM 4
173 #define MIN_NEEDED_TO 1
174 #define FROM_DIRECTION 1
175 #define FROM_LOOP internal_ascii_loop
176 #define TO_LOOP internal_ascii_loop /* This is not used. */
177 #define FUNCTION_NAME __gconv_transform_internal_ascii
178
179 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
180 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
181 #define LOOPFCT FROM_LOOP
182 #define BODY \
183 { \
184 if (*((uint32_t *) inptr) > 0x7f) \
185 { \
186 /* This is no correct ANSI_X3.4-1968 character. */ \
187 result = GCONV_ILLEGAL_INPUT; \
188 break; \
189 } \
190 \
191 /* It's an one byte sequence. */ \
192 *outptr++ = *((uint32_t *) inptr)++; \
193 }
194 #include <iconv/loop.c>
195 #include <iconv/skeleton.c>
196
197
198 /* Convert from the internal (UCS4-like) format to UTF-8. */
199 #define DEFINE_INIT 0
200 #define DEFINE_FINI 0
201 #define MIN_NEEDED_FROM 4
202 #define MIN_NEEDED_TO 1
203 #define MAX_NEEDED_TO 6
204 #define FROM_DIRECTION 1
205 #define FROM_LOOP internal_utf8_loop
206 #define TO_LOOP internal_utf8_loop /* This is not used. */
207 #define FUNCTION_NAME __gconv_transform_internal_utf8
208
209 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
210 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
211 #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
212 #define LOOPFCT FROM_LOOP
213 #define BODY \
214 { \
215 uint32_t wc = *((uint32_t *) inptr); \
216 \
217 /* Since we control every character we read this cannot happen. */ \
218 assert (wc <= 0x7fffffff); \
219 \
220 if (wc < 0x80) \
221 /* It's an one byte sequence. */ \
222 *outptr++ = (unsigned char) wc; \
223 else \
224 { \
225 size_t step; \
226 char *start; \
227 \
228 for (step = 2; step < 6; ++step) \
229 if ((wc & encoding_mask[step - 2]) == 0) \
230 break; \
231 \
232 if (outptr + step >= outend) \
233 { \
234 /* Too long. */ \
235 result = GCONV_FULL_OUTPUT; \
236 break; \
237 } \
238 \
239 start = outptr; \
240 *outptr = encoding_byte[step - 2]; \
241 outptr += step; \
242 --step; \
243 do \
244 { \
245 start[step] = 0x80 | (wc & 0x3f); \
246 wc >>= 6; \
247 } \
248 while (--step > 0); \
249 start[0] |= wc; \
250 } \
251 \
252 inptr += 4; \
253 }
254 #include <iconv/loop.c>
255 #include <iconv/skeleton.c>
256
257
258 /* Convert from UTF-8 to the internal (UCS4-like) format. */
259 #define DEFINE_INIT 0
260 #define DEFINE_FINI 0
261 #define MIN_NEEDED_FROM 1
262 #define MAX_NEEDED_FROM 6
263 #define MIN_NEEDED_TO 4
264 #define FROM_DIRECTION 1
265 #define FROM_LOOP utf8_internal_loop
266 #define TO_LOOP utf8_internal_loop /* This is not used. */
267 #define FUNCTION_NAME __gconv_transform_utf8_internal
268
269 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
270 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
271 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
272 #define LOOPFCT FROM_LOOP
273 #define BODY \
274 { \
275 uint32_t ch; \
276 uint_fast32_t cnt; \
277 uint_fast32_t i; \
278 \
279 /* Next input byte. */ \
280 ch = *inptr; \
281 \
282 if (ch < 0x80) \
283 { \
284 /* One byte sequence. */ \
285 cnt = 1; \
286 ++inptr; \
287 } \
288 else \
289 { \
290 if ((ch & 0xe0) == 0xc0) \
291 { \
292 cnt = 2; \
293 ch &= 0x1f; \
294 } \
295 else if ((ch & 0xf0) == 0xe0) \
296 { \
297 /* We expect three bytes. */ \
298 cnt = 3; \
299 ch &= 0x0f; \
300 } \
301 else if ((ch & 0xf8) == 0xf0) \
302 { \
303 /* We expect four bytes. */ \
304 cnt = 4; \
305 ch &= 0x07; \
306 } \
307 else if ((ch & 0xfc) == 0xf8) \
308 { \
309 /* We expect five bytes. */ \
310 cnt = 5; \
311 ch &= 0x03; \
312 } \
313 else if ((ch & 0xfe) == 0xfc) \
314 { \
315 /* We expect six bytes. */ \
316 cnt = 6; \
317 ch &= 0x01; \
318 } \
319 else \
320 { \
321 /* This is an illegal encoding. */ \
322 result = GCONV_ILLEGAL_INPUT; \
323 break; \
324 } \
325 \
326 if (NEED_LENGTH_TEST && inptr + cnt > inend) \
327 { \
328 /* We don't have enough input. */ \
329 result = GCONV_INCOMPLETE_INPUT; \
330 break; \
331 } \
332 \
333 /* Read the possible remaining bytes. */ \
334 for (i = 1; i < cnt; ++i) \
335 { \
336 uint32_t byte = inptr[i]; \
337 \
338 if ((byte & 0xc0) != 0x80) \
339 { \
340 /* This is an illegal encoding. */ \
341 result = GCONV_ILLEGAL_INPUT; \
342 break; \
343 } \
344 \
345 ch <<= 6; \
346 ch |= byte & 0x3f; \
347 } \
348 inptr += cnt; \
349 } \
350 \
351 /* Now adjust the pointers and store the result. */ \
352 *((uint32_t *) outptr)++ = ch; \
353 }
354 #include <iconv/loop.c>
355 #include <iconv/skeleton.c>
356
357
358 /* Convert from UCS2 to the internal (UCS4-like) format. */
359 #define DEFINE_INIT 0
360 #define DEFINE_FINI 0
361 #define MIN_NEEDED_FROM 2
362 #define MIN_NEEDED_TO 4
363 #define FROM_DIRECTION 1
364 #define FROM_LOOP ucs2_internal_loop
365 #define TO_LOOP ucs2_internal_loop /* This is not used. */
366 #define FUNCTION_NAME __gconv_transform_ucs2_internal
367
368 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
369 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
370 #define LOOPFCT FROM_LOOP
371 #if __BYTE_ORDER == __LITTLE_ENDIAN
372 # define BODY \
373 *((uint32_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)++);
374 #else
375 # define BODY \
376 *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
377 #endif
378 #include <iconv/loop.c>
379 #include <iconv/skeleton.c>
380
381
382 /* Convert from the internal (UCS4-like) format to UCS2. */
383 #define DEFINE_INIT 0
384 #define DEFINE_FINI 0
385 #define MIN_NEEDED_FROM 4
386 #define MIN_NEEDED_TO 2
387 #define FROM_DIRECTION 1
388 #define FROM_LOOP internal_ucs2_loop
389 #define TO_LOOP internal_ucs2_loop /* This is not used. */
390 #define FUNCTION_NAME __gconv_transform_internal_ucs2
391
392 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
393 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
394 #define LOOPFCT FROM_LOOP
395 #if __BYTE_ORDER == __LITTLE_ENDIAN
396 # define BODY \
397 { \
398 if (*((uint32_t *) inptr) >= 0x10000) \
399 { \
400 result = GCONV_ILLEGAL_INPUT; \
401 break; \
402 } \
403 /* Please note that we use the `uint32_t' from-pointer as an `uint16_t' \
404 pointer which works since we are on a little endian machine. */ \
405 *((uint16_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)); \
406 inptr += 4; \
407 }
408 #else
409 # define BODY \
410 { \
411 if (*((uint32_t *) inptr) >= 0x10000) \
412 { \
413 result = GCONV_ILLEGAL_INPUT; \
414 break; \
415 } \
416 *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
417 }
418 #endif
419 #include <iconv/loop.c>
420 #include <iconv/skeleton.c>
421
422
423 /* Convert from UCS2 in little endian to the internal (UCS4-like) format. */
424 #define DEFINE_INIT 0
425 #define DEFINE_FINI 0
426 #define MIN_NEEDED_FROM 2
427 #define MIN_NEEDED_TO 4
428 #define FROM_DIRECTION 1
429 #define FROM_LOOP ucs2little_internal_loop
430 #define TO_LOOP ucs2little_internal_loop /* This is not used.*/
431 #define FUNCTION_NAME __gconv_transform_ucs2little_internal
432
433 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
434 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
435 #define LOOPFCT FROM_LOOP
436 #if __BYTE_ORDER == __LITTLE_ENDIAN
437 # define BODY \
438 *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
439 #else
440 # define BODY \
441 *((uint32_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)++);
442 #endif
443 #include <iconv/loop.c>
444 #include <iconv/skeleton.c>
445
446
447 /* Convert from the internal (UCS4-like) format to UCS2 in little endian. */
448 #define DEFINE_INIT 0
449 #define DEFINE_FINI 0
450 #define MIN_NEEDED_FROM 4
451 #define MIN_NEEDED_TO 2
452 #define FROM_DIRECTION 1
453 #define FROM_LOOP internal_ucs2little_loop
454 #define TO_LOOP internal_ucs2little_loop /* This is not used.*/
455 #define FUNCTION_NAME __gconv_transform_internal_ucs2little
456
457 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
458 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
459 #define LOOPFCT FROM_LOOP
460 #if __BYTE_ORDER == __LITTLE_ENDIAN
461 # define BODY \
462 { \
463 if (*((uint32_t *) inptr) >= 0x10000) \
464 { \
465 result = GCONV_ILLEGAL_INPUT; \
466 break; \
467 } \
468 *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
469 }
470 #else
471 # define BODY \
472 { \
473 if (*((uint32_t *) inptr) >= 0x10000) \
474 { \
475 result = GCONV_ILLEGAL_INPUT; \
476 break; \
477 } \
478 /* Please note that we use the `uint32_t' from-pointer as an `uint16_t' \
479 pointer which works since we are on a little endian machine. */ \
480 *((uint16_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)); \
481 inptr += 4; \
482 }
483 #endif
484 #include <iconv/loop.c>
485 #include <iconv/skeleton.c>