]>
Commit | Line | Data |
---|---|---|
93693c4d | 1 | /* Convert using charmaps and possibly iconv(). |
ec09c1c4 | 2 | Copyright (C) 2001, 2005, 2006, 2008, 2012 Free Software Foundation, Inc. |
93693c4d UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@redhat.com>, 2001. | |
5 | ||
43bc8ac6 | 6 | This program is free software; you can redistribute it and/or modify |
2e2efe65 RM |
7 | it under the terms of the GNU General Public License as published |
8 | by the Free Software Foundation; version 2 of the License, or | |
9 | (at your option) any later version. | |
93693c4d | 10 | |
43bc8ac6 | 11 | This program is distributed in the hope that it will be useful, |
93693c4d | 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
43bc8ac6 UD |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | GNU General Public License for more details. | |
93693c4d | 15 | |
43bc8ac6 UD |
16 | You should have received a copy of the GNU General Public License |
17 | along with this program; if not, write to the Free Software Foundation, | |
18 | Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ | |
93693c4d UD |
19 | |
20 | #include <assert.h> | |
21 | #include <errno.h> | |
22 | #include <error.h> | |
23 | #include <fcntl.h> | |
24 | #include <iconv.h> | |
25 | #include <libintl.h> | |
26 | #include <stdio.h> | |
27 | #include <stdlib.h> | |
28 | #include <unistd.h> | |
29 | #include <sys/mman.h> | |
30 | #include <sys/stat.h> | |
31 | ||
32 | #include "iconv_prog.h" | |
33 | ||
34 | ||
35 | /* Prototypes for a few program-wide used functions. */ | |
ec09c1c4 UD |
36 | extern void *xmalloc (size_t n) |
37 | __attribute_malloc__ __attribute_alloc_size (1); | |
38 | extern void *xcalloc (size_t n, size_t s) | |
39 | __attribute_malloc__ __attribute_alloc_size (1, 2); | |
93693c4d UD |
40 | |
41 | ||
/* One node of the byte-indexed conversion trie.  For an input byte
   IDX, VAL[IDX] is interpreted according to the matching bit in TERM:
   if the bit is set VAL[IDX].out is the output sequence for the input
   ending in that byte, otherwise VAL[IDX].sub is the next trie level
   (or NULL if no input sequence continues with that byte).  */
struct convtable
{
  /* Bitmap with one bit per possible input byte.  The accessors
     is_term, set_term and clear_term address eight bits per element
     (idx / 8, idx % 8), so a byte-sized element type is sufficient.  */
  unsigned char term[256 / 8];
  union
  {
    struct convtable *sub;
    struct charseq *out;
  } val[256];
};
51 | ||
52 | ||
53 | static inline struct convtable * | |
54 | allocate_table (void) | |
55 | { | |
56 | return (struct convtable *) xcalloc (1, sizeof (struct convtable)); | |
57 | } | |
58 | ||
59 | ||
60 | static inline int | |
61 | is_term (struct convtable *tbl, unsigned int idx) | |
62 | { | |
63 | return tbl->term[idx / 8] & (1 << (idx % 8)); | |
64 | } | |
65 | ||
66 | ||
67 | static inline void | |
68 | clear_term (struct convtable *tbl, unsigned int idx) | |
69 | { | |
70 | tbl->term[idx / 8] &= ~(1 << (idx % 8)); | |
71 | } | |
72 | ||
73 | ||
74 | static inline void | |
75 | set_term (struct convtable *tbl, unsigned int idx) | |
76 | { | |
77 | tbl->term[idx / 8] |= 1 << (idx % 8); | |
78 | } | |
79 | ||
80 | ||
81 | /* Generate the conversion table. */ | |
82 | static struct convtable *use_from_charmap (struct charmap_t *from_charmap, | |
83 | const char *to_code); | |
84 | static struct convtable *use_to_charmap (const char *from_code, | |
85 | struct charmap_t *to_charmap); | |
86 | static struct convtable *use_both_charmaps (struct charmap_t *from_charmap, | |
87 | struct charmap_t *to_charmap); | |
88 | ||
89 | /* Prototypes for the functions doing the actual work. */ | |
90 | static int process_block (struct convtable *tbl, char *addr, size_t len, | |
91 | FILE *output); | |
92 | static int process_fd (struct convtable *tbl, int fd, FILE *output); | |
93 | static int process_file (struct convtable *tbl, FILE *input, FILE *output); | |
94 | ||
95 | ||
96 | int | |
97 | charmap_conversion (const char *from_code, struct charmap_t *from_charmap, | |
98 | const char *to_code, struct charmap_t *to_charmap, | |
5484ff51 UD |
99 | int argc, int remaining, char *argv[], |
100 | const char *output_file) | |
93693c4d UD |
101 | { |
102 | struct convtable *cvtbl; | |
103 | int status = EXIT_SUCCESS; | |
104 | ||
105 | /* We have three different cases to handle: | |
106 | ||
107 | - both, from_charmap and to_charmap, are available. This means we | |
108 | can assume that the symbolic names match and use them to create | |
109 | the mapping. | |
110 | ||
111 | - only from_charmap is available. In this case we can only hope that | |
112 | the symbolic names used are of the <Uxxxx> form in which case we | |
113 | can use a UCS4->"to_code" iconv() conversion for the second step. | |
114 | ||
115 | - only to_charmap is available. This is similar, only that we would | |
116 | use iconv() for the "to_code"->UCS4 conversion. | |
117 | ||
118 | We first create a table which maps input bytes into output bytes. | |
119 | Once this is done we can handle all three of the cases above | |
120 | equally. */ | |
121 | if (from_charmap != NULL) | |
122 | { | |
123 | if (to_charmap == NULL) | |
124 | cvtbl = use_from_charmap (from_charmap, to_code); | |
125 | else | |
126 | cvtbl = use_both_charmaps (from_charmap, to_charmap); | |
127 | } | |
128 | else | |
129 | { | |
130 | assert (to_charmap != NULL); | |
131 | cvtbl = use_to_charmap (from_code, to_charmap); | |
132 | } | |
133 | ||
134 | /* If we couldn't generate a table stop now. */ | |
135 | if (cvtbl == NULL) | |
136 | return EXIT_FAILURE; | |
137 | ||
5484ff51 UD |
138 | /* Determine output file. */ |
139 | FILE *output; | |
140 | if (output_file != NULL && strcmp (output_file, "-") != 0) | |
141 | { | |
142 | output = fopen (output_file, "w"); | |
143 | if (output == NULL) | |
144 | error (EXIT_FAILURE, errno, _("cannot open output file")); | |
145 | } | |
146 | else | |
147 | output = stdout; | |
148 | ||
93693c4d UD |
149 | /* We can now start the conversion. */ |
150 | if (remaining == argc) | |
151 | { | |
152 | if (process_file (cvtbl, stdin, output) != 0) | |
153 | status = EXIT_FAILURE; | |
154 | } | |
155 | else | |
156 | do | |
157 | { | |
158 | struct stat st; | |
159 | char *addr; | |
160 | int fd; | |
161 | ||
162 | if (verbose) | |
163 | printf ("%s:\n", argv[remaining]); | |
164 | if (strcmp (argv[remaining], "-") == 0) | |
165 | fd = 0; | |
166 | else | |
167 | { | |
168 | fd = open (argv[remaining], O_RDONLY); | |
169 | ||
170 | if (fd == -1) | |
171 | { | |
172 | error (0, errno, _("cannot open input file `%s'"), | |
173 | argv[remaining]); | |
174 | status = EXIT_FAILURE; | |
175 | continue; | |
176 | } | |
177 | } | |
178 | ||
179 | #ifdef _POSIX_MAPPED_FILES | |
180 | /* We have possibilities for reading the input file. First try | |
181 | to mmap() it since this will provide the fastest solution. */ | |
182 | if (fstat (fd, &st) == 0 | |
183 | && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, | |
184 | fd, 0)) != MAP_FAILED)) | |
185 | { | |
186 | /* Yes, we can use mmap(). The descriptor is not needed | |
187 | anymore. */ | |
188 | if (close (fd) != 0) | |
189 | error (EXIT_FAILURE, errno, | |
190 | _("error while closing input `%s'"), argv[remaining]); | |
191 | ||
192 | if (process_block (cvtbl, addr, st.st_size, output) < 0) | |
193 | { | |
194 | /* Something went wrong. */ | |
195 | status = EXIT_FAILURE; | |
196 | ||
197 | /* We don't need the input data anymore. */ | |
198 | munmap ((void *) addr, st.st_size); | |
199 | ||
200 | /* We cannot go on with producing output since it might | |
201 | lead to problem because the last output might leave | |
202 | the output stream in an undefined state. */ | |
203 | break; | |
204 | } | |
205 | ||
206 | /* We don't need the input data anymore. */ | |
207 | munmap ((void *) addr, st.st_size); | |
208 | } | |
209 | else | |
210 | #endif /* _POSIX_MAPPED_FILES */ | |
211 | { | |
212 | /* Read the file in pieces. */ | |
213 | if (process_fd (cvtbl, fd, output) != 0) | |
214 | { | |
215 | /* Something went wrong. */ | |
216 | status = EXIT_FAILURE; | |
217 | ||
218 | /* We don't need the input file anymore. */ | |
219 | close (fd); | |
220 | ||
221 | /* We cannot go on with producing output since it might | |
222 | lead to problem because the last output might leave | |
223 | the output stream in an undefined state. */ | |
224 | break; | |
225 | } | |
226 | ||
227 | /* Now close the file. */ | |
228 | close (fd); | |
229 | } | |
230 | } | |
231 | while (++remaining < argc); | |
232 | ||
233 | /* All done. */ | |
234 | return status; | |
235 | } | |
236 | ||
237 | ||
238 | static void | |
239 | add_bytes (struct convtable *tbl, struct charseq *in, struct charseq *out) | |
240 | { | |
241 | int n = 0; | |
242 | unsigned int byte; | |
243 | ||
244 | assert (in->nbytes > 0); | |
245 | ||
246 | byte = ((unsigned char *) in->bytes)[n]; | |
247 | while (n + 1 < in->nbytes) | |
248 | { | |
249 | if (is_term (tbl, byte) || tbl->val[byte].sub == NULL) | |
250 | { | |
251 | /* Note that we simply ignore a definition for a byte sequence | |
252 | which is also the prefix for a longer one. */ | |
253 | clear_term (tbl, byte); | |
254 | tbl->val[byte].sub = | |
255 | (struct convtable *) xcalloc (1, sizeof (struct convtable)); | |
256 | } | |
257 | ||
258 | tbl = tbl->val[byte].sub; | |
259 | ||
260 | byte = ((unsigned char *) in->bytes)[++n]; | |
261 | } | |
262 | ||
263 | /* Only add the new sequence if there is none yet and the byte sequence | |
264 | is not part of an even longer one. */ | |
265 | if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL) | |
266 | { | |
267 | set_term (tbl, byte); | |
268 | tbl->val[byte].out = out; | |
269 | } | |
270 | } | |
271 | ||
272 | ||
273 | static struct convtable * | |
274 | use_from_charmap (struct charmap_t *from_charmap, const char *to_code) | |
275 | { | |
276 | /* We iterate over all entries in the from_charmap and for those which | |
277 | have a known UCS4 representation we use an iconv() call to determine | |
278 | the mapping to the to_code charset. */ | |
279 | struct convtable *rettbl; | |
280 | iconv_t cd; | |
281 | void *ptr = NULL; | |
282 | const void *key; | |
283 | size_t keylen; | |
284 | void *data; | |
285 | ||
286 | cd = iconv_open (to_code, "WCHAR_T"); | |
287 | if (cd == (iconv_t) -1) | |
288 | /* We cannot do anything. */ | |
289 | return NULL; | |
290 | ||
291 | rettbl = allocate_table (); | |
292 | ||
293 | while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data) | |
294 | >= 0) | |
295 | { | |
296 | struct charseq *in = (struct charseq *) data; | |
297 | ||
298 | if (in->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
299 | { | |
300 | /* There is a chance. Try the iconv module. */ | |
301 | wchar_t inbuf[1] = { in->ucs4 }; | |
302 | unsigned char outbuf[64]; | |
303 | char *inptr = (char *) inbuf; | |
304 | size_t inlen = sizeof (inbuf); | |
305 | char *outptr = (char *) outbuf; | |
306 | size_t outlen = sizeof (outbuf); | |
307 | ||
308 | (void) iconv (cd, &inptr, &inlen, &outptr, &outlen); | |
309 | ||
310 | if (outptr != (char *) outbuf) | |
311 | { | |
312 | /* We got some output. Good, use it. */ | |
313 | struct charseq *newp; | |
314 | ||
315 | outlen = sizeof (outbuf) - outlen; | |
316 | assert ((char *) outbuf + outlen == outptr); | |
317 | ||
318 | newp = (struct charseq *) xmalloc (sizeof (struct charseq) | |
319 | + outlen); | |
320 | newp->name = in->name; | |
321 | newp->ucs4 = in->ucs4; | |
322 | newp->nbytes = outlen; | |
323 | memcpy (newp->bytes, outbuf, outlen); | |
324 | ||
325 | add_bytes (rettbl, in, newp); | |
326 | } | |
327 | ||
328 | /* Clear any possible state left behind. */ | |
329 | (void) iconv (cd, NULL, NULL, NULL, NULL); | |
330 | } | |
331 | } | |
332 | ||
333 | iconv_close (cd); | |
334 | ||
335 | return rettbl; | |
336 | } | |
337 | ||
338 | ||
339 | static struct convtable * | |
340 | use_to_charmap (const char *from_code, struct charmap_t *to_charmap) | |
341 | { | |
342 | /* We iterate over all entries in the to_charmap and for those which | |
343 | have a known UCS4 representation we use an iconv() call to determine | |
344 | the mapping to the from_code charset. */ | |
345 | struct convtable *rettbl; | |
346 | iconv_t cd; | |
347 | void *ptr = NULL; | |
348 | const void *key; | |
349 | size_t keylen; | |
350 | void *data; | |
351 | ||
352 | /* Note that the conversion we use here is the reverse direction. Without | |
353 | exhaustive search we cannot figure out which input yields the UCS4 | |
354 | character we are looking for. Therefore we determine it the other | |
355 | way round. */ | |
356 | cd = iconv_open (from_code, "WCHAR_T"); | |
357 | if (cd == (iconv_t) -1) | |
358 | /* We cannot do anything. */ | |
359 | return NULL; | |
360 | ||
361 | rettbl = allocate_table (); | |
362 | ||
363 | while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data) | |
364 | >= 0) | |
365 | { | |
366 | struct charseq *out = (struct charseq *) data; | |
367 | ||
368 | if (out->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
369 | { | |
370 | /* There is a chance. Try the iconv module. */ | |
371 | wchar_t inbuf[1] = { out->ucs4 }; | |
372 | unsigned char outbuf[64]; | |
373 | char *inptr = (char *) inbuf; | |
374 | size_t inlen = sizeof (inbuf); | |
375 | char *outptr = (char *) outbuf; | |
376 | size_t outlen = sizeof (outbuf); | |
377 | ||
378 | (void) iconv (cd, &inptr, &inlen, &outptr, &outlen); | |
379 | ||
380 | if (outptr != (char *) outbuf) | |
381 | { | |
382 | /* We got some output. Good, use it. */ | |
51e59260 UD |
383 | union |
384 | { | |
385 | struct charseq seq; | |
386 | struct | |
387 | { | |
388 | const char *name; | |
389 | uint32_t ucs4; | |
390 | int nbytes; | |
391 | unsigned char bytes[outlen]; | |
392 | } mem; | |
393 | } new; | |
93693c4d UD |
394 | |
395 | outlen = sizeof (outbuf) - outlen; | |
396 | assert ((char *) outbuf + outlen == outptr); | |
397 | ||
51e59260 UD |
398 | new.mem.name = out->name; |
399 | new.mem.ucs4 = out->ucs4; | |
400 | new.mem.nbytes = outlen; | |
401 | memcpy (new.mem.bytes, outbuf, outlen); | |
93693c4d | 402 | |
51e59260 | 403 | add_bytes (rettbl, &new.seq, out); |
93693c4d UD |
404 | } |
405 | ||
406 | /* Clear any possible state left behind. */ | |
407 | (void) iconv (cd, NULL, NULL, NULL, NULL); | |
408 | } | |
409 | } | |
410 | ||
411 | iconv_close (cd); | |
412 | ||
413 | return rettbl; | |
414 | } | |
415 | ||
416 | ||
417 | static struct convtable * | |
418 | use_both_charmaps (struct charmap_t *from_charmap, | |
419 | struct charmap_t *to_charmap) | |
420 | { | |
421 | /* In this case we iterate over all the entries in the from_charmap, | |
422 | determine the internal name, and find an appropriate entry in the | |
423 | to_charmap (if it exists). */ | |
424 | struct convtable *rettbl = allocate_table (); | |
425 | void *ptr = NULL; | |
426 | const void *key; | |
427 | size_t keylen; | |
428 | void *data; | |
429 | ||
430 | while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data) | |
431 | >= 0) | |
432 | { | |
433 | struct charseq *in = (struct charseq *) data; | |
434 | struct charseq *out = charmap_find_value (to_charmap, key, keylen); | |
435 | ||
436 | if (out != NULL) | |
437 | add_bytes (rettbl, in, out); | |
438 | } | |
439 | ||
440 | return rettbl; | |
441 | } | |
442 | ||
443 | ||
444 | static int | |
445 | process_block (struct convtable *tbl, char *addr, size_t len, FILE *output) | |
446 | { | |
447 | size_t n = 0; | |
448 | ||
449 | while (n < len) | |
450 | { | |
451 | struct convtable *cur = tbl; | |
452 | unsigned char *curp = (unsigned char *) addr; | |
453 | unsigned int byte = *curp; | |
454 | int cnt; | |
455 | struct charseq *out; | |
456 | ||
457 | while (! is_term (cur, byte)) | |
458 | if (cur->val[byte].sub == NULL) | |
459 | { | |
460 | /* This is a invalid sequence. Skip the first byte if we are | |
461 | ignoring errors. Otherwise punt. */ | |
462 | if (! omit_invalid) | |
463 | { | |
464 | error (0, 0, _("illegal input sequence at position %Zd"), n); | |
465 | return -1; | |
466 | } | |
467 | ||
468 | n -= curp - (unsigned char *) addr; | |
469 | ||
470 | byte = *(curp = (unsigned char *) ++addr); | |
471 | if (++n >= len) | |
472 | /* All converted. */ | |
473 | return 0; | |
474 | ||
475 | cur = tbl; | |
476 | } | |
477 | else | |
478 | { | |
479 | cur = cur->val[byte].sub; | |
480 | ||
481 | if (++n >= len) | |
482 | { | |
483 | error (0, 0, _("\ | |
484 | incomplete character or shift sequence at end of buffer")); | |
485 | return -1; | |
486 | } | |
487 | ||
488 | byte = *++curp; | |
489 | } | |
490 | ||
491 | /* We found a final byte. Write the output bytes. */ | |
492 | out = cur->val[byte].out; | |
493 | for (cnt = 0; cnt < out->nbytes; ++cnt) | |
494 | fputc_unlocked (out->bytes[cnt], output); | |
495 | ||
496 | addr = (char *) curp + 1; | |
497 | ++n; | |
498 | } | |
499 | ||
500 | return 0; | |
501 | } | |
502 | ||
503 | ||
/* Read everything available from descriptor FD into a buffer and
   convert it with process_block in one step.  The buffer is static and
   therefore kept (and reused) across calls; it grows in steps of 32768
   bytes whenever it is full.  Returns the result of process_block, or
   -1 after printing a diagnostic if reading or allocating fails.  */
static int
process_fd (struct convtable *tbl, int fd, FILE *output)
{
  /* We have a problem with reading from a descriptor since we must not
     provide the iconv() function an incomplete character or shift
     sequence at the end of the buffer.  Since we have to deal with
     arbitrary encodings we must read the whole text in a buffer and
     process it in one step.  */
  enum { GROW_STEP = 32768 };
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      ssize_t got;

      if (actlen == maxlen)
	{
	  /* The buffer is full (or not yet allocated).  Increase it
	     before trying to read more.  */
	  char *grown = realloc (inbuf, maxlen + GROW_STEP);

	  if (grown == NULL)
	    {
	      error (0, errno, _("unable to allocate buffer for input"));
	      return -1;
	    }

	  inbuf = grown;
	  maxlen += GROW_STEP;
	}

      got = read (fd, inbuf + actlen, maxlen - actlen);

      if (got == 0)
	/* No more text to read.  */
	break;

      if (got == -1)
	{
	  /* Error while reading.  */
	  error (0, errno, _("error while reading the input"));
	  return -1;
	}

      actlen += got;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (tbl, inbuf, actlen, output);
}
581 | ||
582 | ||
/* Convert the contents of the stream INPUT by handing its underlying
   descriptor to process_fd; the stream itself is not read from.  */
static int
process_file (struct convtable *tbl, FILE *input, FILE *output)
{
  /* This should be safe since we use this function only for `stdin' and
     we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (tbl, fd, output);
}