]> git.ipfire.org Git - thirdparty/glibc.git/blob - iconv/iconv_prog.c
Update.
[thirdparty/glibc.git] / iconv / iconv_prog.c
1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998,1999,2000,2001,2002,2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 #include <argp.h>
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <error.h>
26 #include <fcntl.h>
27 #include <iconv.h>
28 #include <langinfo.h>
29 #include <locale.h>
30 #include <search.h>
31 #include <stdbool.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include <libintl.h>
37 #ifdef _POSIX_MAPPED_FILES
38 # include <sys/mman.h>
39 #endif
40 #include <charmap.h>
41 #include <gconv_int.h>
42 #include "iconv_prog.h"
43 #include "iconvconfig.h"
44
45 /* Get libc version number. */
46 #include "../version.h"
47
48 #define PACKAGE _libc_intl_domainname
49
50
51 /* Name and version of program. */
52 static void print_version (FILE *stream, struct argp_state *state);
53 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
54
55 #define OPT_VERBOSE 1000
56 #define OPT_LIST 'l'
57
58 /* Definitions of arguments for argp functions. */
59 static const struct argp_option options[] =
60 {
61 { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
62 { "from-code", 'f', "NAME", 0, N_("encoding of original text") },
63 { "to-code", 't', "NAME", 0, N_("encoding for output") },
64 { NULL, 0, NULL, 0, N_("Information:") },
65 { "list", 'l', NULL, 0, N_("list all known coded character sets") },
66 { NULL, 0, NULL, 0, N_("Output control:") },
67 { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
68 { "output", 'o', "FILE", 0, N_("output file") },
69 { "silent", 's', NULL, 0, N_("suppress warnings") },
70 { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
71 { NULL, 0, NULL, 0, NULL }
72 };
73
74 /* Short description of program. */
75 static const char doc[] = N_("\
76 Convert encoding of given files from one encoding to another.");
77
78 /* Strings for arguments in help texts. */
79 static const char args_doc[] = N_("[FILE...]");
80
81 /* Prototype for option handler. */
82 static error_t parse_opt (int key, char *arg, struct argp_state *state);
83
84 /* Function to print some extra text in the help message. */
85 static char *more_help (int key, const char *text, void *input);
86
87 /* Data structure to communicate with argp functions. */
88 static struct argp argp =
89 {
90 options, parse_opt, args_doc, doc, NULL, more_help
91 };
92
/* Code sets to convert from and to respectively.  An empty string as the
   default causes the 'iconv_open' function to look up the charset of the
   currently selected locale and use it.  parse_opt replaces these with
   the arguments of the -f and -t options.  */
static const char *from_code = "";
static const char *to_code = "";

/* File to write output to, set by the -o option.  If NULL (or "-") main
   writes to stdout.  */
static const char *output_file;

/* Nonzero if verbose output is wanted.  Not static; presumably also
   referenced from other files of the program (TODO confirm).  */
int verbose;

/* Nonzero if list of all coded character sets is wanted.  */
static int list;

/* If nonzero omit invalid characters from output.  Not static; presumably
   also referenced from other files of the program (TODO confirm).  */
int omit_invalid;

/* Prototypes for the functions doing the actual work.  The process_*
   functions return zero on success and nonzero on failure.  */
static int process_block (iconv_t cd, char *addr, size_t len, FILE *output);
static int process_fd (iconv_t cd, int fd, FILE *output);
static int process_file (iconv_t cd, FILE *input, FILE *output);
static void print_known_names (void) internal_function;
116
117
118 int
119 main (int argc, char *argv[])
120 {
121 int status = EXIT_SUCCESS;
122 int remaining;
123 FILE *output;
124 iconv_t cd;
125 const char *orig_to_code;
126 struct charmap_t *from_charmap = NULL;
127 struct charmap_t *to_charmap = NULL;
128
129 /* Set locale via LC_ALL. */
130 setlocale (LC_ALL, "");
131
132 /* Set the text message domain. */
133 textdomain (_libc_intl_domainname);
134
135 /* Parse and process arguments. */
136 argp_parse (&argp, argc, argv, 0, &remaining, NULL);
137
138 /* List all coded character sets if wanted. */
139 if (list)
140 {
141 print_known_names ();
142 exit (EXIT_SUCCESS);
143 }
144
145 /* If we have to ignore errors make sure we use the appropriate name for
146 the to-character-set. */
147 orig_to_code = to_code;
148 if (omit_invalid)
149 {
150 const char *errhand = strchrnul (to_code, '/');
151 int nslash = 2;
152 char *newp;
153 char *cp;
154
155 if (*errhand == '/')
156 {
157 --nslash;
158 errhand = strchrnul (errhand, '/');
159
160 if (*errhand == '/')
161 {
162 --nslash;
163 ++errhand;
164 }
165 }
166
167 newp = (char *) alloca (errhand - to_code + nslash + 6 + 1);
168 cp = mempcpy (newp, to_code, errhand - to_code);
169 while (nslash-- > 0)
170 *cp++ = '/';
171 memcpy (cp, "IGNORE", sizeof ("IGNORE"));
172
173 to_code = newp;
174 }
175
176 /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
177 can be file names of charmaps. In this case iconv will have to read
178 those charmaps and use them to do the conversion. But there are
179 holes in the specification. There is nothing said that if -f is a
180 charmap filename that -t must be, too. And vice versa. There is
181 also no word about the symbolic names used. What if they don't
182 match? */
183 if (strchr (from_code, '/') != NULL)
184 /* The from-name might be a charmap file name. Try reading the
185 file. */
186 from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0);
187
188 if (strchr (orig_to_code, '/') != NULL)
189 /* The to-name might be a charmap file name. Try reading the
190 file. */
191 to_charmap = charmap_read (orig_to_code, /*0, 1,*/1,0, 0);
192
193
194 /* Determine output file. */
195 if (output_file != NULL && strcmp (output_file, "-") != 0)
196 {
197 output = fopen (output_file, "w");
198 if (output == NULL)
199 error (EXIT_FAILURE, errno, _("cannot open output file"));
200 }
201 else
202 output = stdout;
203
204 /* At this point we have to handle two cases. The first one is
205 where a charmap is used for the from- or to-charset, or both. We
206 handle this special since it is very different from the sane way of
207 doing things. The other case allows converting using the iconv()
208 function. */
209 if (from_charmap != NULL || to_charmap != NULL)
210 /* Construct the conversion table and do the conversion. */
211 status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
212 argc, remaining, argv, output);
213 else
214 {
215 /* Let's see whether we have these coded character sets. */
216 cd = iconv_open (to_code, from_code);
217 if (cd == (iconv_t) -1)
218 {
219 if (errno == EINVAL)
220 {
221 /* Try to be nice with the user and tell her which of the
222 two encoding names is wrong. This is possible because
223 all supported encodings can be converted from/to Unicode,
224 in other words, because the graph of encodings is
225 connected. */
226 bool from_wrong =
227 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
228 && errno == EINVAL);
229 bool to_wrong =
230 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
231 && errno == EINVAL);
232 const char *from_pretty =
233 (from_code[0] ? from_code : nl_langinfo (CODESET));
234 const char *to_pretty =
235 (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
236
237 if (from_wrong)
238 {
239 if (to_wrong)
240 error (EXIT_FAILURE, 0,
241 _("\
242 conversion from `%s' and to `%s' are not supported"),
243 from_pretty, to_pretty);
244 else
245 error (EXIT_FAILURE, 0,
246 _("conversion from `%s' is not supported"),
247 from_pretty);
248 }
249 else
250 {
251 if (to_wrong)
252 error (EXIT_FAILURE, 0,
253 _("conversion to `%s' is not supported"),
254 to_pretty);
255 else
256 error (EXIT_FAILURE, 0,
257 _("conversion from `%s' to `%s' is not supported"),
258 from_pretty, to_pretty);
259 }
260 }
261 else
262 error (EXIT_FAILURE, errno,
263 _("failed to start conversion processing"));
264 }
265
266 /* Now process the remaining files. Write them to stdout or the file
267 specified with the `-o' parameter. If we have no file given as
268 the parameter process all from stdin. */
269 if (remaining == argc)
270 {
271 if (process_file (cd, stdin, output) != 0)
272 status = EXIT_FAILURE;
273 }
274 else
275 do
276 {
277 #ifdef _POSIX_MAPPED_FILES
278 struct stat st;
279 char *addr;
280 #endif
281 int fd;
282
283 if (verbose)
284 printf ("%s:\n", argv[remaining]);
285 if (strcmp (argv[remaining], "-") == 0)
286 fd = 0;
287 else
288 {
289 fd = open (argv[remaining], O_RDONLY);
290
291 if (fd == -1)
292 {
293 error (0, errno, _("cannot open input file `%s'"),
294 argv[remaining]);
295 status = EXIT_FAILURE;
296 continue;
297 }
298 }
299
300 #ifdef _POSIX_MAPPED_FILES
301 /* We have possibilities for reading the input file. First try
302 to mmap() it since this will provide the fastest solution. */
303 if (fstat (fd, &st) == 0
304 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
305 fd, 0)) != MAP_FAILED))
306 {
307 /* Yes, we can use mmap(). The descriptor is not needed
308 anymore. */
309 if (close (fd) != 0)
310 error (EXIT_FAILURE, errno,
311 _("error while closing input `%s'"),
312 argv[remaining]);
313
314 if (process_block (cd, addr, st.st_size, output) < 0)
315 {
316 /* Something went wrong. */
317 status = EXIT_FAILURE;
318
319 /* We don't need the input data anymore. */
320 munmap ((void *) addr, st.st_size);
321
322 /* We cannot go on with producing output since it might
323 lead to problem because the last output might leave
324 the output stream in an undefined state. */
325 break;
326 }
327
328 /* We don't need the input data anymore. */
329 munmap ((void *) addr, st.st_size);
330 }
331 else
332 #endif /* _POSIX_MAPPED_FILES */
333 {
334 /* Read the file in pieces. */
335 if (process_fd (cd, fd, output) != 0)
336 {
337 /* Something went wrong. */
338 status = EXIT_FAILURE;
339
340 /* We don't need the input file anymore. */
341 close (fd);
342
343 /* We cannot go on with producing output since it might
344 lead to problem because the last output might leave
345 the output stream in an undefined state. */
346 break;
347 }
348
349 /* Now close the file. */
350 close (fd);
351 }
352 }
353 while (++remaining < argc);
354 }
355
356 /* Close the output file now. */
357 if (fclose (output))
358 error (EXIT_FAILURE, errno, _("error while closing output file"));
359
360 return status;
361 }
362
363
364 /* Handle program arguments. */
365 static error_t
366 parse_opt (int key, char *arg, struct argp_state *state)
367 {
368 switch (key)
369 {
370 case 'f':
371 from_code = arg;
372 break;
373 case 't':
374 to_code = arg;
375 break;
376 case 'o':
377 output_file = arg;
378 break;
379 case 's':
380 /* Nothing, for now at least. We are not giving out any information
381 about missing character or so. */
382 break;
383 case 'c':
384 /* Omit invalid characters from output. */
385 omit_invalid = 1;
386 break;
387 case OPT_VERBOSE:
388 verbose = 1;
389 break;
390 case OPT_LIST:
391 list = 1;
392 break;
393 default:
394 return ARGP_ERR_UNKNOWN;
395 }
396 return 0;
397 }
398
399
/* Help filter for argp.  For ARGP_KEY_HELP_EXTRA a newly allocated
   (strdup) copy of the translated bug reporting note is returned; for
   every other KEY the TEXT passed in is returned unchanged.  */
static char *
more_help (int key, const char *text, void *input)
{
  if (key != ARGP_KEY_HELP_EXTRA)
    return (char *) text;

  /* We print some extra information.  */
  return strdup (gettext ("\
Report bugs using the `glibcbug' script to <bugs@gnu.org>.\n"));
}
414
415
416 /* Print the version information. */
417 static void
418 print_version (FILE *stream, struct argp_state *state)
419 {
420 fprintf (stream, "iconv (GNU %s) %s\n", PACKAGE, VERSION);
421 fprintf (stream, gettext ("\
422 Copyright (C) %s Free Software Foundation, Inc.\n\
423 This is free software; see the source for copying conditions. There is NO\n\
424 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
425 "), "2003");
426 fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
427 }
428
429
/* Write the LEN converted bytes at BUF to OUTPUT.  Returns 0 on success.
   On failure a message is printed and -1 is returned.  The value errno had
   on entry is restored on success so that the caller can still examine
   the error code left by iconv().  */
static int
write_converted (const char *buf, size_t len, FILE *output)
{
  int errno_save = errno;

  if (fwrite (buf, 1, len, output) < len || ferror (output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0, _("\
conversion stopped due to problem in writing the output"));
      return -1;
    }

  errno = errno_save;
  return 0;
}


/* Convert the LEN bytes starting at ADDR using the descriptor CD and
   write the result to OUTPUT.  The input is converted in pieces which fit
   into a fixed-size output buffer.  Returns 0 on success and -1 if the
   conversion or writing the output failed (a message is printed).  */
static int
process_block (iconv_t cd, char *addr, size_t len, FILE *output)
{
#define OUTBUF_SIZE 32768
  const char *start = addr;
  char outbuf[OUTBUF_SIZE];

  while (len > 0)
    {
      char *outptr = outbuf;
      size_t outlen = OUTBUF_SIZE;
      size_t n = iconv (cd, &addr, &len, &outptr, &outlen);

      /* Write out whatever has been produced, even if iconv() stopped
         with an error.  */
      if (outptr != outbuf
          && write_converted (outbuf, outptr - outbuf, output) != 0)
        return -1;

      if (n != (size_t) -1)
        {
          /* All the input text is processed.  For state-dependent
             character sets we have to flush the state now.  */
          outptr = outbuf;
          outlen = OUTBUF_SIZE;
          (void) iconv (cd, NULL, NULL, &outptr, &outlen);

          if (outptr != outbuf
              && write_converted (outbuf, outptr - outbuf, output) != 0)
            return -1;

          break;
        }

      /* The output buffer was merely too small; go on with the rest of
         the input.  */
      if (errno == E2BIG)
        continue;

      /* iconv() ran into a problem.  */
      switch (errno)
        {
        case EILSEQ:
          error (0, 0, _("illegal input sequence at position %ld"),
                 (long) (addr - start));
          break;
        case EINVAL:
          error (0, 0, _("\
incomplete character or shift sequence at end of buffer"));
          break;
        case EBADF:
          error (0, 0, _("internal error (illegal descriptor)"));
          break;
        default:
          error (0, 0, _("unknown iconv() error %d"), errno);
          break;
        }

      return -1;
    }

  return 0;
}
520
521
/* Read everything available from the descriptor FD, convert it using CD,
   and write the result to OUTPUT.  Returns the result of process_block,
   or -1 if reading the input or allocating the buffer failed (a message
   is printed).  */
static int
process_fd (iconv_t cd, int fd, FILE *output)
{
  /* We have a problem with reading from a descriptor since we must not
     provide the iconv() function an incomplete character or shift
     sequence at the end of the buffer.  Since we have to deal with
     arbitrary encodings we must read the whole text in a buffer and
     process it in one step.

     The buffer is static: it is kept, with its size, across calls and is
     reused for the next file.  */
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  while (1)
    {
      ssize_t n;

      if (actlen == maxlen)
        {
          /* The buffer is full (or does not exist yet).  Increase it.  */
          char *new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
          if (new_inbuf == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = new_inbuf;
          maxlen += 32768;
        }

      /* Always read to the current end of the data in INBUF.  In
         particular a buffer left over from a previous call is filled from
         its start; reading through a pointer which is still NULL would
         fail for every file but the first.  */
      n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
        /* No more text to read.  */
        break;

      if (n == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output);
}
599
600
/* Convert the content of the stream INPUT by handing its descriptor to
   process_fd; returns that function's result.  */
static int
process_file (iconv_t cd, FILE *input, FILE *output)
{
  /* This should be safe since we use this function only for `stdin' and
     we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (cd, fd, output);
}
608
609
/* Print all known character sets/encodings.  The following variables are
   shared by the functions which collect and print the names.  */

/* Root of the tsearch tree in which the names to print are collected.  */
static void *printlist;
/* Output column maintained by do_print_human to decide where to break
   the line.  */
static size_t column;
/* Set by do_print_human after the first name is printed; from then on
   each name is preceded by a separator.  */
static int not_first;
614
615 static void
616 insert_print_list (const void *nodep, VISIT value, int level)
617 {
618 if (value == leaf || value == postorder)
619 {
620 const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
621 tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
622 }
623 }
624
625 static void
626 do_print_human (const void *nodep, VISIT value, int level)
627 {
628 if (value == leaf || value == postorder)
629 {
630 const char *s = *(const char **) nodep;
631 size_t len = strlen (s);
632 size_t cnt;
633
634 while (len > 0 && s[len - 1] == '/')
635 --len;
636
637 for (cnt = 0; cnt < len; ++cnt)
638 if (isalnum (s[cnt]))
639 break;
640 if (cnt == len)
641 return;
642
643 if (not_first)
644 {
645 putchar (',');
646 ++column;
647
648 if (column > 2 && column + len > 77)
649 {
650 fputs ("\n ", stdout);
651 column = 2;
652 }
653 else
654 {
655 putchar (' ');
656 ++column;
657 }
658 }
659 else
660 not_first = 1;
661
662 fwrite (s, len, 1, stdout);
663 column += len;
664 }
665 }
666
/* twalk callback printing each name on a line of its own.  Each node is
   handled once, on its leaf or postorder visit.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  puts (*(const char **) nodep);
}
677
678 static void
679 internal_function
680 add_known_names (struct gconv_module *node)
681 {
682 if (node->left != NULL)
683 add_known_names (node->left);
684 if (node->right != NULL)
685 add_known_names (node->right);
686 do
687 {
688 if (strcmp (node->from_string, "INTERNAL"))
689 tsearch (node->from_string, &printlist,
690 (__compar_fn_t) strverscmp);
691 if (strcmp (node->to_string, "INTERNAL") != 0)
692 tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
693
694 node = node->same;
695 }
696 while (node != NULL);
697 }
698
699
700 static void
701 insert_cache (void)
702 {
703 const struct gconvcache_header *header;
704 const char *strtab;
705 const struct hash_entry *hashtab;
706 size_t cnt;
707
708 header = (const struct gconvcache_header *) __gconv_get_cache ();
709 strtab = (char *) header + header->string_offset;
710 hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
711
712 for (cnt = 0; cnt < header->hash_size; ++cnt)
713 if (hashtab[cnt].string_offset != 0)
714 {
715 const char *str = strtab + hashtab[cnt].string_offset;
716
717 if (strcmp (str, "INTERNAL") != 0)
718 tsearch (str, &printlist, (__compar_fn_t) strverscmp);
719 }
720 }
721
722
723 static void
724 internal_function
725 print_known_names (void)
726 {
727 iconv_t h;
728 void *cache;
729
730 /* We must initialize the internal databases first. */
731 h = iconv_open ("L1", "L1");
732 iconv_close (h);
733
734 /* See whether we have a cache. */
735 cache = __gconv_get_cache ();
736 if (cache != NULL)
737 /* Yep, use only this information. */
738 insert_cache ();
739 else
740 {
741 struct gconv_module *modules;
742
743 /* No, then use the information read from the gconv-modules file.
744 First add the aliases. */
745 twalk (__gconv_get_alias_db (), insert_print_list);
746
747 /* Add the from- and to-names from the known modules. */
748 modules = __gconv_get_modules_db ();
749 if (modules != NULL)
750 add_known_names (modules);
751 }
752
753 fputs (_("\
754 The following list contain all the coded character sets known. This does\n\
755 not necessarily mean that all combinations of these names can be used for\n\
756 the FROM and TO command line parameters. One coded character set can be\n\
757 listed with several different names (aliases).\n\n "), stdout);
758
759 /* Now print the collected names. */
760 column = 2;
761 if (isatty (fileno (stdout)))
762 {
763 twalk (printlist, do_print_human);
764
765 if (column != 0)
766 puts ("");
767 }
768 else
769 twalk (printlist, do_print);
770 }