]> git.ipfire.org Git - thirdparty/glibc.git/blob - iconv/iconv_prog.c
intl: Handle translation output codesets with suffixes [BZ #26383]
[thirdparty/glibc.git] / iconv / iconv_prog.c
1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published
8 by the Free Software Foundation; version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <https://www.gnu.org/licenses/>. */
18
19 #include <argp.h>
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <error.h>
24 #include <fcntl.h>
25 #include <iconv.h>
26 #include <langinfo.h>
27 #include <locale.h>
28 #include <search.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include <libintl.h>
35 #ifdef _POSIX_MAPPED_FILES
36 # include <sys/mman.h>
37 #endif
38 #include <charmap.h>
39 #include <gconv_int.h>
40 #include "iconv_prog.h"
41 #include "iconvconfig.h"
42 #include "gconv_charset.h"
43
44 /* Get libc version number. */
45 #include "../version.h"
46
/* PACKAGE is an alias for the libc message domain name.  */
#define PACKAGE _libc_intl_domainname


/* Let argp print name and version through our own routine, which is
   defined further down in this file.  */
static void print_version (FILE *stream, struct argp_state *state);
void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;

/* Option keys.  OPT_VERBOSE has no short option character; OPT_LIST is
   the plain `-l' switch.  */
#define OPT_VERBOSE	1000
#define OPT_LIST	'l'
56
57 /* Definitions of arguments for argp functions. */
58 static const struct argp_option options[] =
59 {
60 { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
61 { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") },
62 { "to-code", 't', N_("NAME"), 0, N_("encoding for output") },
63 { NULL, 0, NULL, 0, N_("Information:") },
64 { "list", 'l', NULL, 0, N_("list all known coded character sets") },
65 { NULL, 0, NULL, 0, N_("Output control:") },
66 { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
67 { "output", 'o', N_("FILE"), 0, N_("output file") },
68 { "silent", 's', NULL, 0, N_("suppress warnings") },
69 { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
70 { NULL, 0, NULL, 0, NULL }
71 };
72
73 /* Short description of program. */
74 static const char doc[] = N_("\
75 Convert encoding of given files from one encoding to another.");
76
77 /* Strings for arguments in help texts. */
78 static const char args_doc[] = N_("[FILE...]");
79
80 /* Prototype for option handler. */
81 static error_t parse_opt (int key, char *arg, struct argp_state *state);
82
83 /* Function to print some extra text in the help message. */
84 static char *more_help (int key, const char *text, void *input);
85
86 /* Data structure to communicate with argp functions. */
87 static struct argp argp =
88 {
89 options, parse_opt, args_doc, doc, NULL, more_help
90 };
91
/* Names of the source and destination code sets (set by -f and -t).  The
   default is the empty string, which makes 'iconv_open' look up and use
   the charset of the currently selected locale.  */
static const char *from_code = "";
static const char *to_code = "";

/* Name of the output file (set by -o).  NULL means write to stdout.  */
static const char *output_file;

/* Set to nonzero by -l: print all known coded character sets.  */
static int list;

/* Set to nonzero by -c: leave invalid characters out of the output.  */
int omit_invalid;
106
/* Forward declarations of the worker functions defined below.  */
static void print_known_names (void);
static int process_file (iconv_t cd, FILE *input, FILE **output,
			 const char *output_file);
static int process_fd (iconv_t cd, int fd, FILE **output,
		       const char *output_file);
static int process_block (iconv_t cd, char *addr, size_t len, FILE **output,
			  const char *output_file);
115
116
117 int
118 main (int argc, char *argv[])
119 {
120 int status = EXIT_SUCCESS;
121 int remaining;
122 __gconv_t cd;
123 struct charmap_t *from_charmap = NULL;
124 struct charmap_t *to_charmap = NULL;
125
126 /* Set locale via LC_ALL. */
127 setlocale (LC_ALL, "");
128
129 /* Set the text message domain. */
130 textdomain (_libc_intl_domainname);
131
132 /* Parse and process arguments. */
133 argp_parse (&argp, argc, argv, 0, &remaining, NULL);
134
135 /* List all coded character sets if wanted. */
136 if (list)
137 {
138 print_known_names ();
139 exit (EXIT_SUCCESS);
140 }
141
142 /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
143 can be file names of charmaps. In this case iconv will have to read
144 those charmaps and use them to do the conversion. But there are
145 holes in the specification. There is nothing said that if -f is a
146 charmap filename that -t must be, too. And vice versa. There is
147 also no word about the symbolic names used. What if they don't
148 match? */
149 if (strchr (from_code, '/') != NULL)
150 /* The from-name might be a charmap file name. Try reading the
151 file. */
152 from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
153
154 if (strchr (to_code, '/') != NULL)
155 /* The to-name might be a charmap file name. Try reading the
156 file. */
157 to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
158
159
160 /* At this point we have to handle two cases. The first one is
161 where a charmap is used for the from- or to-charset, or both. We
162 handle this special since it is very different from the sane way of
163 doing things. The other case allows converting using the iconv()
164 function. */
165 if (from_charmap != NULL || to_charmap != NULL)
166 /* Construct the conversion table and do the conversion. */
167 status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
168 argc, remaining, argv, output_file);
169 else
170 {
171 struct gconv_spec conv_spec;
172 int res;
173
174 if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
175 {
176 error (EXIT_FAILURE, errno,
177 _("failed to start conversion processing"));
178 exit (1);
179 }
180
181 if (omit_invalid)
182 conv_spec.ignore = true;
183
184 /* Let's see whether we have these coded character sets. */
185 res = __gconv_open (&conv_spec, &cd, 0);
186
187 __gconv_destroy_spec (&conv_spec);
188
189 if (res != __GCONV_OK)
190 {
191 if (errno == EINVAL)
192 {
193 /* Try to be nice with the user and tell her which of the
194 two encoding names is wrong. This is possible because
195 all supported encodings can be converted from/to Unicode,
196 in other words, because the graph of encodings is
197 connected. */
198 bool from_wrong =
199 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
200 && errno == EINVAL);
201 bool to_wrong =
202 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
203 && errno == EINVAL);
204 const char *from_pretty =
205 (from_code[0] ? from_code : nl_langinfo (CODESET));
206 const char *to_pretty =
207 (to_code[0] ? to_code : nl_langinfo (CODESET));
208
209 if (from_wrong)
210 {
211 if (to_wrong)
212 error (0, 0,
213 _("\
214 conversions from `%s' and to `%s' are not supported"),
215 from_pretty, to_pretty);
216 else
217 error (0, 0,
218 _("conversion from `%s' is not supported"),
219 from_pretty);
220 }
221 else
222 {
223 if (to_wrong)
224 error (0, 0,
225 _("conversion to `%s' is not supported"),
226 to_pretty);
227 else
228 error (0, 0,
229 _("conversion from `%s' to `%s' is not supported"),
230 from_pretty, to_pretty);
231 }
232
233 argp_help (&argp, stderr, ARGP_HELP_SEE,
234 program_invocation_short_name);
235 exit (1);
236 }
237 else
238 error (EXIT_FAILURE, errno,
239 _("failed to start conversion processing"));
240 }
241
242 /* The output file. Will be opened when we are ready to produce
243 output. */
244 FILE *output = NULL;
245
246 /* Now process the remaining files. Write them to stdout or the file
247 specified with the `-o' parameter. If we have no file given as
248 the parameter process all from stdin. */
249 if (remaining == argc)
250 {
251 if (process_file (cd, stdin, &output, output_file) != 0)
252 status = EXIT_FAILURE;
253 }
254 else
255 do
256 {
257 #ifdef _POSIX_MAPPED_FILES
258 struct stat64 st;
259 char *addr;
260 #endif
261 int fd, ret;
262
263 if (verbose)
264 fprintf (stderr, "%s:\n", argv[remaining]);
265 if (strcmp (argv[remaining], "-") == 0)
266 fd = 0;
267 else
268 {
269 fd = open (argv[remaining], O_RDONLY);
270
271 if (fd == -1)
272 {
273 error (0, errno, _("cannot open input file `%s'"),
274 argv[remaining]);
275 status = EXIT_FAILURE;
276 continue;
277 }
278 }
279
280 #ifdef _POSIX_MAPPED_FILES
281 /* We have possibilities for reading the input file. First try
282 to mmap() it since this will provide the fastest solution. */
283 if (fstat64 (fd, &st) == 0
284 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
285 fd, 0)) != MAP_FAILED))
286 {
287 /* Yes, we can use mmap(). The descriptor is not needed
288 anymore. */
289 if (close (fd) != 0)
290 error (EXIT_FAILURE, errno,
291 _("error while closing input `%s'"),
292 argv[remaining]);
293
294 ret = process_block (cd, addr, st.st_size, &output,
295 output_file);
296
297 /* We don't need the input data anymore. */
298 munmap ((void *) addr, st.st_size);
299
300 if (ret != 0)
301 {
302 status = EXIT_FAILURE;
303
304 if (ret < 0)
305 /* We cannot go on with producing output since it might
306 lead to problem because the last output might leave
307 the output stream in an undefined state. */
308 break;
309 }
310 }
311 else
312 #endif /* _POSIX_MAPPED_FILES */
313 {
314 /* Read the file in pieces. */
315 ret = process_fd (cd, fd, &output, output_file);
316
317 /* Now close the file. */
318 close (fd);
319
320 if (ret != 0)
321 {
322 /* Something went wrong. */
323 status = EXIT_FAILURE;
324
325 if (ret < 0)
326 /* We cannot go on with producing output since it might
327 lead to problem because the last output might leave
328 the output stream in an undefined state. */
329 break;
330 }
331 }
332 }
333 while (++remaining < argc);
334
335 /* Close the output file now. */
336 if (output != NULL && fclose (output))
337 error (EXIT_FAILURE, errno, _("error while closing output file"));
338 }
339
340 return status;
341 }
342
343
344 /* Handle program arguments. */
345 static error_t
346 parse_opt (int key, char *arg, struct argp_state *state)
347 {
348 switch (key)
349 {
350 case 'f':
351 from_code = arg;
352 break;
353 case 't':
354 to_code = arg;
355 break;
356 case 'o':
357 output_file = arg;
358 break;
359 case 's':
360 /* Nothing, for now at least. We are not giving out any information
361 about missing character or so. */
362 break;
363 case 'c':
364 /* Omit invalid characters from output. */
365 omit_invalid = 1;
366 break;
367 case OPT_VERBOSE:
368 verbose = 1;
369 break;
370 case OPT_LIST:
371 list = 1;
372 break;
373 default:
374 return ARGP_ERR_UNKNOWN;
375 }
376 return 0;
377 }
378
379
380 static char *
381 more_help (int key, const char *text, void *input)
382 {
383 char *tp = NULL;
384 switch (key)
385 {
386 case ARGP_KEY_HELP_EXTRA:
387 /* We print some extra information. */
388 if (asprintf (&tp, gettext ("\
389 For bug reporting instructions, please see:\n\
390 %s.\n"), REPORT_BUGS_TO) < 0)
391 return NULL;
392 return tp;
393 default:
394 break;
395 }
396 return (char *) text;
397 }
398
399
400 /* Print the version information. */
401 static void
402 print_version (FILE *stream, struct argp_state *state)
403 {
404 fprintf (stream, "iconv %s%s\n", PKGVERSION, VERSION);
405 fprintf (stream, gettext ("\
406 Copyright (C) %s Free Software Foundation, Inc.\n\
407 This is free software; see the source for copying conditions. There is NO\n\
408 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
409 "), "2020");
410 fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
411 }
412
413
/* Write the converted bytes in [OUTBUF, OUTPTR) to *OUTPUT.

   If *OUTPUT is still NULL the stream is set up first: OUTPUT_FILE is
   opened for writing, unless it is NULL or "-", in which case stdout is
   used.  Failure to open the file terminates the program.

   Returns 0 on success, with errno restored to the value it had on
   entry, and -1 after printing a diagnostic if writing failed.  */
static int
write_output (const char *outbuf, const char *outptr, FILE **output,
	      const char *output_file)
{
  const int saved_errno = errno;
  const size_t nbytes = outptr - outbuf;

  if (*output == NULL)
    {
      /* Determine output file.  */
      bool use_stdout = output_file == NULL || strcmp (output_file, "-") == 0;

      if (use_stdout)
	*output = stdout;
      else
	{
	  *output = fopen (output_file, "w");
	  if (*output == NULL)
	    error (EXIT_FAILURE, errno, _("cannot open output file"));
	}
    }

  if (fwrite (outbuf, 1, nbytes, *output) < nbytes || ferror (*output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0,
	     _("conversion stopped due to problem in writing the output"));
      return -1;
    }

  errno = saved_errno;
  return 0;
}
447
448
449 static int
450 process_block (iconv_t cd, char *addr, size_t len, FILE **output,
451 const char *output_file)
452 {
453 #define OUTBUF_SIZE 32768
454 const char *start = addr;
455 char outbuf[OUTBUF_SIZE];
456 char *outptr;
457 size_t outlen;
458 size_t n;
459 int ret = 0;
460
461 while (len > 0)
462 {
463 outptr = outbuf;
464 outlen = OUTBUF_SIZE;
465 n = iconv (cd, &addr, &len, &outptr, &outlen);
466
467 if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
468 {
469 ret = 1;
470 if (len == 0)
471 n = 0;
472 else
473 errno = E2BIG;
474 }
475
476 if (outptr != outbuf)
477 {
478 ret = write_output (outbuf, outptr, output, output_file);
479 if (ret != 0)
480 break;
481 }
482
483 if (n != (size_t) -1)
484 {
485 /* All the input test is processed. For state-dependent
486 character sets we have to flush the state now. */
487 outptr = outbuf;
488 outlen = OUTBUF_SIZE;
489 n = iconv (cd, NULL, NULL, &outptr, &outlen);
490
491 if (outptr != outbuf)
492 {
493 ret = write_output (outbuf, outptr, output, output_file);
494 if (ret != 0)
495 break;
496 }
497
498 if (n != (size_t) -1)
499 break;
500
501 if (omit_invalid && errno == EILSEQ)
502 {
503 ret = 1;
504 break;
505 }
506 }
507
508 if (errno != E2BIG)
509 {
510 /* iconv() ran into a problem. */
511 switch (errno)
512 {
513 case EILSEQ:
514 if (! omit_invalid)
515 error (0, 0, _("illegal input sequence at position %ld"),
516 (long int) (addr - start));
517 break;
518 case EINVAL:
519 error (0, 0, _("\
520 incomplete character or shift sequence at end of buffer"));
521 break;
522 case EBADF:
523 error (0, 0, _("internal error (illegal descriptor)"));
524 break;
525 default:
526 error (0, 0, _("unknown iconv() error %d"), errno);
527 break;
528 }
529
530 return -1;
531 }
532 }
533
534 return ret;
535 }
536
537
/* Read everything available from the descriptor FD and convert it with
   the conversion descriptor CD by handing the data to process_block.

   Returns -1 after a diagnostic if reading or allocating memory failed,
   otherwise the result of process_block.  */
static int
process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
{
  /* We have a problem with reading from a descriptor since we must not
     provide the iconv() function an incomplete character or shift
     sequence at the end of the buffer.  Since we have to deal with
     arbitrary encodings we must read the whole text in a buffer and
     process it in one step.

     The buffer is static and therefore reused by later calls.  Every
     read goes to INBUF + ACTLEN, so a call which finds an already
     allocated buffer fills it from the start.  */
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  while (1)
    {
      if (actlen == maxlen)
	{
	  /* The buffer is full (or not yet allocated).  Increase it.  */
	  char *new_inbuf = realloc (inbuf, maxlen + 32768);
	  if (new_inbuf == NULL)
	    {
	      error (0, errno, _("unable to allocate buffer for input"));
	      return -1;
	    }
	  inbuf = new_inbuf;
	  maxlen += 32768;
	}

      ssize_t n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
	/* No more text to read.  */
	break;

      if (n == -1)
	{
	  /* Error while reading.  */
	  error (0, errno, _("error while reading the input"));
	  return -1;
	}

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output, output_file);
}
615
616
/* Convert the contents of the stream INPUT by reading directly from its
   underlying descriptor.  Returns what process_fd returns.  */
static int
process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file)
{
  /* Bypassing the stream should be safe since we use this function only
     for `stdin' and we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (cd, fd, output, output_file);
}
624
625
626 /* Print all known character sets/encodings. */
627 static void *printlist;
628 static size_t column;
629 static int not_first;
630
631 static void
632 insert_print_list (const void *nodep, VISIT value, int level)
633 {
634 if (value == leaf || value == postorder)
635 {
636 const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
637 tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
638 }
639 }
640
641 static void
642 do_print_human (const void *nodep, VISIT value, int level)
643 {
644 if (value == leaf || value == postorder)
645 {
646 const char *s = *(const char **) nodep;
647 size_t len = strlen (s);
648 size_t cnt;
649
650 while (len > 0 && s[len - 1] == '/')
651 --len;
652
653 for (cnt = 0; cnt < len; ++cnt)
654 if (isalnum (s[cnt]))
655 break;
656 if (cnt == len)
657 return;
658
659 if (not_first)
660 {
661 putchar (',');
662 ++column;
663
664 if (column > 2 && column + len > 77)
665 {
666 fputs ("\n ", stdout);
667 column = 2;
668 }
669 else
670 {
671 putchar (' ');
672 ++column;
673 }
674 }
675 else
676 not_first = 1;
677
678 fwrite (s, len, 1, stdout);
679 column += len;
680 }
681 }
682
/* twalk callback which prints the visited name on a line of its own.
   Each node is handled on exactly one of its visits (leaf or postorder).
   LEVEL is unused.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  const char *name = *(const char **) nodep;
  puts (name);
}
693
694 static void
695 add_known_names (struct gconv_module *node)
696 {
697 if (node->left != NULL)
698 add_known_names (node->left);
699 if (node->right != NULL)
700 add_known_names (node->right);
701 do
702 {
703 if (strcmp (node->from_string, "INTERNAL") != 0)
704 tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp);
705 if (strcmp (node->to_string, "INTERNAL") != 0)
706 tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
707
708 node = node->same;
709 }
710 while (node != NULL);
711 }
712
713
714 static void
715 insert_cache (void)
716 {
717 const struct gconvcache_header *header;
718 const char *strtab;
719 const struct hash_entry *hashtab;
720 size_t cnt;
721
722 header = (const struct gconvcache_header *) __gconv_get_cache ();
723 strtab = (char *) header + header->string_offset;
724 hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
725
726 for (cnt = 0; cnt < header->hash_size; ++cnt)
727 if (hashtab[cnt].string_offset != 0)
728 {
729 const char *str = strtab + hashtab[cnt].string_offset;
730
731 if (strcmp (str, "INTERNAL") != 0)
732 tsearch (str, &printlist, (__compar_fn_t) strverscmp);
733 }
734 }
735
736
737 static void
738 print_known_names (void)
739 {
740 iconv_t h;
741 void *cache;
742
743 /* We must initialize the internal databases first. */
744 h = iconv_open ("L1", "L1");
745 iconv_close (h);
746
747 /* See whether we have a cache. */
748 cache = __gconv_get_cache ();
749 if (cache != NULL)
750 /* Yep, use only this information. */
751 insert_cache ();
752 else
753 {
754 struct gconv_module *modules;
755
756 /* No, then use the information read from the gconv-modules file.
757 First add the aliases. */
758 twalk (__gconv_get_alias_db (), insert_print_list);
759
760 /* Add the from- and to-names from the known modules. */
761 modules = __gconv_get_modules_db ();
762 if (modules != NULL)
763 add_known_names (modules);
764 }
765
766 bool human_readable = isatty (fileno (stdout));
767
768 if (human_readable)
769 fputs (_("\
770 The following list contains all the coded character sets known. This does\n\
771 not necessarily mean that all combinations of these names can be used for\n\
772 the FROM and TO command line parameters. One coded character set can be\n\
773 listed with several different names (aliases).\n\n "), stdout);
774
775 /* Now print the collected names. */
776 column = 2;
777 twalk (printlist, human_readable ? do_print_human : do_print);
778
779 if (human_readable && column != 0)
780 puts ("");
781 }