1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published
8 by the Free Software Foundation; version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <https://www.gnu.org/licenses/>. */
35 #ifdef _POSIX_MAPPED_FILES
36 # include <sys/mman.h>
39 #include <gconv_int.h>
40 #include "iconv_prog.h"
41 #include "iconvconfig.h"
42 #include "gconv_charset.h"
44 /* Get libc version number. */
45 #include "../version.h"
47 #define PACKAGE _libc_intl_domainname
50 /* Name and version of program. */
51 static void print_version (FILE *stream
, struct argp_state
*state
);
52 void (*argp_program_version_hook
) (FILE *, struct argp_state
*) = print_version
;
54 #define OPT_VERBOSE 1000
57 /* Definitions of arguments for argp functions. */
58 static const struct argp_option options
[] =
60 { NULL
, 0, NULL
, 0, N_("Input/Output format specification:") },
61 { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") },
62 { "to-code", 't', N_("NAME"), 0, N_("encoding for output") },
63 { NULL
, 0, NULL
, 0, N_("Information:") },
64 { "list", 'l', NULL
, 0, N_("list all known coded character sets") },
65 { NULL
, 0, NULL
, 0, N_("Output control:") },
66 { NULL
, 'c', NULL
, 0, N_("omit invalid characters from output") },
67 { "output", 'o', N_("FILE"), 0, N_("output file") },
68 { "silent", 's', NULL
, 0, N_("suppress warnings") },
69 { "verbose", OPT_VERBOSE
, NULL
, 0, N_("print progress information") },
70 { NULL
, 0, NULL
, 0, NULL
}
73 /* Short description of program. */
74 static const char doc
[] = N_("\
75 Convert encoding of given files from one encoding to another.");
77 /* Strings for arguments in help texts. */
78 static const char args_doc
[] = N_("[FILE...]");
80 /* Prototype for option handler. */
81 static error_t
parse_opt (int key
, char *arg
, struct argp_state
*state
);
83 /* Function to print some extra text in the help message. */
84 static char *more_help (int key
, const char *text
, void *input
);
86 /* Data structure to communicate with argp functions. */
87 static struct argp argp
=
89 options
, parse_opt
, args_doc
, doc
, NULL
, more_help
92 /* Code sets to convert from and to respectively. An empty string as the
93 default causes the 'iconv_open' function to look up the charset of the
94 currently selected locale and use it. */
95 static const char *from_code
= "";
96 static const char *to_code
= "";
98 /* File to write output to. If NULL write to stdout. */
99 static const char *output_file
;
101 /* Nonzero if list of all coded character sets is wanted. */
104 /* If nonzero, omit invalid characters from output. */
107 /* Prototypes for the functions doing the actual work. */
108 static int process_block (iconv_t cd
, char *addr
, size_t len
, FILE **output
,
109 const char *output_file
);
110 static int process_fd (iconv_t cd
, int fd
, FILE **output
,
111 const char *output_file
);
112 static int process_file (iconv_t cd
, FILE *input
, FILE **output
,
113 const char *output_file
);
114 static void print_known_names (void);
118 main (int argc
, char *argv
[])
120 int status
= EXIT_SUCCESS
;
123 struct charmap_t
*from_charmap
= NULL
;
124 struct charmap_t
*to_charmap
= NULL
;
126 /* Set locale via LC_ALL. */
127 setlocale (LC_ALL
, "");
129 /* Set the text message domain. */
130 textdomain (_libc_intl_domainname
);
132 /* Parse and process arguments. */
133 argp_parse (&argp
, argc
, argv
, 0, &remaining
, NULL
);
135 /* List all coded character sets if wanted. */
138 print_known_names ();
142 /* POSIX 1003.2b introduces a silly thing: the arguments to -t and -f
143 can be file names of charmaps. In this case iconv will have to read
144 those charmaps and use them to do the conversion. But there are
145 holes in the specification. There is nothing said that if -f is a
146 charmap filename that -t must be, too. And vice versa. There is
147 also no word about the symbolic names used. What if they don't
149 if (strchr (from_code
, '/') != NULL
)
150 /* The from-name might be a charmap file name. Try reading the
152 from_charmap
= charmap_read (from_code
, /*0, 1*/1, 0, 0, 0);
154 if (strchr (to_code
, '/') != NULL
)
155 /* The to-name might be a charmap file name. Try reading the
157 to_charmap
= charmap_read (to_code
, /*0, 1,*/1, 0, 0, 0);
160 /* At this point we have to handle two cases. The first one is
161 where a charmap is used for the from- or to-charset, or both. We
162 handle this special since it is very different from the sane way of
163 doing things. The other case allows converting using the iconv()
165 if (from_charmap
!= NULL
|| to_charmap
!= NULL
)
166 /* Construct the conversion table and do the conversion. */
167 status
= charmap_conversion (from_code
, from_charmap
, to_code
, to_charmap
,
168 argc
, remaining
, argv
, output_file
);
171 struct gconv_spec conv_spec
;
174 if (__gconv_create_spec (&conv_spec
, from_code
, to_code
) == NULL
)
176 error (EXIT_FAILURE
, errno
,
177 _("failed to start conversion processing"));
182 conv_spec
.ignore
= true;
184 /* Let's see whether we have these coded character sets. */
185 res
= __gconv_open (&conv_spec
, &cd
, 0);
187 __gconv_destroy_spec (&conv_spec
);
189 if (res
!= __GCONV_OK
)
193 /* Try to be nice with the user and tell her which of the
194 two encoding names is wrong. This is possible because
195 all supported encodings can be converted from/to Unicode,
196 in other words, because the graph of encodings is
199 (iconv_open ("UTF-8", from_code
) == (iconv_t
) -1
202 (iconv_open (to_code
, "UTF-8") == (iconv_t
) -1
204 const char *from_pretty
=
205 (from_code
[0] ? from_code
: nl_langinfo (CODESET
));
206 const char *to_pretty
=
207 (to_code
[0] ? to_code
: nl_langinfo (CODESET
));
214 conversions from `%s' and to `%s' are not supported"),
215 from_pretty
, to_pretty
);
218 _("conversion from `%s' is not supported"),
225 _("conversion to `%s' is not supported"),
229 _("conversion from `%s' to `%s' is not supported"),
230 from_pretty
, to_pretty
);
233 argp_help (&argp
, stderr
, ARGP_HELP_SEE
,
234 program_invocation_short_name
);
238 error (EXIT_FAILURE
, errno
,
239 _("failed to start conversion processing"));
242 /* The output file. Will be opened when we are ready to produce
246 /* Now process the remaining files. Write them to stdout or the file
247 specified with the `-o' parameter. If we have no file given as
248 the parameter process all from stdin. */
249 if (remaining
== argc
)
251 if (process_file (cd
, stdin
, &output
, output_file
) != 0)
252 status
= EXIT_FAILURE
;
257 #ifdef _POSIX_MAPPED_FILES
264 fprintf (stderr
, "%s:\n", argv
[remaining
]);
265 if (strcmp (argv
[remaining
], "-") == 0)
269 fd
= open (argv
[remaining
], O_RDONLY
);
273 error (0, errno
, _("cannot open input file `%s'"),
275 status
= EXIT_FAILURE
;
280 #ifdef _POSIX_MAPPED_FILES
281 /* We have possibilities for reading the input file. First try
282 to mmap() it since this will provide the fastest solution. */
283 if (fstat64 (fd
, &st
) == 0
284 && ((addr
= mmap (NULL
, st
.st_size
, PROT_READ
, MAP_PRIVATE
,
285 fd
, 0)) != MAP_FAILED
))
287 /* Yes, we can use mmap(). The descriptor is not needed
290 error (EXIT_FAILURE
, errno
,
291 _("error while closing input `%s'"),
294 ret
= process_block (cd
, addr
, st
.st_size
, &output
,
297 /* We don't need the input data anymore. */
298 munmap ((void *) addr
, st
.st_size
);
302 status
= EXIT_FAILURE
;
305 /* We cannot go on with producing output since it might
306 lead to problems because the last output might leave
307 the output stream in an undefined state. */
312 #endif /* _POSIX_MAPPED_FILES */
314 /* Read the file in pieces. */
315 ret
= process_fd (cd
, fd
, &output
, output_file
);
317 /* Now close the file. */
322 /* Something went wrong. */
323 status
= EXIT_FAILURE
;
326 /* We cannot go on with producing output since it might
327 lead to problems because the last output might leave
328 the output stream in an undefined state. */
333 while (++remaining
< argc
);
335 /* Close the output file now. */
336 if (output
!= NULL
&& fclose (output
))
337 error (EXIT_FAILURE
, errno
, _("error while closing output file"));
344 /* Handle program arguments. */
346 parse_opt (int key
, char *arg
, struct argp_state
*state
)
360 /* Nothing, for now at least. We are not giving out any information
361 about missing characters or so. */
364 /* Omit invalid characters from output. */
374 return ARGP_ERR_UNKNOWN
;
381 more_help (int key
, const char *text
, void *input
)
386 case ARGP_KEY_HELP_EXTRA
:
387 /* We print some extra information. */
388 if (asprintf (&tp
, gettext ("\
389 For bug reporting instructions, please see:\n\
390 %s.\n"), REPORT_BUGS_TO
) < 0)
396 return (char *) text
;
400 /* Print the version information. */
402 print_version (FILE *stream
, struct argp_state
*state
)
404 fprintf (stream
, "iconv %s%s\n", PKGVERSION
, VERSION
);
405 fprintf (stream
, gettext ("\
406 Copyright (C) %s Free Software Foundation, Inc.\n\
407 This is free software; see the source for copying conditions. There is NO\n\
408 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
410 fprintf (stream
, gettext ("Written by %s.\n"), "Ulrich Drepper");
415 write_output (const char *outbuf
, const char *outptr
, FILE **output
,
416 const char *output_file
)
418 /* We have something to write out. */
419 int errno_save
= errno
;
423 /* Determine output file. */
424 if (output_file
!= NULL
&& strcmp (output_file
, "-") != 0)
426 *output
= fopen (output_file
, "w");
428 error (EXIT_FAILURE
, errno
, _("cannot open output file"));
434 if (fwrite (outbuf
, 1, outptr
- outbuf
, *output
) < (size_t) (outptr
- outbuf
)
437 /* Error occurred while printing the result. */
439 conversion stopped due to problem in writing the output"));
450 process_block (iconv_t cd
, char *addr
, size_t len
, FILE **output
,
451 const char *output_file
)
453 #define OUTBUF_SIZE 32768
454 const char *start
= addr
;
455 char outbuf
[OUTBUF_SIZE
];
464 outlen
= OUTBUF_SIZE
;
465 n
= iconv (cd
, &addr
, &len
, &outptr
, &outlen
);
467 if (n
== (size_t) -1 && omit_invalid
&& errno
== EILSEQ
)
476 if (outptr
!= outbuf
)
478 ret
= write_output (outbuf
, outptr
, output
, output_file
);
483 if (n
!= (size_t) -1)
485 /* All the input text is processed. For state-dependent
486 character sets we have to flush the state now. */
488 outlen
= OUTBUF_SIZE
;
489 n
= iconv (cd
, NULL
, NULL
, &outptr
, &outlen
);
491 if (outptr
!= outbuf
)
493 ret
= write_output (outbuf
, outptr
, output
, output_file
);
498 if (n
!= (size_t) -1)
501 if (omit_invalid
&& errno
== EILSEQ
)
510 /* iconv() ran into a problem. */
515 error (0, 0, _("illegal input sequence at position %ld"),
516 (long int) (addr
- start
));
520 incomplete character or shift sequence at end of buffer"));
523 error (0, 0, _("internal error (illegal descriptor)"));
526 error (0, 0, _("unknown iconv() error %d"), errno
);
539 process_fd (iconv_t cd
, int fd
, FILE **output
, const char *output_file
)
541 /* We have a problem with reading from a descriptor since we must not
542 provide the iconv() function an incomplete character or shift
543 sequence at the end of the buffer. Since we have to deal with
544 arbitrary encodings we must read the whole text in a buffer and
545 process it in one step. */
546 static char *inbuf
= NULL
;
547 static size_t maxlen
= 0;
551 while (actlen
< maxlen
)
553 ssize_t n
= read (fd
, inptr
, maxlen
- actlen
);
556 /* No more text to read. */
561 /* Error while reading. */
562 error (0, errno
, _("error while reading the input"));
570 if (actlen
== maxlen
)
576 /* Increase the buffer. */
577 new_inbuf
= (char *) realloc (inbuf
, maxlen
+ 32768);
578 if (new_inbuf
== NULL
)
580 error (0, errno
, _("unable to allocate buffer for input"));
585 inptr
= inbuf
+ actlen
;
589 n
= read (fd
, inptr
, maxlen
- actlen
);
592 /* No more text to read. */
597 /* Error while reading. */
598 error (0, errno
, _("error while reading the input"));
605 while (actlen
< maxlen
);
608 /* Break again so we leave both loops. */
612 /* Now we have all the input in the buffer. Process it in one run. */
613 return process_block (cd
, inbuf
, actlen
, output
, output_file
);
618 process_file (iconv_t cd
, FILE *input
, FILE **output
, const char *output_file
)
620 /* This should be safe since we use this function only for `stdin' and
621 we haven't read anything so far. */
622 return process_fd (cd
, fileno (input
), output
, output_file
);
626 /* Print all known character sets/encodings. */
627 static void *printlist
;
628 static size_t column
;
629 static int not_first
;
632 insert_print_list (const void *nodep
, VISIT value
, int level
)
634 if (value
== leaf
|| value
== postorder
)
636 const struct gconv_alias
*s
= *(const struct gconv_alias
**) nodep
;
637 tsearch (s
->fromname
, &printlist
, (__compar_fn_t
) strverscmp
);
642 do_print_human (const void *nodep
, VISIT value
, int level
)
644 if (value
== leaf
|| value
== postorder
)
646 const char *s
= *(const char **) nodep
;
647 size_t len
= strlen (s
);
650 while (len
> 0 && s
[len
- 1] == '/')
653 for (cnt
= 0; cnt
< len
; ++cnt
)
654 if (isalnum (s
[cnt
]))
664 if (column
> 2 && column
+ len
> 77)
666 fputs ("\n ", stdout
);
678 fwrite (s
, len
, 1, stdout
);
684 do_print (const void *nodep
, VISIT value
, int level
)
686 if (value
== leaf
|| value
== postorder
)
688 const char *s
= *(const char **) nodep
;
695 add_known_names (struct gconv_module
*node
)
697 if (node
->left
!= NULL
)
698 add_known_names (node
->left
);
699 if (node
->right
!= NULL
)
700 add_known_names (node
->right
);
703 if (strcmp (node
->from_string
, "INTERNAL") != 0)
704 tsearch (node
->from_string
, &printlist
, (__compar_fn_t
) strverscmp
);
705 if (strcmp (node
->to_string
, "INTERNAL") != 0)
706 tsearch (node
->to_string
, &printlist
, (__compar_fn_t
) strverscmp
);
710 while (node
!= NULL
);
717 const struct gconvcache_header
*header
;
719 const struct hash_entry
*hashtab
;
722 header
= (const struct gconvcache_header
*) __gconv_get_cache ();
723 strtab
= (char *) header
+ header
->string_offset
;
724 hashtab
= (struct hash_entry
*) ((char *) header
+ header
->hash_offset
);
726 for (cnt
= 0; cnt
< header
->hash_size
; ++cnt
)
727 if (hashtab
[cnt
].string_offset
!= 0)
729 const char *str
= strtab
+ hashtab
[cnt
].string_offset
;
731 if (strcmp (str
, "INTERNAL") != 0)
732 tsearch (str
, &printlist
, (__compar_fn_t
) strverscmp
);
738 print_known_names (void)
743 /* We must initialize the internal databases first. */
744 h
= iconv_open ("L1", "L1");
747 /* See whether we have a cache. */
748 cache
= __gconv_get_cache ();
750 /* Yep, use only this information. */
754 struct gconv_module
*modules
;
756 /* No, then use the information read from the gconv-modules file.
757 First add the aliases. */
758 twalk (__gconv_get_alias_db (), insert_print_list
);
760 /* Add the from- and to-names from the known modules. */
761 modules
= __gconv_get_modules_db ();
763 add_known_names (modules
);
766 bool human_readable
= isatty (fileno (stdout
));
770 The following list contains all the coded character sets known. This does\n\
771 not necessarily mean that all combinations of these names can be used for\n\
772 the FROM and TO command line parameters. One coded character set can be\n\
773 listed with several different names (aliases).\n\n "), stdout
);
775 /* Now print the collected names. */
777 twalk (printlist
, human_readable
? do_print_human
: do_print
);
779 if (human_readable
&& column
!= 0)