]>
Commit | Line | Data |
---|---|---|
fb5663ca UD |
1 | /* Convert text in given files from the specified from-set to the to-set. |
2 | Copyright (C) 1998 Free Software Foundation, Inc. | |
3 | This file is part of the GNU C Library. | |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Library General Public License as | |
8 | published by the Free Software Foundation; either version 2 of the | |
9 | License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Library General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Library General Public | |
17 | License along with the GNU C Library; see the file COPYING.LIB. If not, | |
18 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
21 | #include <argp.h> | |
8fe0fd03 | 22 | #include <ctype.h> |
fb5663ca UD |
23 | #include <errno.h> |
24 | #include <error.h> | |
25 | #include <fcntl.h> | |
26 | #include <iconv.h> | |
27 | #include <locale.h> | |
8fe0fd03 | 28 | #include <search.h> |
fb5663ca UD |
29 | #include <stdio.h> |
30 | #include <stdlib.h> | |
31 | #include <string.h> | |
32 | #include <unistd.h> | |
33 | #include <sys/mman.h> | |
e62c19f1 | 34 | #include <gconv_int.h> |
fb5663ca UD |
35 | |
36 | /* Get libc version number. */ | |
37 | #include "../version.h" | |
38 | ||
39 | #define PACKAGE _libc_intl_domainname | |
40 | ||
41 | ||
42 | /* Name and version of program. */ | |
43 | static void print_version (FILE *stream, struct argp_state *state); | |
44 | void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version; | |
45 | ||
46 | #define OPT_VERBOSE 1000 | |
8fe0fd03 | 47 | #define OPT_LIST 1001 |
fb5663ca UD |
48 | |
49 | /* Definitions of arguments for argp functions. */ | |
50 | static const struct argp_option options[] = | |
51 | { | |
52 | { NULL, 0, NULL, 0, N_("Input/Output format specification:") }, | |
53 | { "from-code", 'f', "NAME", 0, N_("encoding of original text") }, | |
54 | { "to-code", 't', "NAME", 0, N_("encoding for output") }, | |
8fe0fd03 UD |
55 | { NULL, 0, NULL, 0, N_("Information:") }, |
56 | { "list", OPT_LIST, NULL, 0, N_("list all known coded character sets") }, | |
fb5663ca UD |
57 | { NULL, 0, NULL, 0, N_("Output control:") }, |
58 | { "output", 'o', "FILE", 0, N_("output file") }, | |
59 | { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") }, | |
60 | { NULL, 0, NULL, 0, NULL } | |
61 | }; | |
62 | ||
63 | /* Short description of program. */ | |
64 | static const char doc[] = N_("\ | |
65 | Convert encoding of given files from one encoding to another."); | |
66 | ||
67 | /* Strings for arguments in help texts. */ | |
68 | static const char args_doc[] = N_("[FILE...]"); | |
69 | ||
70 | /* Prototype for option handler. */ | |
71 | static error_t parse_opt __P ((int key, char *arg, struct argp_state *state)); | |
72 | ||
73 | /* Function to print some extra text in the help message. */ | |
74 | static char *more_help __P ((int key, const char *text, void *input)); | |
75 | ||
76 | /* Data structure to communicate with argp functions. */ | |
77 | static struct argp argp = | |
78 | { | |
79 | options, parse_opt, args_doc, doc, NULL, more_help | |
80 | }; | |
81 | ||
82 | /* Code sets to convert from and to respectively. */ | |
83 | static const char *from_code; | |
84 | static const char *to_code; | |
85 | ||
86 | /* File to write output to. If NULL write to stdout. */ | |
87 | static const char *output_file; | |
88 | ||
89 | /* Nonzero if verbose ouput is wanted. */ | |
90 | static int verbose; | |
91 | ||
8fe0fd03 UD |
92 | /* Nonzero if list of all coded character sets is wanted. */ |
93 | static int list; | |
94 | ||
fb5663ca UD |
95 | /* Prototypes for the functions doing the actual work. */ |
96 | static int process_block (iconv_t cd, const char *addr, size_t len, | |
97 | FILE *output); | |
98 | static int process_fd (iconv_t cd, int fd, FILE *output); | |
99 | static int process_file (iconv_t cd, FILE *input, FILE *output); | |
8fe0fd03 | 100 | static void print_known_names (void); |
fb5663ca UD |
101 | |
102 | ||
103 | int | |
104 | main (int argc, char *argv[]) | |
105 | { | |
106 | int status = EXIT_SUCCESS; | |
107 | int remaining; | |
108 | FILE *output; | |
109 | iconv_t cd; | |
110 | ||
111 | /* Set locale via LC_ALL. */ | |
112 | setlocale (LC_ALL, ""); | |
113 | ||
114 | /* Set the text message domain. */ | |
115 | textdomain (_libc_intl_domainname); | |
116 | ||
117 | /* Parse and process arguments. */ | |
118 | argp_parse (&argp, argc, argv, 0, &remaining, NULL); | |
119 | ||
8fe0fd03 UD |
120 | /* List all coded character sets if wanted. */ |
121 | if (list) | |
122 | { | |
123 | print_known_names (); | |
124 | exit (EXIT_SUCCESS); | |
125 | } | |
126 | ||
fb5663ca UD |
127 | /* If either the from- or to-code is not specified this is an error |
128 | since we do not know what to do. */ | |
129 | if (from_code == NULL && to_code == NULL) | |
130 | error (EXIT_FAILURE, 0, | |
03e4219e | 131 | _("neither original nor target encoding specified")); |
fb5663ca UD |
132 | if (from_code == NULL) |
133 | error (EXIT_FAILURE, 0, _("original encoding not specified using `-f'")); | |
134 | if (to_code == NULL) | |
135 | error (EXIT_FAILURE, 0, _("target encoding not specified using `-t'")); | |
136 | ||
137 | /* Let's see whether we have these coded character sets. */ | |
138 | cd = iconv_open (to_code, from_code); | |
139 | if (cd == (iconv_t) -1) | |
140 | if (errno == EINVAL) | |
141 | error (EXIT_FAILURE, 0, _("conversion from `%s' to `%s' not supported"), | |
142 | from_code, to_code); | |
143 | else | |
144 | error (EXIT_FAILURE, errno, _("failed to start conversion processing")); | |
145 | ||
146 | /* Determine output file. */ | |
147 | if (output_file != NULL) | |
148 | { | |
149 | output = fopen (output_file, "w"); | |
150 | if (output == NULL) | |
151 | error (EXIT_FAILURE, errno, _("cannot open output file")); | |
152 | } | |
153 | else | |
154 | output = stdout; | |
155 | ||
156 | /* Now process the remaining files. Write them to stdout or the file | |
157 | specified with the `-o' parameter. If we have no file given as | |
158 | the parameter process all from stdin. */ | |
159 | if (remaining == argc) | |
160 | process_file (cd, stdin, output); | |
161 | else | |
162 | do | |
163 | { | |
164 | struct stat st; | |
165 | const char *addr; | |
166 | int fd = open (argv[remaining], O_RDONLY); | |
167 | ||
168 | if (verbose) | |
169 | printf ("%s:\n", argv[remaining]); | |
170 | ||
171 | if (fd == -1) | |
172 | { | |
173 | error (0, errno, _("cannot open input file `%s'"), | |
174 | argv[remaining]); | |
175 | status = EXIT_FAILURE; | |
176 | continue; | |
177 | } | |
178 | ||
179 | /* We have possibilities for reading the input file. First try | |
180 | to mmap() it since this will provide the fastest solution. */ | |
181 | if (fstat (fd, &st) == 0 | |
182 | && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0)) | |
183 | != MAP_FAILED)) | |
184 | { | |
185 | /* Yes, we can use mmap(). The descriptor is not needed | |
186 | anymore. */ | |
187 | if (close (fd) != 0) | |
188 | error (EXIT_FAILURE, errno, _("error while closing input `%s'"), | |
189 | argv[remaining]); | |
190 | ||
480bc727 | 191 | if (process_block (cd, addr, st.st_size, output) < 0) |
fb5663ca UD |
192 | { |
193 | /* Something went wrong. */ | |
194 | status = EXIT_FAILURE; | |
195 | ||
196 | /* We don't need the input data anymore. */ | |
197 | munmap ((void *) addr, st.st_size); | |
198 | ||
199 | /* We cannot go on with producing output since it might | |
200 | lead to problem because the last output might leave | |
201 | the output stream in an undefined state. */ | |
202 | break; | |
203 | } | |
204 | ||
205 | /* We don't need the input data anymore. */ | |
206 | munmap ((void *) addr, st.st_size); | |
207 | } | |
208 | else | |
209 | { | |
210 | /* Read the file in pieces. */ | |
211 | if (process_fd (cd, fd, output) != 0) | |
212 | { | |
213 | /* Something went wrong. */ | |
214 | status = EXIT_FAILURE; | |
215 | ||
216 | /* We don't need the input file anymore. */ | |
217 | close (fd); | |
218 | ||
219 | /* We cannot go on with producing output since it might | |
220 | lead to problem because the last output might leave | |
221 | the output stream in an undefined state. */ | |
222 | break; | |
223 | } | |
224 | ||
225 | /* Now close the file. */ | |
226 | close (fd); | |
227 | } | |
228 | } | |
229 | while (++remaining < argc); | |
230 | ||
231 | /* Close the output file now. */ | |
232 | if (fclose (output)) | |
233 | error (EXIT_FAILURE, errno, _("error while closing output file")); | |
234 | ||
235 | return status; | |
236 | } | |
237 | ||
238 | ||
239 | /* Handle program arguments. */ | |
240 | static error_t | |
241 | parse_opt (int key, char *arg, struct argp_state *state) | |
242 | { | |
243 | switch (key) | |
244 | { | |
245 | case 'f': | |
246 | from_code = arg; | |
247 | break; | |
248 | case 't': | |
249 | to_code = arg; | |
250 | break; | |
251 | case 'o': | |
252 | output_file = arg; | |
253 | break; | |
254 | case OPT_VERBOSE: | |
255 | verbose = 1; | |
256 | break; | |
8fe0fd03 UD |
257 | case OPT_LIST: |
258 | list = 1; | |
259 | break; | |
fb5663ca UD |
260 | default: |
261 | return ARGP_ERR_UNKNOWN; | |
262 | } | |
263 | return 0; | |
264 | } | |
265 | ||
266 | ||
/* argp help filter.  For ARGP_KEY_HELP_EXTRA a newly allocated copy of
   the translated bug reporting note is returned.  For every other KEY
   the TEXT passed in is returned unchanged.  INPUT is not used.  */
static char *
more_help (int key, const char *text, void *input)
{
  if (key != ARGP_KEY_HELP_EXTRA)
    return (char *) text;

  /* We print some extra information.  */
  return strdup (gettext ("\
Report bugs using the `glibcbug' script to <bugs@gnu.org>.\n"));
}
281 | ||
282 | ||
283 | /* Print the version information. */ | |
284 | static void | |
285 | print_version (FILE *stream, struct argp_state *state) | |
286 | { | |
287 | fprintf (stream, "iconv (GNU %s) %s\n", PACKAGE, VERSION); | |
288 | fprintf (stream, gettext ("\ | |
289 | Copyright (C) %s Free Software Foundation, Inc.\n\ | |
290 | This is free software; see the source for copying conditions. There is NO\n\ | |
291 | warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\ | |
292 | "), "1998"); | |
293 | fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper"); | |
294 | } | |
295 | ||
296 | ||
/* Write COUNT bytes starting at BUF to OUTPUT.  Returns 0 on success
   (also when COUNT is zero).  After a short write or a stream error a
   diagnostic is printed and -1 is returned.  */
static int
write_converted (const char *buf, size_t count, FILE *output)
{
  if (count == 0)
    return 0;

  if (fwrite (buf, 1, count, output) < count || ferror (output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0, _("\
conversion stopped due to problem in writing the output"));
      return -1;
    }

  return 0;
}

/* Print the diagnostic for the iconv() failure described by the errno
   value ERR.  POSITION is the offset into the input at which the
   conversion stopped.  */
static void
report_iconv_error (int err, long int position)
{
  switch (err)
    {
    case EILSEQ:
      error (0, 0, _("illegal input sequence at position %ld"), position);
      break;
    case EINVAL:
      error (0, 0, _("\
incomplete character or shift sequence at end of buffer"));
      break;
    case EBADF:
      error (0, 0, _("internal error (illegal descriptor)"));
      break;
    default:
      error (0, 0, _("unknown iconv() error %d"), err);
      break;
    }
}

/* Convert the LEN bytes starting at ADDR using the conversion
   descriptor CD and write the result to OUTPUT.

   The input is converted in pieces of at most OUTBUF_SIZE output bytes.
   Once all input is consumed the conversion state is flushed so that
   any pending output iconv() produces for it is written as well.

   Returns 0 on success and -1 after a conversion or write error; in the
   latter case a diagnostic has been printed.  */
static int
process_block (iconv_t cd, const char *addr, size_t len, FILE *output)
{
#define OUTBUF_SIZE	32768
  const char *start = addr;
  char outbuf[OUTBUF_SIZE];

  while (len > 0)
    {
      char *outptr = outbuf;
      size_t outlen = OUTBUF_SIZE;
      size_t n = iconv (cd, &addr, &len, &outptr, &outlen);
      /* Remember the error code now, the stdio calls below are allowed
	 to modify errno.  */
      int iconv_errno = errno;

      /* Whatever has been converted so far is written out, even if the
	 conversion stopped with an error.  */
      if (write_converted (outbuf, (size_t) (outptr - outbuf), output) != 0)
	return -1;

      if (n == (size_t) -1)
	{
	  if (iconv_errno == E2BIG)
	    /* The output buffer was full.  Go on with the next piece.  */
	    continue;

	  /* iconv() ran into a problem.  */
	  report_iconv_error (iconv_errno, (long int) (addr - start));
	  return -1;
	}

      /* Everything is processed.  Flush the conversion state.  */
      outptr = outbuf;
      outlen = OUTBUF_SIZE;
      n = iconv (cd, NULL, NULL, &outptr, &outlen);
      iconv_errno = errno;

      if (write_converted (outbuf, (size_t) (outptr - outbuf), output) != 0)
	return -1;

      if (n == (size_t) -1)
	{
	  report_iconv_error (iconv_errno, (long int) (addr - start));
	  return -1;
	}

      break;
    }

  return 0;
}
357 | ||
358 | ||
/* Read everything which is available from the descriptor FD, convert
   it using CD and write the result to OUTPUT.

   We have a problem with reading from a descriptor since we must not
   provide the iconv() function an incomplete character or shift
   sequence at the end of the buffer.  Since we have to deal with
   arbitrary encodings we must read the whole text in a buffer and
   process it in one step.

   The buffer is static: it is kept and reused by later calls and it
   only ever grows.

   Returns the result of process_block, or -1 if reading fails or the
   buffer cannot be enlarged; a diagnostic is printed in these cases.  */
static int
process_fd (iconv_t cd, int fd, FILE *output)
{
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  while (1)
    {
      ssize_t n;

      if (actlen == maxlen)
	{
	  /* Increase the buffer.  The old pointer is only replaced
	     after realloc succeeded so that the buffer stays usable
	     for later calls.  */
	  size_t newlen = maxlen + 32768;
	  char *newbuf = realloc (inbuf, newlen);

	  if (newbuf == NULL)
	    {
	      error (0, errno, _("unable to allocate buffer for input"));
	      return -1;
	    }

	  inbuf = newbuf;
	  maxlen = newlen;
	}

      n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
	/* No more text to read.  */
	break;

      if (n == -1)
	{
	  /* Error while reading.  */
	  error (0, errno, _("error while reading the input"));
	  return -1;
	}

      actlen += (size_t) n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output);
}
431 | ||
432 | ||
/* Convert the content of the stream INPUT using CD and write the result
   to OUTPUT.  The data is read through the descriptor underlying INPUT.
   This should be safe since we use this function only for `stdin' and
   we haven't read anything so far.  Returns what process_fd returns.  */
static int
process_file (iconv_t cd, FILE *input, FILE *output)
{
  int fd = fileno (input);

  return process_fd (cd, fd, output);
}
8fe0fd03 UD |
440 | |
441 | ||
442 | /* Print all known character sets/encodings. */ | |
443 | static void *printlist; | |
444 | static size_t column; | |
445 | static int not_first; | |
446 | ||
447 | static void | |
448 | insert_print_list (const void *nodep, VISIT value, int level) | |
449 | { | |
450 | if (value == leaf || value == postorder) | |
451 | { | |
452 | const struct gconv_alias *s = *(const struct gconv_alias **) nodep; | |
9b26f5c4 | 453 | tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp); |
8fe0fd03 UD |
454 | } |
455 | } | |
456 | ||
457 | static void | |
458 | do_print (const void *nodep, VISIT value, int level) | |
459 | { | |
460 | if (value == leaf || value == postorder) | |
461 | { | |
462 | const char *s = *(const char **) nodep; | |
463 | size_t len = strlen (s); | |
464 | size_t cnt; | |
465 | ||
466 | while (len > 0 && s[len - 1] == '/') | |
467 | --len; | |
468 | ||
469 | for (cnt = 0; cnt < len; ++cnt) | |
470 | if (isalnum (s[cnt])) | |
471 | break; | |
472 | if (cnt == len) | |
473 | return; | |
474 | ||
475 | if (not_first) | |
476 | { | |
477 | putchar (','); | |
478 | ++column; | |
479 | ||
480 | if (column > 2 && column + len > 77) | |
481 | { | |
482 | fputs ("\n ", stdout); | |
483 | column = 2; | |
484 | } | |
485 | else | |
486 | { | |
487 | putchar (' '); | |
488 | ++column; | |
489 | } | |
490 | } | |
491 | else | |
9b26f5c4 | 492 | not_first = 1; |
8fe0fd03 UD |
493 | |
494 | fwrite (s, len, 1, stdout); | |
495 | column += len; | |
496 | } | |
497 | } | |
498 | ||
499 | static void | |
500 | print_known_names (void) | |
501 | { | |
502 | size_t cnt; | |
503 | iconv_t h; | |
504 | ||
505 | /* We must initialize the internal databases first. */ | |
506 | h = iconv_open ("L1", "L1"); | |
507 | iconv_close (h); | |
508 | ||
509 | /* First add the aliases. */ | |
510 | twalk (__gconv_alias_db, insert_print_list); | |
511 | ||
512 | /* Add the from- and to-names from the known modules. */ | |
513 | for (cnt = 0; cnt < __gconv_nmodules; ++cnt) | |
514 | { | |
515 | if (__gconv_modules_db[cnt]->from_pattern == NULL) | |
516 | { | |
f1fa8b68 UD |
517 | if (strcmp (__gconv_modules_db[cnt]->from_constpfx, "INTERNAL")) |
518 | tsearch (__gconv_modules_db[cnt]->from_constpfx, &printlist, | |
3dcf8ea6 | 519 | (__compar_fn_t) strverscmp); |
f1fa8b68 UD |
520 | if (strcmp (__gconv_modules_db[cnt]->to_string, "INTERNAL")) |
521 | tsearch (__gconv_modules_db[cnt]->to_string, &printlist, | |
3dcf8ea6 | 522 | (__compar_fn_t) strverscmp); |
8fe0fd03 UD |
523 | } |
524 | else | |
f1fa8b68 UD |
525 | if (strcmp (__gconv_modules_db[cnt]->from_pattern, "INTERNAL")) |
526 | tsearch (__gconv_modules_db[cnt]->from_pattern, &printlist, | |
3dcf8ea6 | 527 | (__compar_fn_t) strverscmp); |
8fe0fd03 UD |
528 | } |
529 | ||
530 | fputs (_("\ | |
531 | The following list contain all the coded character sets known. This does\n\ | |
532 | not necessarily mean that all combinations of these names can be used for\n\ | |
533 | the FROM and TO command line parameters. One coded character set can be\n\ | |
534 | listed with several different names (aliases).\n\ | |
535 | Some of the names are no plain strings but instead regular expressions and\n\ | |
536 | they match a variety of names which can be given as parameters to the\n\ | |
537 | program.\n\n "), stdout); | |
538 | ||
539 | /* Now print the collected names. */ | |
540 | column = 2; | |
541 | twalk (printlist, do_print); | |
542 | ||
543 | if (column != 0) | |
544 | puts (""); | |
545 | } |