]> git.ipfire.org Git - thirdparty/glibc.git/blame - iconv/iconv_charmap.c
Optimize xmalloc, xcalloc, xrealloc, and xstrdup
[thirdparty/glibc.git] / iconv / iconv_charmap.c
CommitLineData
93693c4d 1/* Convert using charmaps and possibly iconv().
ec09c1c4 2 Copyright (C) 2001, 2005, 2006, 2008, 2012 Free Software Foundation, Inc.
93693c4d
UD
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
5
43bc8ac6 6 This program is free software; you can redistribute it and/or modify
2e2efe65
RM
7 it under the terms of the GNU General Public License as published
8 by the Free Software Foundation; version 2 of the License, or
9 (at your option) any later version.
93693c4d 10
43bc8ac6 11 This program is distributed in the hope that it will be useful,
93693c4d 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
43bc8ac6
UD
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
93693c4d 15
43bc8ac6
UD
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
93693c4d
UD
19
20#include <assert.h>
21#include <errno.h>
22#include <error.h>
23#include <fcntl.h>
24#include <iconv.h>
25#include <libintl.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <unistd.h>
29#include <sys/mman.h>
30#include <sys/stat.h>
31
32#include "iconv_prog.h"
33
34
35/* Prototypes for a few program-wide used functions. */
ec09c1c4
UD
36extern void *xmalloc (size_t n)
37 __attribute_malloc__ __attribute_alloc_size (1);
38extern void *xcalloc (size_t n, size_t s)
39 __attribute_malloc__ __attribute_alloc_size (1, 2);
93693c4d
UD
40
41
/* One level of the byte-indexed lookup tree used for the conversion.
   Each of the 256 possible byte values owns one slot in VAL and one
   flag bit in TERM.  */
struct convtable
{
  /* Flag bits, eight per array element (the helpers below address bit
     IDX as element IDX / 8, bit IDX % 8).  A set bit means the matching
     VAL slot holds OUT; otherwise it holds SUB (possibly NULL).  */
  int term[256 / 8];

  union
  {
    /* Next level of the tree for longer input sequences.  */
    struct convtable *sub;
    /* Output sequence for an input sequence ending at this byte.  */
    struct charseq *out;
  } val[256];
};
51
52
53static inline struct convtable *
54allocate_table (void)
55{
56 return (struct convtable *) xcalloc (1, sizeof (struct convtable));
57}
58
59
60static inline int
61is_term (struct convtable *tbl, unsigned int idx)
62{
63 return tbl->term[idx / 8] & (1 << (idx % 8));
64}
65
66
67static inline void
68clear_term (struct convtable *tbl, unsigned int idx)
69{
70 tbl->term[idx / 8] &= ~(1 << (idx % 8));
71}
72
73
74static inline void
75set_term (struct convtable *tbl, unsigned int idx)
76{
77 tbl->term[idx / 8] |= 1 << (idx % 8);
78}
79
80
81/* Generate the conversion table. */
82static struct convtable *use_from_charmap (struct charmap_t *from_charmap,
83 const char *to_code);
84static struct convtable *use_to_charmap (const char *from_code,
85 struct charmap_t *to_charmap);
86static struct convtable *use_both_charmaps (struct charmap_t *from_charmap,
87 struct charmap_t *to_charmap);
88
89/* Prototypes for the functions doing the actual work. */
90static int process_block (struct convtable *tbl, char *addr, size_t len,
91 FILE *output);
92static int process_fd (struct convtable *tbl, int fd, FILE *output);
93static int process_file (struct convtable *tbl, FILE *input, FILE *output);
94
95
96int
97charmap_conversion (const char *from_code, struct charmap_t *from_charmap,
98 const char *to_code, struct charmap_t *to_charmap,
5484ff51
UD
99 int argc, int remaining, char *argv[],
100 const char *output_file)
93693c4d
UD
101{
102 struct convtable *cvtbl;
103 int status = EXIT_SUCCESS;
104
105 /* We have three different cases to handle:
106
107 - both, from_charmap and to_charmap, are available. This means we
108 can assume that the symbolic names match and use them to create
109 the mapping.
110
111 - only from_charmap is available. In this case we can only hope that
112 the symbolic names used are of the <Uxxxx> form in which case we
113 can use a UCS4->"to_code" iconv() conversion for the second step.
114
115 - only to_charmap is available. This is similar, only that we would
116 use iconv() for the "to_code"->UCS4 conversion.
117
118 We first create a table which maps input bytes into output bytes.
119 Once this is done we can handle all three of the cases above
120 equally. */
121 if (from_charmap != NULL)
122 {
123 if (to_charmap == NULL)
124 cvtbl = use_from_charmap (from_charmap, to_code);
125 else
126 cvtbl = use_both_charmaps (from_charmap, to_charmap);
127 }
128 else
129 {
130 assert (to_charmap != NULL);
131 cvtbl = use_to_charmap (from_code, to_charmap);
132 }
133
134 /* If we couldn't generate a table stop now. */
135 if (cvtbl == NULL)
136 return EXIT_FAILURE;
137
5484ff51
UD
138 /* Determine output file. */
139 FILE *output;
140 if (output_file != NULL && strcmp (output_file, "-") != 0)
141 {
142 output = fopen (output_file, "w");
143 if (output == NULL)
144 error (EXIT_FAILURE, errno, _("cannot open output file"));
145 }
146 else
147 output = stdout;
148
93693c4d
UD
149 /* We can now start the conversion. */
150 if (remaining == argc)
151 {
152 if (process_file (cvtbl, stdin, output) != 0)
153 status = EXIT_FAILURE;
154 }
155 else
156 do
157 {
158 struct stat st;
159 char *addr;
160 int fd;
161
162 if (verbose)
163 printf ("%s:\n", argv[remaining]);
164 if (strcmp (argv[remaining], "-") == 0)
165 fd = 0;
166 else
167 {
168 fd = open (argv[remaining], O_RDONLY);
169
170 if (fd == -1)
171 {
172 error (0, errno, _("cannot open input file `%s'"),
173 argv[remaining]);
174 status = EXIT_FAILURE;
175 continue;
176 }
177 }
178
179#ifdef _POSIX_MAPPED_FILES
180 /* We have possibilities for reading the input file. First try
181 to mmap() it since this will provide the fastest solution. */
182 if (fstat (fd, &st) == 0
183 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
184 fd, 0)) != MAP_FAILED))
185 {
186 /* Yes, we can use mmap(). The descriptor is not needed
187 anymore. */
188 if (close (fd) != 0)
189 error (EXIT_FAILURE, errno,
190 _("error while closing input `%s'"), argv[remaining]);
191
192 if (process_block (cvtbl, addr, st.st_size, output) < 0)
193 {
194 /* Something went wrong. */
195 status = EXIT_FAILURE;
196
197 /* We don't need the input data anymore. */
198 munmap ((void *) addr, st.st_size);
199
200 /* We cannot go on with producing output since it might
201 lead to problem because the last output might leave
202 the output stream in an undefined state. */
203 break;
204 }
205
206 /* We don't need the input data anymore. */
207 munmap ((void *) addr, st.st_size);
208 }
209 else
210#endif /* _POSIX_MAPPED_FILES */
211 {
212 /* Read the file in pieces. */
213 if (process_fd (cvtbl, fd, output) != 0)
214 {
215 /* Something went wrong. */
216 status = EXIT_FAILURE;
217
218 /* We don't need the input file anymore. */
219 close (fd);
220
221 /* We cannot go on with producing output since it might
222 lead to problem because the last output might leave
223 the output stream in an undefined state. */
224 break;
225 }
226
227 /* Now close the file. */
228 close (fd);
229 }
230 }
231 while (++remaining < argc);
232
233 /* All done. */
234 return status;
235}
236
237
238static void
239add_bytes (struct convtable *tbl, struct charseq *in, struct charseq *out)
240{
241 int n = 0;
242 unsigned int byte;
243
244 assert (in->nbytes > 0);
245
246 byte = ((unsigned char *) in->bytes)[n];
247 while (n + 1 < in->nbytes)
248 {
249 if (is_term (tbl, byte) || tbl->val[byte].sub == NULL)
250 {
251 /* Note that we simply ignore a definition for a byte sequence
252 which is also the prefix for a longer one. */
253 clear_term (tbl, byte);
254 tbl->val[byte].sub =
255 (struct convtable *) xcalloc (1, sizeof (struct convtable));
256 }
257
258 tbl = tbl->val[byte].sub;
259
260 byte = ((unsigned char *) in->bytes)[++n];
261 }
262
263 /* Only add the new sequence if there is none yet and the byte sequence
264 is not part of an even longer one. */
265 if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL)
266 {
267 set_term (tbl, byte);
268 tbl->val[byte].out = out;
269 }
270}
271
272
273static struct convtable *
274use_from_charmap (struct charmap_t *from_charmap, const char *to_code)
275{
276 /* We iterate over all entries in the from_charmap and for those which
277 have a known UCS4 representation we use an iconv() call to determine
278 the mapping to the to_code charset. */
279 struct convtable *rettbl;
280 iconv_t cd;
281 void *ptr = NULL;
282 const void *key;
283 size_t keylen;
284 void *data;
285
286 cd = iconv_open (to_code, "WCHAR_T");
287 if (cd == (iconv_t) -1)
288 /* We cannot do anything. */
289 return NULL;
290
291 rettbl = allocate_table ();
292
293 while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
294 >= 0)
295 {
296 struct charseq *in = (struct charseq *) data;
297
298 if (in->ucs4 != UNINITIALIZED_CHAR_VALUE)
299 {
300 /* There is a chance. Try the iconv module. */
301 wchar_t inbuf[1] = { in->ucs4 };
302 unsigned char outbuf[64];
303 char *inptr = (char *) inbuf;
304 size_t inlen = sizeof (inbuf);
305 char *outptr = (char *) outbuf;
306 size_t outlen = sizeof (outbuf);
307
308 (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
309
310 if (outptr != (char *) outbuf)
311 {
312 /* We got some output. Good, use it. */
313 struct charseq *newp;
314
315 outlen = sizeof (outbuf) - outlen;
316 assert ((char *) outbuf + outlen == outptr);
317
318 newp = (struct charseq *) xmalloc (sizeof (struct charseq)
319 + outlen);
320 newp->name = in->name;
321 newp->ucs4 = in->ucs4;
322 newp->nbytes = outlen;
323 memcpy (newp->bytes, outbuf, outlen);
324
325 add_bytes (rettbl, in, newp);
326 }
327
328 /* Clear any possible state left behind. */
329 (void) iconv (cd, NULL, NULL, NULL, NULL);
330 }
331 }
332
333 iconv_close (cd);
334
335 return rettbl;
336}
337
338
339static struct convtable *
340use_to_charmap (const char *from_code, struct charmap_t *to_charmap)
341{
342 /* We iterate over all entries in the to_charmap and for those which
343 have a known UCS4 representation we use an iconv() call to determine
344 the mapping to the from_code charset. */
345 struct convtable *rettbl;
346 iconv_t cd;
347 void *ptr = NULL;
348 const void *key;
349 size_t keylen;
350 void *data;
351
352 /* Note that the conversion we use here is the reverse direction. Without
353 exhaustive search we cannot figure out which input yields the UCS4
354 character we are looking for. Therefore we determine it the other
355 way round. */
356 cd = iconv_open (from_code, "WCHAR_T");
357 if (cd == (iconv_t) -1)
358 /* We cannot do anything. */
359 return NULL;
360
361 rettbl = allocate_table ();
362
363 while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data)
364 >= 0)
365 {
366 struct charseq *out = (struct charseq *) data;
367
368 if (out->ucs4 != UNINITIALIZED_CHAR_VALUE)
369 {
370 /* There is a chance. Try the iconv module. */
371 wchar_t inbuf[1] = { out->ucs4 };
372 unsigned char outbuf[64];
373 char *inptr = (char *) inbuf;
374 size_t inlen = sizeof (inbuf);
375 char *outptr = (char *) outbuf;
376 size_t outlen = sizeof (outbuf);
377
378 (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
379
380 if (outptr != (char *) outbuf)
381 {
382 /* We got some output. Good, use it. */
51e59260
UD
383 union
384 {
385 struct charseq seq;
386 struct
387 {
388 const char *name;
389 uint32_t ucs4;
390 int nbytes;
391 unsigned char bytes[outlen];
392 } mem;
393 } new;
93693c4d
UD
394
395 outlen = sizeof (outbuf) - outlen;
396 assert ((char *) outbuf + outlen == outptr);
397
51e59260
UD
398 new.mem.name = out->name;
399 new.mem.ucs4 = out->ucs4;
400 new.mem.nbytes = outlen;
401 memcpy (new.mem.bytes, outbuf, outlen);
93693c4d 402
51e59260 403 add_bytes (rettbl, &new.seq, out);
93693c4d
UD
404 }
405
406 /* Clear any possible state left behind. */
407 (void) iconv (cd, NULL, NULL, NULL, NULL);
408 }
409 }
410
411 iconv_close (cd);
412
413 return rettbl;
414}
415
416
417static struct convtable *
418use_both_charmaps (struct charmap_t *from_charmap,
419 struct charmap_t *to_charmap)
420{
421 /* In this case we iterate over all the entries in the from_charmap,
422 determine the internal name, and find an appropriate entry in the
423 to_charmap (if it exists). */
424 struct convtable *rettbl = allocate_table ();
425 void *ptr = NULL;
426 const void *key;
427 size_t keylen;
428 void *data;
429
430 while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
431 >= 0)
432 {
433 struct charseq *in = (struct charseq *) data;
434 struct charseq *out = charmap_find_value (to_charmap, key, keylen);
435
436 if (out != NULL)
437 add_bytes (rettbl, in, out);
438 }
439
440 return rettbl;
441}
442
443
444static int
445process_block (struct convtable *tbl, char *addr, size_t len, FILE *output)
446{
447 size_t n = 0;
448
449 while (n < len)
450 {
451 struct convtable *cur = tbl;
452 unsigned char *curp = (unsigned char *) addr;
453 unsigned int byte = *curp;
454 int cnt;
455 struct charseq *out;
456
457 while (! is_term (cur, byte))
458 if (cur->val[byte].sub == NULL)
459 {
460 /* This is a invalid sequence. Skip the first byte if we are
461 ignoring errors. Otherwise punt. */
462 if (! omit_invalid)
463 {
464 error (0, 0, _("illegal input sequence at position %Zd"), n);
465 return -1;
466 }
467
468 n -= curp - (unsigned char *) addr;
469
470 byte = *(curp = (unsigned char *) ++addr);
471 if (++n >= len)
472 /* All converted. */
473 return 0;
474
475 cur = tbl;
476 }
477 else
478 {
479 cur = cur->val[byte].sub;
480
481 if (++n >= len)
482 {
483 error (0, 0, _("\
484incomplete character or shift sequence at end of buffer"));
485 return -1;
486 }
487
488 byte = *++curp;
489 }
490
491 /* We found a final byte. Write the output bytes. */
492 out = cur->val[byte].out;
493 for (cnt = 0; cnt < out->nbytes; ++cnt)
494 fputc_unlocked (out->bytes[cnt], output);
495
496 addr = (char *) curp + 1;
497 ++n;
498 }
499
500 return 0;
501}
502
503
/* Read everything available from FD into memory and convert it in one
   go with process_block.  The complete input is collected first because
   the converter must never be handed a buffer ending in the middle of a
   character or shift sequence.

   The buffer is static: it is reused, and never shrunk or freed, across
   calls.  Returns -1 after printing a message if reading or growing the
   buffer fails, otherwise the result of process_block.  */
static int
process_fd (struct convtable *tbl, int fd, FILE *output)
{
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      ssize_t got;

      if (actlen == maxlen)
        {
          /* The buffer is full (or does not exist yet).  Enlarge it
             before reading on.  */
          char *grown = (char *) realloc (inbuf, maxlen + 32768);

          if (grown == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }

          inbuf = grown;
          maxlen += 32768;
        }

      got = read (fd, inbuf + actlen, maxlen - actlen);

      if (got == 0)
        /* No more text to read.  */
        break;

      if (got == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += got;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (tbl, inbuf, actlen, output);
}
581
582
/* Convert the contents of the stream INPUT by handing its underlying
   descriptor to process_fd.  Data already buffered in INPUT would be
   missed; the function is only used for `stdin' before anything has
   been read from it.  */
static int
process_file (struct convtable *tbl, FILE *input, FILE *output)
{
  int fd = fileno (input);

  return process_fd (tbl, fd, output);
}