]> git.ipfire.org Git - thirdparty/cups.git/blob - cups/transcode.c
40bf2f05469733fec0ea0f1d1715175ce14512a5
[thirdparty/cups.git] / cups / transcode.c
1 /*
2 * "$Id: transcode.c 5838 2006-08-17 14:41:42Z mike $"
3 *
4 * Transcoding support for the Common UNIX Printing System (CUPS).
5 *
6 * Copyright 1997-2006 by Easy Software Products.
7 *
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
13 * Products at:
14 *
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
19 *
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
23 *
24 * Contents:
25 *
26 * _cupsCharmapFlush() - Flush all character set maps out of cache.
27 * _cupsCharmapFree() - Free a character set map.
28 * _cupsCharmapGet() - Get a character set map.
29 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
30 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
31 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
32 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
33 * compare_wide() - Compare key for wide (VBCS) match.
34 * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
35 * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
36 * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
37 * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
38 * free_sbcs_charmap() - Free memory used by a single byte character set.
39 * free_vbcs_charmap() - Free memory used by a variable byte character set.
40 * get_charmap() - Lookup or get a character set map (private).
41 * get_charmap_count() - Count lines in a charmap file.
42 * get_sbcs_charmap() - Get SBCS Charmap.
43 * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
44 */
45
46 /*
47 * Include necessary headers...
48 */
49
50 #include "globals.h"
51 #include "debug.h"
52 #include <limits.h>
53 #include <stdlib.h>
54 #include <errno.h>
55 #include <time.h>
56
57
58 /*
59 * Local globals...
60 */
61
62 #ifdef HAVE_PTHREAD_H
63 static pthread_mutex_t map_mutex = PTHREAD_MUTEX_INITIALIZER;
64 /* Mutex to control access to maps */
65 #endif /* HAVE_PTHREAD_H */
66 static _cups_cmap_t *cmap_cache = NULL;
67 /* SBCS Charmap Cache */
68 static _cups_vmap_t *vmap_cache = NULL;
69 /* VBCS Charmap Cache */
70
71
72 /*
73 * Local functions...
74 */
75
76 static int compare_wide(const void *k1, const void *k2);
77 static int conv_sbcs_to_utf8(cups_utf8_t *dest,
78 const cups_sbcs_t *src,
79 int maxout,
80 const cups_encoding_t encoding);
81 static int conv_utf8_to_sbcs(cups_sbcs_t *dest,
82 const cups_utf8_t *src,
83 int maxout,
84 const cups_encoding_t encoding);
85 static int conv_utf8_to_vbcs(cups_sbcs_t *dest,
86 const cups_utf8_t *src,
87 int maxout,
88 const cups_encoding_t encoding);
89 static int conv_vbcs_to_utf8(cups_utf8_t *dest,
90 const cups_sbcs_t *src,
91 int maxout,
92 const cups_encoding_t encoding);
93 static void free_sbcs_charmap(_cups_cmap_t *sbcs);
94 static void free_vbcs_charmap(_cups_vmap_t *vbcs);
95 static void *get_charmap(const cups_encoding_t encoding);
96 static int get_charmap_count(cups_file_t *fp);
97 static _cups_cmap_t *get_sbcs_charmap(const cups_encoding_t encoding,
98 const char *filename);
99 static _cups_vmap_t *get_vbcs_charmap(const cups_encoding_t encoding,
100 const char *filename);
101
102
103 /*
104 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
105 */
106
107 void
108 _cupsCharmapFlush(void)
109 {
110 _cups_cmap_t *cmap, /* Legacy SBCS / Unicode Charset Map */
111 *cnext; /* Next Legacy SBCS Charset Map */
112 _cups_vmap_t *vmap, /* Legacy VBCS / Unicode Charset Map */
113 *vnext; /* Next Legacy VBCS Charset Map */
114
115
116 #ifdef HAVE_PTHREAD_H
117 pthread_mutex_lock(&map_mutex);
118 #endif /* HAVE_PTHREAD_H */
119
120 /*
121 * Loop through SBCS charset map cache, free all memory...
122 */
123
124 for (cmap = cmap_cache; cmap; cmap = cnext)
125 {
126 cnext = cmap->next;
127
128 free_sbcs_charmap(cmap);
129 }
130
131 cmap_cache = NULL;
132
133 /*
134 * Loop through DBCS/VBCS charset map cache, free all memory...
135 */
136
137 for (vmap = vmap_cache; vmap; vmap = vnext)
138 {
139 vnext = vmap->next;
140
141 free_vbcs_charmap(vmap);
142
143 free(vmap);
144 }
145
146 vmap_cache = NULL;
147
148 #ifdef HAVE_PTHREAD_H
149 pthread_mutex_unlock(&map_mutex);
150 #endif /* HAVE_PTHREAD_H */
151 }
152
153
154 /*
155 * '_cupsCharmapFree()' - Free a character set map.
156 *
157 * This does not actually free; use '_cupsCharmapFlush()' for that.
158 */
159
160 void
161 _cupsCharmapFree(
162 const cups_encoding_t encoding) /* I - Encoding */
163 {
164 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
165 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
166
167
168 /*
169 * See if we already have this SBCS charset map loaded...
170 */
171
172 #ifdef HAVE_PTHREAD_H
173 pthread_mutex_lock(&map_mutex);
174 #endif /* HAVE_PTHREAD_H */
175
176 for (cmap = cmap_cache; cmap; cmap = cmap->next)
177 {
178 if (cmap->encoding == encoding)
179 {
180 if (cmap->used > 0)
181 cmap->used --;
182 break;
183 }
184 }
185
186 /*
187 * See if we already have this DBCS/VBCS charset map loaded...
188 */
189
190 for (vmap = vmap_cache; vmap; vmap = vmap->next)
191 {
192 if (vmap->encoding == encoding)
193 {
194 if (vmap->used > 0)
195 vmap->used --;
196 break;
197 }
198 }
199
200 #ifdef HAVE_PTHREAD_H
201 pthread_mutex_unlock(&map_mutex);
202 #endif /* HAVE_PTHREAD_H */
203 }
204
205
206 /*
207 * '_cupsCharmapGet()' - Get a character set map.
208 *
209 * This code handles single-byte (SBCS), double-byte (DBCS), and
210 * variable-byte (VBCS) character sets _without_ charset escapes...
211 * This code does not handle multiple-byte character sets (MBCS)
212 * (such as ISO-2022-JP) with charset switching via escapes...
213 */
214
215 void * /* O - Charset map pointer */
216 _cupsCharmapGet(
217 const cups_encoding_t encoding) /* I - Encoding */
218 {
219 void *charmap; /* Charset map pointer */
220
221
222 DEBUG_printf(("_cupsCharmapGet(encoding=%d)\n", encoding));
223
224 /*
225 * Check for valid arguments...
226 */
227
228 if (encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
229 {
230 DEBUG_puts(" Bad encoding, returning NULL!");
231 return (NULL);
232 }
233
234 /*
235 * Lookup or get the charset map pointer and return...
236 */
237
238 #ifdef HAVE_PTHREAD_H
239 pthread_mutex_lock(&map_mutex);
240 #endif /* HAVE_PTHREAD_H */
241
242 charmap = get_charmap(encoding);
243
244 #ifdef HAVE_PTHREAD_H
245 pthread_mutex_unlock(&map_mutex);
246 #endif /* HAVE_PTHREAD_H */
247
248 return (charmap);
249 }
250
251
252 /*
253 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
254 *
255 * This code handles single-byte (SBCS), double-byte (DBCS), and
256 * variable-byte (VBCS) character sets _without_ charset escapes...
257 * This code does not handle multiple-byte character sets (MBCS)
258 * (such as ISO-2022-JP) with charset switching via escapes...
259 */
260
261 int /* O - Count or -1 on error */
262 cupsCharsetToUTF8(
263 cups_utf8_t *dest, /* O - Target string */
264 const char *src, /* I - Source string */
265 const int maxout, /* I - Max output */
266 const cups_encoding_t encoding) /* I - Encoding */
267 {
268 int bytes; /* Number of bytes converted */
269
270
271 /*
272 * Check for valid arguments...
273 */
274
275 DEBUG_printf(("cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)\n",
276 dest, src, maxout, encoding));
277
278 if (dest)
279 *dest = '\0';
280
281 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
282 {
283 DEBUG_puts(" Bad arguments, returning -1");
284 return (-1);
285 }
286
287 /*
288 * Handle identity conversions...
289 */
290
291 if (encoding == CUPS_UTF8 ||
292 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
293 {
294 strlcpy((char *)dest, src, maxout);
295 return (strlen((char *)dest));
296 }
297
298 /*
299 * Convert input legacy charset to UTF-8...
300 */
301
302 #ifdef HAVE_PTHREAD_H
303 pthread_mutex_lock(&map_mutex);
304 #endif /* HAVE_PTHREAD_H */
305
306 if (encoding < CUPS_ENCODING_SBCS_END)
307 bytes = conv_sbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
308 else if (encoding < CUPS_ENCODING_VBCS_END)
309 bytes = conv_vbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
310 else
311 {
312 DEBUG_puts(" Bad encoding, returning -1");
313 bytes = -1;
314 }
315
316 #ifdef HAVE_PTHREAD_H
317 pthread_mutex_unlock(&map_mutex);
318 #endif /* HAVE_PTHREAD_H */
319
320 return (bytes);
321 }
322
323
324 /*
325 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
326 *
327 * This code handles single-byte (SBCS), double-byte (DBCS), and
328 * variable-byte (VBCS) character sets _without_ charset escapes...
329 * This code does not handle multiple-byte character sets (MBCS)
330 * (such as ISO-2022-JP) with charset switching via escapes...
331 */
332
333 int /* O - Count or -1 on error */
334 cupsUTF8ToCharset(
335 char *dest, /* O - Target string */
336 const cups_utf8_t *src, /* I - Source string */
337 const int maxout, /* I - Max output */
338 const cups_encoding_t encoding) /* I - Encoding */
339 {
340 int bytes; /* Number of bytes converted */
341
342
343 /*
344 * Check for valid arguments...
345 */
346
347 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
348 {
349 if (dest)
350 *dest = '\0';
351
352 return (-1);
353 }
354
355 /*
356 * Handle identity conversions...
357 */
358
359 if (encoding == CUPS_UTF8 ||
360 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
361 {
362 strlcpy(dest, (char *)src, maxout);
363 return (strlen(dest));
364 }
365
366 /*
367 * Convert input UTF-8 to legacy charset...
368 */
369
370 #ifdef HAVE_PTHREAD_H
371 pthread_mutex_lock(&map_mutex);
372 #endif /* HAVE_PTHREAD_H */
373
374 if (encoding < CUPS_ENCODING_SBCS_END)
375 bytes = conv_utf8_to_sbcs((cups_sbcs_t *)dest, src, maxout, encoding);
376 else if (encoding < CUPS_ENCODING_VBCS_END)
377 bytes = conv_utf8_to_vbcs((cups_sbcs_t *)dest, src, maxout, encoding);
378 else
379 bytes = -1;
380
381 #ifdef HAVE_PTHREAD_H
382 pthread_mutex_unlock(&map_mutex);
383 #endif /* HAVE_PTHREAD_H */
384
385 return (bytes);
386 }
387
388
389 /*
390 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
391 *
392 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
393 *
394 * UTF-32 char UTF-8 char(s)
395 * --------------------------------------------------
396 * 0 to 127 = 0xxxxxxx (US-ASCII)
397 * 128 to 2047 = 110xxxxx 10yyyyyy
398 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
399 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
400 *
401 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
402 * which would convert to five- or six-octet UTF-8 sequences...
403 */
404
405 int /* O - Count or -1 on error */
406 cupsUTF8ToUTF32(
407 cups_utf32_t *dest, /* O - Target string */
408 const cups_utf8_t *src, /* I - Source string */
409 const int maxout) /* I - Max output */
410 {
411 int i; /* Looping variable */
412 cups_utf8_t ch; /* Character value */
413 cups_utf8_t next; /* Next character value */
414 cups_utf32_t ch32; /* UTF-32 character value */
415
416
417 /*
418 * Check for valid arguments and clear output...
419 */
420
421 if (dest)
422 *dest = 0;
423
424 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
425 return (-1);
426
427 /*
428 * Convert input UTF-8 to output UTF-32 (and insert BOM)...
429 */
430
431 *dest++ = 0xfeff;
432
433 for (i = maxout - 1; *src && i > 0; i --)
434 {
435 ch = *src++;
436
437 /*
438 * Convert UTF-8 character(s) to UTF-32 character...
439 */
440
441 if (!(ch & 0x80))
442 {
443 /*
444 * One-octet UTF-8 <= 127 (US-ASCII)...
445 */
446
447 *dest++ = ch;
448 }
449 else if ((ch & 0xe0) == 0xc0)
450 {
451 /*
452 * Two-octet UTF-8 <= 2047 (Latin-x)...
453 */
454
455 next = *src++;
456 if (!next)
457 return (-1);
458
459 ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
460
461 /*
462 * Check for non-shortest form (invalid UTF-8)...
463 */
464
465 if (ch32 < 0x80)
466 return (-1);
467
468 *dest++ = ch32;
469 }
470 else if ((ch & 0xf0) == 0xe0)
471 {
472 /*
473 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
474 */
475
476 next = *src++;
477 if (!next)
478 return (-1);
479
480 ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
481
482 next = *src++;
483 if (!next)
484 return (-1);
485
486 ch32 = (ch32 << 6) | (next & 0x3f);
487
488 /*
489 * Check for non-shortest form (invalid UTF-8)...
490 */
491
492 if (ch32 < 0x800)
493 return (-1);
494
495 *dest++ = ch32;
496 }
497 else if ((ch & 0xf8) == 0xf0)
498 {
499 /*
500 * Four-octet UTF-8...
501 */
502
503 next = *src++;
504 if (!next)
505 return (-1);
506
507 ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
508
509 next = *src++;
510 if (!next)
511 return (-1);
512
513 ch32 = (ch32 << 6) | (next & 0x3f);
514
515 next = *src++;
516 if (!next)
517 return (-1);
518
519 ch32 = (ch32 << 6) | (next & 0x3f);
520
521 /*
522 * Check for non-shortest form (invalid UTF-8)...
523 */
524
525 if (ch32 < 0x10000)
526 return (-1);
527
528 *dest++ = ch32;
529 }
530 else
531 {
532 /*
533 * More than 4-octet (invalid UTF-8 sequence)...
534 */
535
536 return (-1);
537 }
538
539 /*
540 * Check for UTF-16 surrogate (illegal UTF-8)...
541 */
542
543 if (*dest >= 0xd800 && *dest <= 0xdfff)
544 return (-1);
545 }
546
547 *dest = 0;
548
549 return (i);
550 }
551
552
553 /*
554 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
555 *
556 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
557 *
558 * UTF-32 char UTF-8 char(s)
559 * --------------------------------------------------
560 * 0 to 127 = 0xxxxxxx (US-ASCII)
561 * 128 to 2047 = 110xxxxx 10yyyyyy
562 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
563 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
564 *
565 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
566 * which would convert to five- or six-octet UTF-8 sequences...
567 */
568
569 int /* O - Count or -1 on error */
570 cupsUTF32ToUTF8(
571 cups_utf8_t *dest, /* O - Target string */
572 const cups_utf32_t *src, /* I - Source string */
573 const int maxout) /* I - Max output */
574 {
575 cups_utf8_t *start; /* Start of destination string */
576 int i; /* Looping variable */
577 int swap; /* Byte-swap input to output */
578 cups_utf32_t ch; /* Character value */
579
580
581 /*
582 * Check for valid arguments and clear output...
583 */
584
585 if (dest)
586 *dest = '\0';
587
588 if (!dest || !src || maxout < 1)
589 return (-1);
590
591 /*
592 * Check for leading BOM in UTF-32 and inverted BOM...
593 */
594
595 start = dest;
596 swap = *src == 0xfffe0000;
597
598 if (*src == 0xfffe0000 || *src == 0xfeff)
599 src ++;
600
601 /*
602 * Convert input UTF-32 to output UTF-8...
603 */
604
605 for (i = maxout - 1; *src && i > 0;)
606 {
607 ch = *src++;
608
609 /*
610 * Byte swap input UTF-32, if necessary...
611 * (only byte-swapping 24 of 32 bits)
612 */
613
614 if (swap)
615 ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
616
617 /*
618 * Check for beyond Plane 16 (invalid UTF-32)...
619 */
620
621 if (ch > 0x10ffff)
622 return (-1);
623
624 /*
625 * Convert UTF-32 character to UTF-8 character(s)...
626 */
627
628 if (ch < 0x80)
629 {
630 /*
631 * One-octet UTF-8 <= 127 (US-ASCII)...
632 */
633
634 *dest++ = (cups_utf8_t)ch;
635 i --;
636 }
637 else if (ch < 0x800)
638 {
639 /*
640 * Two-octet UTF-8 <= 2047 (Latin-x)...
641 */
642
643 if (i < 2)
644 return (-1);
645
646 *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
647 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
648 i -= 2;
649 }
650 else if (ch < 0x10000)
651 {
652 /*
653 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
654 */
655
656 if (i < 3)
657 return (-1);
658
659 *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
660 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
661 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
662 i -= 3;
663 }
664 else
665 {
666 /*
667 * Four-octet UTF-8...
668 */
669
670 if (i < 4)
671 return (-1);
672
673 *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
674 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
675 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
676 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
677 i -= 4;
678 }
679 }
680
681 *dest = '\0';
682
683 return ((int)(dest - start));
684 }
685
686
687 /*
688 * 'compare_wide()' - Compare key for wide (VBCS) match.
689 */
690
691 static int
692 compare_wide(const void *k1, /* I - Key char */
693 const void *k2) /* I - Map char */
694 {
695 cups_vbcs_t key; /* Legacy key character */
696 cups_vbcs_t map; /* Legacy map character */
697
698
699 key = *((cups_vbcs_t *)k1);
700 map = ((_cups_wide2uni_t *)k2)->widechar;
701
702 return ((int)(key - map));
703 }
704
705
706 /*
707 * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
708 */
709
710 static int /* O - Count or -1 on error */
711 conv_sbcs_to_utf8(
712 cups_utf8_t *dest, /* O - Target string */
713 const cups_sbcs_t *src, /* I - Source string */
714 int maxout, /* I - Max output */
715 const cups_encoding_t encoding) /* I - Encoding */
716 {
717 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
718 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
719 cups_sbcs_t legchar; /* Legacy character value */
720 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
721 *workptr; /* Pointer into string */
722
723
724 /*
725 * Find legacy charset map in cache...
726 */
727
728 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
729 return (-1);
730
731 /*
732 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
733 */
734
735 work[0] = 0xfeff;
736 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
737 {
738 legchar = *src++;
739
740 /*
741 * Convert ASCII verbatim (optimization)...
742 */
743
744 if (legchar < 0x80)
745 *workptr++ = (cups_utf32_t)legchar;
746 else
747 {
748 /*
749 * Convert unknown character to Replacement Character...
750 */
751
752 crow = cmap->char2uni + legchar;
753
754 if (!*crow)
755 *workptr++ = 0xfffd;
756 else
757 *workptr++ = (cups_utf32_t)*crow;
758 }
759 }
760
761 *workptr = 0;
762
763 /*
764 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
765 */
766
767 cmap->used --;
768
769 return (cupsUTF32ToUTF8(dest, work, maxout));
770 }
771
772
773 /*
774 * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
775 */
776
777 static int /* O - Count or -1 on error */
778 conv_utf8_to_sbcs(
779 cups_sbcs_t *dest, /* O - Target string */
780 const cups_utf8_t *src, /* I - Source string */
781 int maxout, /* I - Max output */
782 const cups_encoding_t encoding) /* I - Encoding */
783 {
784 cups_sbcs_t *start; /* Start of destination string */
785 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
786 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
787 cups_utf32_t unichar; /* Character value */
788 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
789 *workptr; /* Pointer into string */
790
791
792 /*
793 * Find legacy charset map in cache...
794 */
795
796 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
797 return (-1);
798
799 /*
800 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
801 */
802
803 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
804 return (-1);
805
806 /*
807 * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
808 */
809
810 for (workptr = work + 1, start = dest; *workptr && maxout > 1; maxout --)
811 {
812 unichar = *workptr++;
813 if (!unichar)
814 break;
815
816 /*
817 * Convert ASCII verbatim (optimization)...
818 */
819
820 if (unichar < 0x80)
821 {
822 *dest++ = (cups_sbcs_t)unichar;
823 continue;
824 }
825
826 /*
827 * Convert unknown character to visible replacement...
828 */
829
830 srow = cmap->uni2char[(int)((unichar >> 8) & 0xff)];
831
832 if (srow)
833 srow += (int)(unichar & 0xff);
834
835 if (!srow || !*srow)
836 *dest++ = '?';
837 else
838 *dest++ = *srow;
839 }
840
841 *dest = '\0';
842
843 cmap->used --;
844
845 return ((int)(dest - start));
846 }
847
848
849 /*
850 * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
851 */
852
853 static int /* O - Count or -1 on error */
854 conv_utf8_to_vbcs(
855 cups_sbcs_t *dest, /* O - Target string */
856 const cups_utf8_t *src, /* I - Source string */
857 int maxout, /* I - Max output */
858 const cups_encoding_t encoding) /* I - Encoding */
859 {
860 cups_sbcs_t *start; /* Start of destination string */
861 _cups_vmap_t *vmap; /* Legacy DBCS / Unicode Charset Map */
862 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
863 cups_utf32_t unichar; /* Character value */
864 cups_vbcs_t legchar; /* Legacy character value */
865 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
866 *workptr; /* Pointer into string */
867
868
869 /*
870 * Find legacy charset map in cache...
871 */
872
873 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
874 return (-1);
875
876 /*
877 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
878 */
879
880 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
881 return (-1);
882
883 /*
884 * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
885 */
886
887 for (start = dest, workptr = work + 1; *workptr && maxout > 1; maxout --)
888 {
889 unichar = *workptr++;
890 if (!unichar)
891 break;
892
893 /*
894 * Convert ASCII verbatim (optimization)...
895 */
896
897 if (unichar < 0x80)
898 {
899 *dest++ = (cups_vbcs_t)unichar;
900 continue;
901 }
902
903 /*
904 * Convert unknown character to visible replacement...
905 */
906
907 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
908
909 if (vrow)
910 vrow += (int)(unichar & 0xff);
911
912 if (!vrow || !*vrow)
913 legchar = (cups_vbcs_t)'?';
914 else
915 legchar = (cups_vbcs_t)*vrow;
916
917 /*
918 * Save n-byte legacy character...
919 */
920
921 if (legchar > 0xffffff)
922 {
923 if (maxout < 5)
924 return (-1);
925
926 *dest++ = (cups_sbcs_t)(legchar >> 24);
927 *dest++ = (cups_sbcs_t)(legchar >> 16);
928 *dest++ = (cups_sbcs_t)(legchar >> 8);
929 *dest++ = (cups_sbcs_t)legchar;
930
931 maxout -= 3;
932 }
933 else if (legchar > 0xffff)
934 {
935 if (maxout < 4)
936 return (-1);
937
938 *dest++ = (cups_sbcs_t)(legchar >> 16);
939 *dest++ = (cups_sbcs_t)(legchar >> 8);
940 *dest++ = (cups_sbcs_t)legchar;
941
942 maxout -= 2;
943 }
944 else if (legchar > 0xff)
945 {
946 *dest++ = (cups_sbcs_t)(legchar >> 8);
947 *dest++ = (cups_sbcs_t)legchar;
948
949 maxout --;
950 }
951 }
952
953 *dest = '\0';
954
955 vmap->used --;
956
957 return ((int)(dest - start));
958 }
959
960
961 /*
962 * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
963 */
964
965 static int /* O - Count or -1 on error */
966 conv_vbcs_to_utf8(
967 cups_utf8_t *dest, /* O - Target string */
968 const cups_sbcs_t *src, /* I - Source string */
969 int maxout, /* I - Max output */
970 const cups_encoding_t encoding) /* I - Encoding */
971 {
972 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
973 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
974 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
975 cups_sbcs_t leadchar; /* Lead char of n-byte legacy char */
976 cups_vbcs_t legchar; /* Legacy character value */
977 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
978 *workptr; /* Pointer into string */
979
980
981 /*
982 * Find legacy charset map in cache...
983 */
984
985 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
986 return (-1);
987
988 /*
989 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
990 */
991
992 work[0] = 0xfeff;
993 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
994 {
995 legchar = *src++;
996 leadchar = (cups_sbcs_t)legchar;
997
998 /*
999 * Convert ASCII verbatim (optimization)...
1000 */
1001
1002 if (legchar < 0x80)
1003 {
1004 *workptr++ = (cups_utf32_t)legchar;
1005 continue;
1006 }
1007
1008 /*
1009 * Convert 2-byte legacy character...
1010 */
1011
1012 if (vmap->lead2char[(int)leadchar] == leadchar)
1013 {
1014 if (!*src)
1015 return (-1);
1016
1017 legchar = (legchar << 8) | *src++;
1018
1019 /*
1020 * Convert unknown character to Replacement Character...
1021 */
1022
1023 crow = vmap->char2uni[(int)((legchar >> 8) & 0xff)];
1024 if (crow)
1025 crow += (int) (legchar & 0xff);
1026
1027 if (!crow || !*crow)
1028 *workptr++ = 0xfffd;
1029 else
1030 *workptr++ = (cups_utf32_t)*crow;
1031 continue;
1032 }
1033
1034 /*
1035 * Fetch 3-byte or 4-byte legacy character...
1036 */
1037
1038 if (vmap->lead3char[(int)leadchar] == leadchar)
1039 {
1040 if (!*src || !src[1])
1041 return (-1);
1042
1043 legchar = (legchar << 8) | *src++;
1044 legchar = (legchar << 8) | *src++;
1045 }
1046 else if (vmap->lead4char[(int)leadchar] == leadchar)
1047 {
1048 if (!*src || !src[1] || !src[2])
1049 return (-1);
1050
1051 legchar = (legchar << 8) | *src++;
1052 legchar = (legchar << 8) | *src++;
1053 legchar = (legchar << 8) | *src++;
1054 }
1055 else
1056 return (-1);
1057
1058 /*
1059 * Find 3-byte or 4-byte legacy character...
1060 */
1061
1062 wide2uni = (_cups_wide2uni_t *)bsearch(&legchar,
1063 vmap->wide2uni,
1064 vmap->widecount,
1065 sizeof(_cups_wide2uni_t),
1066 compare_wide);
1067
1068 /*
1069 * Convert unknown character to Replacement Character...
1070 */
1071
1072 if (!wide2uni || !wide2uni->unichar)
1073 *workptr++ = 0xfffd;
1074 else
1075 *workptr++ = wide2uni->unichar;
1076 }
1077
1078 *workptr = 0;
1079
1080 vmap->used --;
1081
1082 /*
1083 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
1084 */
1085
1086 return (cupsUTF32ToUTF8(dest, work, maxout));
1087 }
1088
1089
1090 /*
1091 * 'free_sbcs_charmap()' - Free memory used by a single byte character set.
1092 */
1093
1094 static void
1095 free_sbcs_charmap(_cups_cmap_t *cmap) /* I - Character set */
1096 {
1097 int i; /* Looping variable */
1098
1099
1100 for (i = 0; i < 256; i ++)
1101 if (cmap->uni2char[i])
1102 free(cmap->uni2char[i]);
1103
1104 free(cmap);
1105 }
1106
1107
1108 /*
1109 * 'free_vbcs_charmap()' - Free memory used by a variable byte character set.
1110 */
1111
1112 static void
1113 free_vbcs_charmap(_cups_vmap_t *vmap) /* I - Character set */
1114 {
1115 int i; /* Looping variable */
1116
1117
1118 for (i = 0; i < 256; i ++)
1119 if (vmap->char2uni[i])
1120 free(vmap->char2uni[i]);
1121
1122 for (i = 0; i < 256; i ++)
1123 if (vmap->uni2char[i])
1124 free(vmap->uni2char[i]);
1125
1126 if (vmap->wide2uni)
1127 free(vmap->wide2uni);
1128
1129 free(vmap);
1130 }
1131
1132
1133 /*
1134 * 'get_charmap()' - Lookup or get a character set map (private).
1135 *
1136 * This code handles single-byte (SBCS), double-byte (DBCS), and
1137 * variable-byte (VBCS) character sets _without_ charset escapes...
1138 * This code does not handle multiple-byte character sets (MBCS)
1139 * (such as ISO-2022-JP) with charset switching via escapes...
1140 */
1141
1142
1143 static void * /* O - Charset map pointer */
1144 get_charmap(
1145 const cups_encoding_t encoding) /* I - Encoding */
1146 {
1147 char filename[1024]; /* Filename for charset map file */
1148 _cups_globals_t *cg = _cupsGlobals(); /* Global data */
1149
1150
1151 /*
1152 * Get the data directory and charset map name...
1153 */
1154
1155 snprintf(filename, sizeof(filename), "%s/charmaps/%s.txt",
1156 cg->cups_datadir, _cupsEncodingName(encoding));
1157
1158 DEBUG_printf((" filename=\"%s\"\n", filename));
1159
1160 /*
1161 * Read charset map input file into cache...
1162 */
1163
1164 if (encoding < CUPS_ENCODING_SBCS_END)
1165 return (get_sbcs_charmap(encoding, filename));
1166 else if (encoding < CUPS_ENCODING_VBCS_END)
1167 return (get_vbcs_charmap(encoding, filename));
1168 else
1169 return (NULL);
1170 }
1171
1172
1173 /*
1174 * 'get_charmap_count()' - Count lines in a charmap file.
1175 */
1176
1177 static int /* O - Count or -1 on error */
1178 get_charmap_count(cups_file_t *fp) /* I - File to read from */
1179 {
1180 int count; /* Number of lines */
1181 char line[256]; /* Line from input map file */
1182
1183
1184 /*
1185 * Count lines in map input file...
1186 */
1187
1188 count = 0;
1189
1190 while (cupsFileGets(fp, line, sizeof(line)))
1191 if (line[0] == '0')
1192 count ++;
1193
1194 /*
1195 * Return the number of lines...
1196 */
1197
1198 if (count > 0)
1199 return (count);
1200 else
1201 return (-1);
1202 }
1203
1204
1205 /*
1206 * 'get_sbcs_charmap()' - Get SBCS Charmap.
1207 */
1208
1209 static _cups_cmap_t * /* O - Charmap or 0 on error */
1210 get_sbcs_charmap(
1211 const cups_encoding_t encoding, /* I - Charmap Encoding */
1212 const char *filename) /* I - Charmap Filename */
1213 {
1214 unsigned long legchar; /* Legacy character value */
1215 cups_utf32_t unichar; /* Unicode character value */
1216 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
1217 cups_file_t *fp; /* Charset map file pointer */
1218 char *s; /* Line parsing pointer */
1219 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1220 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
1221 char line[256]; /* Line from charset map file */
1222
1223
1224 /*
1225 * See if we already have this SBCS charset map loaded...
1226 */
1227
1228 for (cmap = cmap_cache; cmap; cmap = cmap->next)
1229 {
1230 if (cmap->encoding == encoding)
1231 {
1232 cmap->used ++;
1233 DEBUG_printf((" returning existing cmap=%p\n", cmap));
1234
1235 return ((void *)cmap);
1236 }
1237 }
1238
1239 /*
1240 * Open SBCS charset map input file...
1241 */
1242
1243 if ((fp = cupsFileOpen(filename, "r")) == NULL)
1244 return (NULL);
1245
1246 /*
1247 * Allocate memory for SBCS charset map...
1248 */
1249
1250 if ((cmap = (_cups_cmap_t *)calloc(1, sizeof(_cups_cmap_t))) == NULL)
1251 {
1252 cupsFileClose(fp);
1253 DEBUG_puts(" Unable to allocate memory!");
1254
1255 return (NULL);
1256 }
1257
1258 cmap->used ++;
1259 cmap->encoding = encoding;
1260
1261 /*
1262 * Save SBCS charset map into memory for transcoding...
1263 */
1264
1265 while (cupsFileGets(fp, line, sizeof(line)))
1266 {
1267 if (line[0] != '0')
1268 continue;
1269
1270 legchar = strtol(line, &s, 16);
1271 if (legchar < 0 || legchar > 0xff)
1272 goto sbcs_error;
1273
1274 unichar = strtol(s, NULL, 16);
1275 if (unichar < 0 || unichar > 0xffff)
1276 goto sbcs_error;
1277
1278 /*
1279 * Save legacy to Unicode mapping in direct lookup table...
1280 */
1281
1282 crow = cmap->char2uni + legchar;
1283 *crow = (cups_ucs2_t)(unichar & 0xffff);
1284
1285 /*
1286 * Save Unicode to legacy mapping in indirect lookup table...
1287 */
1288
1289 srow = cmap->uni2char[(unichar >> 8) & 0xff];
1290 if (!srow)
1291 {
1292 srow = (cups_sbcs_t *)calloc(256, sizeof(cups_sbcs_t));
1293 if (!srow)
1294 goto sbcs_error;
1295
1296 cmap->uni2char[(unichar >> 8) & 0xff] = srow;
1297 }
1298
1299 srow += unichar & 0xff;
1300
1301 /*
1302 * Convert Replacement Character to visible replacement...
1303 */
1304
1305 if (unichar == 0xfffd)
1306 legchar = (unsigned long)'?';
1307
1308 /*
1309 * First (oldest) legacy character uses Unicode mapping cell...
1310 */
1311
1312 if (!*srow)
1313 *srow = (cups_sbcs_t)legchar;
1314 }
1315
1316 cupsFileClose(fp);
1317
1318 /*
1319 * Add it to the cache and return...
1320 */
1321
1322 cmap->next = cmap_cache;
1323 cmap_cache = cmap;
1324
1325 DEBUG_printf((" returning new cmap=%p\n", cmap));
1326
1327 return (cmap);
1328
1329 /*
1330 * If we get here, there was an error in the cmap file...
1331 */
1332
1333 sbcs_error:
1334
1335 free_sbcs_charmap(cmap);
1336
1337 cupsFileClose(fp);
1338
1339 DEBUG_puts(" Error, returning NULL!");
1340
1341 return (NULL);
1342 }
1343
1344
1345 /*
1346 * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
1347 */
1348
1349 static _cups_vmap_t * /* O - Charmap or 0 on error */
1350 get_vbcs_charmap(
1351 const cups_encoding_t encoding, /* I - Charmap Encoding */
1352 const char *filename) /* I - Charmap Filename */
1353 {
1354 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
1355 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1356 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
1357 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
1358 cups_sbcs_t leadchar; /* Lead char of 2-byte legacy char */
1359 unsigned long legchar; /* Legacy character value */
1360 cups_utf32_t unichar; /* Unicode character value */
1361 int mapcount; /* Count of lines in charmap file */
1362 cups_file_t *fp; /* Charset map file pointer */
1363 char *s; /* Line parsing pointer */
1364 char line[256]; /* Line from charset map file */
1365 int i; /* Loop variable */
1366 int wide; /* 32-bit legacy char */
1367
1368
1369 DEBUG_printf(("get_vbcs_charmap(encoding=%d, filename=\"%s\")\n",
1370 encoding, filename));
1371
1372 /*
1373 * See if we already have this DBCS/VBCS charset map loaded...
1374 */
1375
1376 for (vmap = vmap_cache; vmap; vmap = vmap->next)
1377 {
1378 if (vmap->encoding == encoding)
1379 {
1380 vmap->used ++;
1381 DEBUG_printf((" returning existing vmap=%p\n", vmap));
1382
1383 return ((void *)vmap);
1384 }
1385 }
1386
1387 /*
1388 * Open VBCS charset map input file...
1389 */
1390
1391 if ((fp = cupsFileOpen(filename, "r")) == NULL)
1392 {
1393 DEBUG_printf((" Unable to open file: %s\n", strerror(errno)));
1394
1395 return (NULL);
1396 }
1397
1398 /*
1399 * Count lines in charmap file...
1400 */
1401
1402 if ((mapcount = get_charmap_count(fp)) <= 0)
1403 {
1404 DEBUG_puts(" Unable to get charmap count!");
1405
1406 return (NULL);
1407 }
1408
1409 DEBUG_printf((" mapcount=%d\n", mapcount));
1410
1411 /*
1412 * Allocate memory for DBCS/VBCS charset map...
1413 */
1414
1415 if ((vmap = (_cups_vmap_t *)calloc(1, sizeof(_cups_vmap_t))) == NULL)
1416 {
1417 cupsFileClose(fp);
1418 DEBUG_puts(" Unable to allocate memory!");
1419
1420 return (NULL);
1421 }
1422
1423 vmap->used ++;
1424 vmap->encoding = encoding;
1425
1426 /*
1427 * Save DBCS/VBCS charset map into memory for transcoding...
1428 */
1429
1430 leadchar = 0;
1431 wide2uni = NULL;
1432
1433 cupsFileRewind(fp);
1434
1435 i = 0;
1436 wide = 0;
1437
1438 while (cupsFileGets(fp, line, sizeof(line)))
1439 {
1440 if (line[0] != '0')
1441 continue;
1442
1443 legchar = strtoul(line, &s, 16);
1444 if (legchar == ULONG_MAX)
1445 goto vbcs_error;
1446
1447 unichar = strtol(s, NULL, 16);
1448 if (unichar < 0 || unichar > 0xffff)
1449 goto vbcs_error;
1450
1451 i ++;
1452
1453 /* DEBUG_printf((" i=%d, legchar=0x%08lx, unichar=0x%04x\n", i,
1454 legchar, (unsigned)unichar)); */
1455
1456 /*
1457 * Save lead char of 2/3/4-byte legacy char...
1458 */
1459
1460 if (legchar > 0xff && legchar <= 0xffff)
1461 {
1462 leadchar = (cups_sbcs_t)(legchar >> 8);
1463 vmap->lead2char[leadchar] = leadchar;
1464 }
1465
1466 if (legchar > 0xffff && legchar <= 0xffffff)
1467 {
1468 leadchar = (cups_sbcs_t)(legchar >> 16);
1469 vmap->lead3char[leadchar] = leadchar;
1470 }
1471
1472 if (legchar > 0xffffff)
1473 {
1474 leadchar = (cups_sbcs_t)(legchar >> 24);
1475 vmap->lead4char[leadchar] = leadchar;
1476 }
1477
1478 /*
1479 * Save Legacy to Unicode mapping...
1480 */
1481
1482 if (legchar <= 0xffff)
1483 {
1484 /*
1485 * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
1486 */
1487
1488 crow = vmap->char2uni[(int)leadchar];
1489 if (!crow)
1490 {
1491 crow = (cups_ucs2_t *)calloc(256, sizeof(cups_ucs2_t));
1492 if (!crow)
1493 goto vbcs_error;
1494
1495 vmap->char2uni[(int)leadchar] = crow;
1496 }
1497
1498 crow[(int)(legchar & 0xff)] = (cups_ucs2_t)unichar;
1499 }
1500 else
1501 {
1502 /*
1503 * Save VBCS 32-bit to Unicode mapping in sorted list table...
1504 */
1505
1506 if (!wide)
1507 {
1508 wide = 1;
1509 vmap->widecount = (mapcount - i + 1);
1510 wide2uni = (_cups_wide2uni_t *)calloc(vmap->widecount,
1511 sizeof(_cups_wide2uni_t));
1512 if (!wide2uni)
1513 goto vbcs_error;
1514
1515 vmap->wide2uni = wide2uni;
1516 }
1517
1518 wide2uni->widechar = (cups_vbcs_t)legchar;
1519 wide2uni->unichar = (cups_ucs2_t)unichar;
1520 wide2uni ++;
1521 }
1522
1523 /*
1524 * Save Unicode to legacy mapping in indirect lookup table...
1525 */
1526
1527 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
1528 if (!vrow)
1529 {
1530 vrow = (cups_vbcs_t *)calloc(256, sizeof(cups_vbcs_t));
1531 if (!vrow)
1532 goto vbcs_error;
1533
1534 vmap->uni2char[(int) ((unichar >> 8) & 0xff)] = vrow;
1535 }
1536
1537 vrow += (int)(unichar & 0xff);
1538
1539 /*
1540 * Convert Replacement Character to visible replacement...
1541 */
1542
1543 if (unichar == 0xfffd)
1544 legchar = (unsigned long)'?';
1545
1546 /*
1547 * First (oldest) legacy character uses Unicode mapping cell...
1548 */
1549
1550 if (!*vrow)
1551 *vrow = (cups_vbcs_t)legchar;
1552 }
1553
1554 vmap->charcount = (i - vmap->widecount);
1555
1556 cupsFileClose(fp);
1557
1558 /*
1559 * Add it to the cache and return...
1560 */
1561
1562 vmap->next = vmap_cache;
1563 vmap_cache = vmap;
1564
1565 DEBUG_printf((" returning new vmap=%p\n", vmap));
1566
1567 return (vmap);
1568
1569 /*
1570 * If we get here, the file contains errors...
1571 */
1572
1573 vbcs_error:
1574
1575 free_vbcs_charmap(vmap);
1576
1577 cupsFileClose(fp);
1578
1579 DEBUG_puts(" Error, returning NULL!");
1580
1581 return (NULL);
1582 }
1583
1584
1585 /*
1586 * End of "$Id: transcode.c 5838 2006-08-17 14:41:42Z mike $"
1587 */