]> git.ipfire.org Git - thirdparty/cups.git/blob - cups/transcode.c
f21ab22e6a919cdcd3ae5a54d277da469e086eb1
[thirdparty/cups.git] / cups / transcode.c
1 /*
2 * "$Id$"
3 *
4 * Transcoding support for the Common UNIX Printing System (CUPS).
5 *
6 * Copyright 1997-2006 by Easy Software Products.
7 *
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
13 * Products at:
14 *
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
19 *
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
23 *
24 * Contents:
25 *
26 * _cupsCharmapFlush() - Flush all character set maps out of cache.
27 * _cupsCharmapFree() - Free a character set map.
28 * _cupsCharmapGet() - Get a character set map.
29 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
30 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
31 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
32 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
33 * compare_wide() - Compare key for wide (VBCS) match.
34 * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
35 * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
36 * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
37 * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
38 * free_sbcs_charmap() - Free memory used by a single byte character set.
39 * free_vbcs_charmap() - Free memory used by a variable byte character set.
40 * get_charmap() - Lookup or get a character set map (private).
41 * get_charmap_count() - Count lines in a charmap file.
42 * get_sbcs_charmap() - Get SBCS Charmap.
43 * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
44 */
45
46 /*
47 * Include necessary headers...
48 */
49
50 #include "globals.h"
51 #include "debug.h"
52 #include <limits.h>
53 #include <stdlib.h>
54 #include <errno.h>
55 #include <time.h>
56
57
58 /*
59 * Local globals...
60 */
61
#ifdef HAVE_PTHREAD_H
static pthread_mutex_t map_mutex = PTHREAD_MUTEX_INITIALIZER;
                                        /* Mutex locked by the public entry
                                           points in this file around every
                                           access to the two caches below */
#endif /* HAVE_PTHREAD_H */
static _cups_cmap_t *cmap_cache = NULL;
                                        /* Head of the linked list (via
                                           'next') of loaded SBCS charmaps */
static _cups_vmap_t *vmap_cache = NULL;
                                        /* Head of the linked list (via
                                           'next') of loaded VBCS charmaps */
70
71
72 /*
73 * Local functions...
74 */
75
/* bsearch() comparison callback for 3/4-byte legacy characters */
static int compare_wide(const void *k1, const void *k2);
/* Legacy <-> UTF-8 converters; each returns a count or -1 on error */
static int conv_sbcs_to_utf8(cups_utf8_t *dest,
                             const cups_sbcs_t *src,
                             int maxout,
                             const cups_encoding_t encoding);
static int conv_utf8_to_sbcs(cups_sbcs_t *dest,
                             const cups_utf8_t *src,
                             int maxout,
                             const cups_encoding_t encoding);
static int conv_utf8_to_vbcs(cups_sbcs_t *dest,
                             const cups_utf8_t *src,
                             int maxout,
                             const cups_encoding_t encoding);
static int conv_vbcs_to_utf8(cups_utf8_t *dest,
                             const cups_sbcs_t *src,
                             int maxout,
                             const cups_encoding_t encoding);
/* Release a charmap structure and the tables it owns */
static void free_sbcs_charmap(_cups_cmap_t *sbcs);
static void free_vbcs_charmap(_cups_vmap_t *vbcs);
/* Cache lookup / loader helpers; callers hold map_mutex */
static void *get_charmap(const cups_encoding_t encoding);
static int get_charmap_count(cups_file_t *fp);
static _cups_cmap_t *get_sbcs_charmap(const cups_encoding_t encoding,
                                      const char *filename);
static _cups_vmap_t *get_vbcs_charmap(const cups_encoding_t encoding,
                                      const char *filename);
101
102
103 /*
104 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
105 */
106
107 void
108 _cupsCharmapFlush(void)
109 {
110 _cups_cmap_t *cmap, /* Legacy SBCS / Unicode Charset Map */
111 *cnext; /* Next Legacy SBCS Charset Map */
112 _cups_vmap_t *vmap, /* Legacy VBCS / Unicode Charset Map */
113 *vnext; /* Next Legacy VBCS Charset Map */
114
115
116 #ifdef HAVE_PTHREAD_H
117 pthread_mutex_lock(&map_mutex);
118 #endif /* HAVE_PTHREAD_H */
119
120 /*
121 * Loop through SBCS charset map cache, free all memory...
122 */
123
124 for (cmap = cmap_cache; cmap; cmap = cnext)
125 {
126 cnext = cmap->next;
127
128 free_sbcs_charmap(cmap);
129 }
130
131 cmap_cache = NULL;
132
133 /*
134 * Loop through DBCS/VBCS charset map cache, free all memory...
135 */
136
137 for (vmap = vmap_cache; vmap; vmap = vnext)
138 {
139 vnext = vmap->next;
140
141 free_vbcs_charmap(vmap);
142
143 free(vmap);
144 }
145
146 vmap_cache = NULL;
147
148 #ifdef HAVE_PTHREAD_H
149 pthread_mutex_unlock(&map_mutex);
150 #endif /* HAVE_PTHREAD_H */
151 }
152
153
154 /*
155 * '_cupsCharmapFree()' - Free a character set map.
156 *
157 * This does not actually free; use '_cupsCharmapFlush()' for that.
158 */
159
160 void
161 _cupsCharmapFree(
162 const cups_encoding_t encoding) /* I - Encoding */
163 {
164 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
165 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
166
167
168 /*
169 * See if we already have this SBCS charset map loaded...
170 */
171
172 #ifdef HAVE_PTHREAD_H
173 pthread_mutex_lock(&map_mutex);
174 #endif /* HAVE_PTHREAD_H */
175
176 for (cmap = cmap_cache; cmap; cmap = cmap->next)
177 {
178 if (cmap->encoding == encoding)
179 {
180 if (cmap->used > 0)
181 cmap->used --;
182 break;
183 }
184 }
185
186 /*
187 * See if we already have this DBCS/VBCS charset map loaded...
188 */
189
190 for (vmap = vmap_cache; vmap; vmap = vmap->next)
191 {
192 if (vmap->encoding == encoding)
193 {
194 if (vmap->used > 0)
195 vmap->used --;
196 break;
197 }
198 }
199
200 #ifdef HAVE_PTHREAD_H
201 pthread_mutex_unlock(&map_mutex);
202 #endif /* HAVE_PTHREAD_H */
203 }
204
205
206 /*
207 * '_cupsCharmapGet()' - Get a character set map.
208 *
209 * This code handles single-byte (SBCS), double-byte (DBCS), and
210 * variable-byte (VBCS) character sets _without_ charset escapes...
211 * This code does not handle multiple-byte character sets (MBCS)
212 * (such as ISO-2022-JP) with charset switching via escapes...
213 */
214
215 void * /* O - Charset map pointer */
216 _cupsCharmapGet(
217 const cups_encoding_t encoding) /* I - Encoding */
218 {
219 void *charmap; /* Charset map pointer */
220
221
222 DEBUG_printf(("_cupsCharmapGet(encoding=%d)\n", encoding));
223
224 /*
225 * Check for valid arguments...
226 */
227
228 if (encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
229 {
230 DEBUG_puts(" Bad encoding, returning NULL!");
231 return (NULL);
232 }
233
234 /*
235 * Lookup or get the charset map pointer and return...
236 */
237
238 #ifdef HAVE_PTHREAD_H
239 pthread_mutex_lock(&map_mutex);
240 #endif /* HAVE_PTHREAD_H */
241
242 charmap = get_charmap(encoding);
243
244 #ifdef HAVE_PTHREAD_H
245 pthread_mutex_unlock(&map_mutex);
246 #endif /* HAVE_PTHREAD_H */
247
248 return (charmap);
249 }
250
251
252 /*
253 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
254 *
255 * This code handles single-byte (SBCS), double-byte (DBCS), and
256 * variable-byte (VBCS) character sets _without_ charset escapes...
257 * This code does not handle multiple-byte character sets (MBCS)
258 * (such as ISO-2022-JP) with charset switching via escapes...
259 */
260
261 int /* O - Count or -1 on error */
262 cupsCharsetToUTF8(
263 cups_utf8_t *dest, /* O - Target string */
264 const char *src, /* I - Source string */
265 const int maxout, /* I - Max output */
266 const cups_encoding_t encoding) /* I - Encoding */
267 {
268 int bytes; /* Number of bytes converted */
269
270
271 /*
272 * Check for valid arguments...
273 */
274
275 DEBUG_printf(("cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)\n",
276 dest, src, maxout, encoding));
277
278 if (dest)
279 *dest = '\0';
280
281 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
282 {
283 DEBUG_puts(" Bad arguments, returning -1");
284 return (-1);
285 }
286
287 /*
288 * Handle identity conversions...
289 */
290
291 if (encoding == CUPS_UTF8 ||
292 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
293 {
294 strlcpy((char *)dest, src, maxout);
295 return (strlen((char *)dest));
296 }
297
298 /*
299 * Convert input legacy charset to UTF-8...
300 */
301
302 #ifdef HAVE_PTHREAD_H
303 pthread_mutex_lock(&map_mutex);
304 #endif /* HAVE_PTHREAD_H */
305
306 if (encoding < CUPS_ENCODING_SBCS_END)
307 bytes = conv_sbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
308 else if (encoding < CUPS_ENCODING_VBCS_END)
309 bytes = conv_vbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
310 else
311 {
312 DEBUG_puts(" Bad encoding, returning -1");
313 bytes = -1;
314 }
315
316 #ifdef HAVE_PTHREAD_H
317 pthread_mutex_unlock(&map_mutex);
318 #endif /* HAVE_PTHREAD_H */
319
320 return (bytes);
321 }
322
323
324 /*
325 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
326 *
327 * This code handles single-byte (SBCS), double-byte (DBCS), and
328 * variable-byte (VBCS) character sets _without_ charset escapes...
329 * This code does not handle multiple-byte character sets (MBCS)
330 * (such as ISO-2022-JP) with charset switching via escapes...
331 */
332
333 int /* O - Count or -1 on error */
334 cupsUTF8ToCharset(
335 char *dest, /* O - Target string */
336 const cups_utf8_t *src, /* I - Source string */
337 const int maxout, /* I - Max output */
338 const cups_encoding_t encoding) /* I - Encoding */
339 {
340 int bytes; /* Number of bytes converted */
341
342
343 /*
344 * Check for valid arguments...
345 */
346
347 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
348 {
349 if (dest)
350 *dest = '\0';
351
352 return (-1);
353 }
354
355 /*
356 * Handle identity conversions...
357 */
358
359 if (encoding == CUPS_UTF8 ||
360 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
361 {
362 strlcpy(dest, (char *)src, maxout);
363 return (strlen(dest));
364 }
365
366 /*
367 * Convert input UTF-8 to legacy charset...
368 */
369
370 #ifdef HAVE_PTHREAD_H
371 pthread_mutex_lock(&map_mutex);
372 #endif /* HAVE_PTHREAD_H */
373
374 if (encoding < CUPS_ENCODING_SBCS_END)
375 bytes = conv_utf8_to_sbcs((cups_sbcs_t *)dest, src, maxout, encoding);
376 else if (encoding < CUPS_ENCODING_VBCS_END)
377 bytes = conv_utf8_to_vbcs((cups_sbcs_t *)dest, src, maxout, encoding);
378 else
379 bytes = -1;
380
381 #ifdef HAVE_PTHREAD_H
382 pthread_mutex_unlock(&map_mutex);
383 #endif /* HAVE_PTHREAD_H */
384
385 return (bytes);
386 }
387
388
389 /*
390 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
391 *
392 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
393 *
394 * UTF-32 char UTF-8 char(s)
395 * --------------------------------------------------
396 * 0 to 127 = 0xxxxxxx (US-ASCII)
397 * 128 to 2047 = 110xxxxx 10yyyyyy
398 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
399 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
400 *
401 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
402 * which would convert to five- or six-octet UTF-8 sequences...
403 */
404
405 int /* O - Count or -1 on error */
406 cupsUTF8ToUTF32(
407 cups_utf32_t *dest, /* O - Target string */
408 const cups_utf8_t *src, /* I - Source string */
409 const int maxout) /* I - Max output */
410 {
411 size_t srclen; /* Source string length */
412 int i; /* Looping variable */
413 cups_utf8_t ch; /* Character value */
414 cups_utf8_t next; /* Next character value */
415 cups_utf32_t ch32; /* UTF-32 character value */
416
417
418 /*
419 * Check for valid arguments and clear output...
420 */
421
422 if (dest)
423 *dest = 0;
424
425 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
426 return (-1);
427
428 /*
429 * Convert input UTF-8 to output UTF-32 (and insert BOM)...
430 */
431
432 *dest++ = 0xfeff;
433 srclen = strlen((char *)src);
434
435 for (i = maxout - 1; *src && i > 0; i --)
436 {
437 ch = *src++;
438
439 /*
440 * Convert UTF-8 character(s) to UTF-32 character...
441 */
442
443 if (!(ch & 0x80))
444 {
445 /*
446 * One-octet UTF-8 <= 127 (US-ASCII)...
447 */
448
449 *dest++ = ch;
450 }
451 else if ((ch & 0xe0) == 0xc0)
452 {
453 /*
454 * Two-octet UTF-8 <= 2047 (Latin-x)...
455 */
456
457 next = *src++;
458 if (!next)
459 return (-1);
460
461 ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
462
463 /*
464 * Check for non-shortest form (invalid UTF-8)...
465 */
466
467 if (ch32 < 0x80)
468 return (-1);
469
470 *dest++ = ch32;
471 }
472 else if ((ch & 0xf0) == 0xe0)
473 {
474 /*
475 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
476 */
477
478 next = *src++;
479 if (!next)
480 return (-1);
481
482 ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
483
484 next = *src++;
485 if (!next)
486 return (-1);
487
488 ch32 = (ch32 << 6) | (next & 0x3f);
489
490 /*
491 * Check for non-shortest form (invalid UTF-8)...
492 */
493
494 if (ch32 < 0x800)
495 return (-1);
496
497 *dest++ = ch32;
498 }
499 else if ((ch & 0xf8) == 0xf0)
500 {
501 /*
502 * Four-octet UTF-8...
503 */
504
505 next = *src++;
506 if (!next)
507 return (-1);
508
509 ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
510
511 next = *src++;
512 if (!next)
513 return (-1);
514
515 ch32 = (ch32 << 6) | (next & 0x3f);
516
517 next = *src++;
518 if (!next)
519 return (-1);
520
521 ch32 = (ch32 << 6) | (next & 0x3f);
522
523 /*
524 * Check for non-shortest form (invalid UTF-8)...
525 */
526
527 if (ch32 < 0x10000)
528 return (-1);
529
530 *dest++ = ch32;
531 }
532 else
533 {
534 /*
535 * More than 4-octet (invalid UTF-8 sequence)...
536 */
537
538 return (-1);
539 }
540
541 /*
542 * Check for UTF-16 surrogate (illegal UTF-8)...
543 */
544
545 if (*dest >= 0xd800 && *dest <= 0xdfff)
546 return (-1);
547 }
548
549 *dest = 0;
550
551 return (i);
552 }
553
554
555 /*
556 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
557 *
558 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
559 *
560 * UTF-32 char UTF-8 char(s)
561 * --------------------------------------------------
562 * 0 to 127 = 0xxxxxxx (US-ASCII)
563 * 128 to 2047 = 110xxxxx 10yyyyyy
564 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
565 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
566 *
567 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
568 * which would convert to five- or six-octet UTF-8 sequences...
569 */
570
571 int /* O - Count or -1 on error */
572 cupsUTF32ToUTF8(
573 cups_utf8_t *dest, /* O - Target string */
574 const cups_utf32_t *src, /* I - Source string */
575 const int maxout) /* I - Max output */
576 {
577 cups_utf8_t *start; /* Start of destination string */
578 int i; /* Looping variable */
579 int swap; /* Byte-swap input to output */
580 cups_utf32_t ch; /* Character value */
581
582
583 /*
584 * Check for valid arguments and clear output...
585 */
586
587 if (dest)
588 *dest = '\0';
589
590 if (!dest || !src || maxout < 1)
591 return (-1);
592
593 /*
594 * Check for leading BOM in UTF-32 and inverted BOM...
595 */
596
597 start = dest;
598 swap = *src == 0xfffe0000;
599
600 if (*src == 0xfffe0000 || *src == 0xfeff)
601 src ++;
602
603 /*
604 * Convert input UTF-32 to output UTF-8...
605 */
606
607 for (i = maxout - 1; *src && i > 0;)
608 {
609 ch = *src++;
610
611 /*
612 * Byte swap input UTF-32, if necessary...
613 * (only byte-swapping 24 of 32 bits)
614 */
615
616 if (swap)
617 ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
618
619 /*
620 * Check for beyond Plane 16 (invalid UTF-32)...
621 */
622
623 if (ch > 0x10ffff)
624 return (-1);
625
626 /*
627 * Convert UTF-32 character to UTF-8 character(s)...
628 */
629
630 if (ch < 0x80)
631 {
632 /*
633 * One-octet UTF-8 <= 127 (US-ASCII)...
634 */
635
636 *dest++ = (cups_utf8_t)ch;
637 i --;
638 }
639 else if (ch < 0x800)
640 {
641 /*
642 * Two-octet UTF-8 <= 2047 (Latin-x)...
643 */
644
645 if (i < 2)
646 return (-1);
647
648 *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
649 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
650 i -= 2;
651 }
652 else if (ch < 0x10000)
653 {
654 /*
655 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
656 */
657
658 if (i < 3)
659 return (-1);
660
661 *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
662 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
663 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
664 i -= 3;
665 }
666 else
667 {
668 /*
669 * Four-octet UTF-8...
670 */
671
672 if (i < 4)
673 return (-1);
674
675 *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
676 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
677 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
678 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
679 i -= 4;
680 }
681 }
682
683 *dest = '\0';
684
685 return ((int)(dest - start));
686 }
687
688
689 /*
690 * 'compare_wide()' - Compare key for wide (VBCS) match.
691 */
692
693 static int
694 compare_wide(const void *k1, /* I - Key char */
695 const void *k2) /* I - Map char */
696 {
697 cups_vbcs_t key; /* Legacy key character */
698 cups_vbcs_t map; /* Legacy map character */
699
700
701 key = *((cups_vbcs_t *)k1);
702 map = ((_cups_wide2uni_t *)k2)->widechar;
703
704 return ((int)(key - map));
705 }
706
707
708 /*
709 * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
710 */
711
712 static int /* O - Count or -1 on error */
713 conv_sbcs_to_utf8(
714 cups_utf8_t *dest, /* O - Target string */
715 const cups_sbcs_t *src, /* I - Source string */
716 int maxout, /* I - Max output */
717 const cups_encoding_t encoding) /* I - Encoding */
718 {
719 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
720 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
721 cups_sbcs_t legchar; /* Legacy character value */
722 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
723 *workptr; /* Pointer into string */
724
725
726 /*
727 * Find legacy charset map in cache...
728 */
729
730 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
731 return (-1);
732
733 /*
734 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
735 */
736
737 work[0] = 0xfeff;
738 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
739 {
740 legchar = *src++;
741
742 /*
743 * Convert ASCII verbatim (optimization)...
744 */
745
746 if (legchar < 0x80)
747 *workptr++ = (cups_utf32_t)legchar;
748 else
749 {
750 /*
751 * Convert unknown character to Replacement Character...
752 */
753
754 crow = cmap->char2uni + legchar;
755
756 if (!*crow)
757 *workptr++ = 0xfffd;
758 else
759 *workptr++ = (cups_utf32_t)*crow;
760 }
761 }
762
763 *workptr = 0;
764
765 /*
766 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
767 */
768
769 cmap->used --;
770
771 return (cupsUTF32ToUTF8(dest, work, maxout));
772 }
773
774
775 /*
776 * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
777 */
778
779 static int /* O - Count or -1 on error */
780 conv_utf8_to_sbcs(
781 cups_sbcs_t *dest, /* O - Target string */
782 const cups_utf8_t *src, /* I - Source string */
783 int maxout, /* I - Max output */
784 const cups_encoding_t encoding) /* I - Encoding */
785 {
786 cups_sbcs_t *start; /* Start of destination string */
787 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
788 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
789 cups_utf32_t unichar; /* Character value */
790 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
791 *workptr; /* Pointer into string */
792
793
794 /*
795 * Find legacy charset map in cache...
796 */
797
798 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
799 return (-1);
800
801 /*
802 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
803 */
804
805 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
806 return (-1);
807
808 /*
809 * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
810 */
811
812 for (workptr = work + 1, start = dest; *workptr && maxout > 1; maxout --)
813 {
814 unichar = *workptr++;
815 if (!unichar)
816 break;
817
818 /*
819 * Convert ASCII verbatim (optimization)...
820 */
821
822 if (unichar < 0x80)
823 {
824 *dest++ = (cups_sbcs_t)unichar;
825 continue;
826 }
827
828 /*
829 * Convert unknown character to visible replacement...
830 */
831
832 srow = cmap->uni2char[(int)((unichar >> 8) & 0xff)];
833
834 if (srow)
835 srow += (int)(unichar & 0xff);
836
837 if (!srow || !*srow)
838 *dest++ = '?';
839 else
840 *dest++ = *srow;
841 }
842
843 *dest = '\0';
844
845 cmap->used --;
846
847 return ((int)(dest - start));
848 }
849
850
851 /*
852 * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
853 */
854
855 static int /* O - Count or -1 on error */
856 conv_utf8_to_vbcs(
857 cups_sbcs_t *dest, /* O - Target string */
858 const cups_utf8_t *src, /* I - Source string */
859 int maxout, /* I - Max output */
860 const cups_encoding_t encoding) /* I - Encoding */
861 {
862 cups_sbcs_t *start; /* Start of destination string */
863 _cups_vmap_t *vmap; /* Legacy DBCS / Unicode Charset Map */
864 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
865 cups_utf32_t unichar; /* Character value */
866 cups_vbcs_t legchar; /* Legacy character value */
867 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
868 *workptr; /* Pointer into string */
869
870
871 /*
872 * Find legacy charset map in cache...
873 */
874
875 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
876 return (-1);
877
878 /*
879 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
880 */
881
882 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
883 return (-1);
884
885 /*
886 * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
887 */
888
889 for (start = dest, workptr = work + 1; *workptr && maxout > 1; maxout --)
890 {
891 unichar = *workptr++;
892 if (!unichar)
893 break;
894
895 /*
896 * Convert ASCII verbatim (optimization)...
897 */
898
899 if (unichar < 0x80)
900 {
901 *dest++ = (cups_vbcs_t)unichar;
902 continue;
903 }
904
905 /*
906 * Convert unknown character to visible replacement...
907 */
908
909 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
910
911 if (vrow)
912 vrow += (int)(unichar & 0xff);
913
914 if (!vrow || !*vrow)
915 legchar = (cups_vbcs_t)'?';
916 else
917 legchar = (cups_vbcs_t)*vrow;
918
919 /*
920 * Save n-byte legacy character...
921 */
922
923 if (legchar > 0xffffff)
924 {
925 if (maxout < 5)
926 return (-1);
927
928 *dest++ = (cups_sbcs_t)(legchar >> 24);
929 *dest++ = (cups_sbcs_t)(legchar >> 16);
930 *dest++ = (cups_sbcs_t)(legchar >> 8);
931 *dest++ = (cups_sbcs_t)legchar;
932
933 maxout -= 3;
934 }
935 else if (legchar > 0xffff)
936 {
937 if (maxout < 4)
938 return (-1);
939
940 *dest++ = (cups_sbcs_t)(legchar >> 16);
941 *dest++ = (cups_sbcs_t)(legchar >> 8);
942 *dest++ = (cups_sbcs_t)legchar;
943
944 maxout -= 2;
945 }
946 else if (legchar > 0xff)
947 {
948 *dest++ = (cups_sbcs_t)(legchar >> 8);
949 *dest++ = (cups_sbcs_t)legchar;
950
951 maxout --;
952 }
953 }
954
955 *dest = '\0';
956
957 vmap->used --;
958
959 return ((int)(dest - start));
960 }
961
962
963 /*
964 * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
965 */
966
967 static int /* O - Count or -1 on error */
968 conv_vbcs_to_utf8(
969 cups_utf8_t *dest, /* O - Target string */
970 const cups_sbcs_t *src, /* I - Source string */
971 int maxout, /* I - Max output */
972 const cups_encoding_t encoding) /* I - Encoding */
973 {
974 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
975 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
976 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
977 cups_sbcs_t leadchar; /* Lead char of n-byte legacy char */
978 cups_vbcs_t legchar; /* Legacy character value */
979 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
980 *workptr; /* Pointer into string */
981
982
983 /*
984 * Find legacy charset map in cache...
985 */
986
987 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
988 return (-1);
989
990 /*
991 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
992 */
993
994 work[0] = 0xfeff;
995 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
996 {
997 legchar = *src++;
998 leadchar = (cups_sbcs_t)legchar;
999
1000 /*
1001 * Convert ASCII verbatim (optimization)...
1002 */
1003
1004 if (legchar < 0x80)
1005 {
1006 *workptr++ = (cups_utf32_t)legchar;
1007 continue;
1008 }
1009
1010 /*
1011 * Convert 2-byte legacy character...
1012 */
1013
1014 if (vmap->lead2char[(int)leadchar] == leadchar)
1015 {
1016 if (!*src)
1017 return (-1);
1018
1019 legchar = (legchar << 8) | *src++;
1020
1021 /*
1022 * Convert unknown character to Replacement Character...
1023 */
1024
1025 crow = vmap->char2uni[(int)((legchar >> 8) & 0xff)];
1026 if (crow)
1027 crow += (int) (legchar & 0xff);
1028
1029 if (!crow || !*crow)
1030 *workptr++ = 0xfffd;
1031 else
1032 *workptr++ = (cups_utf32_t)*crow;
1033 continue;
1034 }
1035
1036 /*
1037 * Fetch 3-byte or 4-byte legacy character...
1038 */
1039
1040 if (vmap->lead3char[(int)leadchar] == leadchar)
1041 {
1042 if (!*src || !src[1])
1043 return (-1);
1044
1045 legchar = (legchar << 8) | *src++;
1046 legchar = (legchar << 8) | *src++;
1047 }
1048 else if (vmap->lead4char[(int)leadchar] == leadchar)
1049 {
1050 if (!*src || !src[1] || !src[2])
1051 return (-1);
1052
1053 legchar = (legchar << 8) | *src++;
1054 legchar = (legchar << 8) | *src++;
1055 legchar = (legchar << 8) | *src++;
1056 }
1057 else
1058 return (-1);
1059
1060 /*
1061 * Find 3-byte or 4-byte legacy character...
1062 */
1063
1064 wide2uni = (_cups_wide2uni_t *)bsearch(&legchar,
1065 vmap->wide2uni,
1066 vmap->widecount,
1067 sizeof(_cups_wide2uni_t),
1068 compare_wide);
1069
1070 /*
1071 * Convert unknown character to Replacement Character...
1072 */
1073
1074 if (!wide2uni || !wide2uni->unichar)
1075 *workptr++ = 0xfffd;
1076 else
1077 *workptr++ = wide2uni->unichar;
1078 }
1079
1080 *workptr = 0;
1081
1082 vmap->used --;
1083
1084 /*
1085 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
1086 */
1087
1088 return (cupsUTF32ToUTF8(dest, work, maxout));
1089 }
1090
1091
1092 /*
1093 * 'free_sbcs_charmap()' - Free memory used by a single byte character set.
1094 */
1095
1096 static void
1097 free_sbcs_charmap(_cups_cmap_t *cmap) /* I - Character set */
1098 {
1099 int i; /* Looping variable */
1100
1101
1102 for (i = 0; i < 256; i ++)
1103 if (cmap->uni2char[i])
1104 free(cmap->uni2char[i]);
1105
1106 free(cmap);
1107 }
1108
1109
1110 /*
1111 * 'free_vbcs_charmap()' - Free memory used by a variable byte character set.
1112 */
1113
1114 static void
1115 free_vbcs_charmap(_cups_vmap_t *vmap) /* I - Character set */
1116 {
1117 int i; /* Looping variable */
1118
1119
1120 for (i = 0; i < 256; i ++)
1121 if (vmap->char2uni[i])
1122 free(vmap->char2uni[i]);
1123
1124 for (i = 0; i < 256; i ++)
1125 if (vmap->uni2char[i])
1126 free(vmap->uni2char[i]);
1127
1128 if (vmap->wide2uni)
1129 free(vmap->wide2uni);
1130
1131 free(vmap);
1132 }
1133
1134
1135 /*
1136 * 'get_charmap()' - Lookup or get a character set map (private).
1137 *
1138 * This code handles single-byte (SBCS), double-byte (DBCS), and
1139 * variable-byte (VBCS) character sets _without_ charset escapes...
1140 * This code does not handle multiple-byte character sets (MBCS)
1141 * (such as ISO-2022-JP) with charset switching via escapes...
1142 */
1143
1144
1145 void * /* O - Charset map pointer */
1146 get_charmap(
1147 const cups_encoding_t encoding) /* I - Encoding */
1148 {
1149 char filename[1024]; /* Filename for charset map file */
1150 _cups_globals_t *cg = _cupsGlobals(); /* Global data */
1151
1152
1153 /*
1154 * Get the data directory and charset map name...
1155 */
1156
1157 snprintf(filename, sizeof(filename), "%s/charmaps/%s.txt",
1158 cg->cups_datadir, _cupsEncodingName(encoding));
1159
1160 DEBUG_printf((" filename=\"%s\"\n", filename));
1161
1162 /*
1163 * Read charset map input file into cache...
1164 */
1165
1166 if (encoding < CUPS_ENCODING_SBCS_END)
1167 return (get_sbcs_charmap(encoding, filename));
1168 else if (encoding < CUPS_ENCODING_VBCS_END)
1169 return (get_vbcs_charmap(encoding, filename));
1170 else
1171 return (NULL);
1172 }
1173
1174
1175 /*
1176 * 'get_charmap_count()' - Count lines in a charmap file.
1177 */
1178
1179 static int /* O - Count or -1 on error */
1180 get_charmap_count(cups_file_t *fp) /* I - File to read from */
1181 {
1182 int count; /* Number of lines */
1183 char line[256]; /* Line from input map file */
1184
1185
1186 /*
1187 * Count lines in map input file...
1188 */
1189
1190 count = 0;
1191
1192 while (cupsFileGets(fp, line, sizeof(line)))
1193 if (line[0] == '0')
1194 count ++;
1195
1196 /*
1197 * Return the number of lines...
1198 */
1199
1200 if (count > 0)
1201 return (count);
1202 else
1203 return (-1);
1204 }
1205
1206
1207 /*
1208 * 'get_sbcs_charmap()' - Get SBCS Charmap.
1209 */
1210
1211 static _cups_cmap_t * /* O - Charmap or 0 on error */
1212 get_sbcs_charmap(
1213 const cups_encoding_t encoding, /* I - Charmap Encoding */
1214 const char *filename) /* I - Charmap Filename */
1215 {
1216 unsigned long legchar; /* Legacy character value */
1217 cups_utf32_t unichar; /* Unicode character value */
1218 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
1219 cups_file_t *fp; /* Charset map file pointer */
1220 char *s; /* Line parsing pointer */
1221 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1222 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
1223 char line[256]; /* Line from charset map file */
1224
1225
1226 /*
1227 * See if we already have this SBCS charset map loaded...
1228 */
1229
1230 for (cmap = cmap_cache; cmap; cmap = cmap->next)
1231 {
1232 if (cmap->encoding == encoding)
1233 {
1234 cmap->used ++;
1235 DEBUG_printf((" returning existing cmap=%p\n", cmap));
1236
1237 return ((void *)cmap);
1238 }
1239 }
1240
1241 /*
1242 * Open SBCS charset map input file...
1243 */
1244
1245 if ((fp = cupsFileOpen(filename, "r")) == NULL)
1246 return (NULL);
1247
1248 /*
1249 * Allocate memory for SBCS charset map...
1250 */
1251
1252 if ((cmap = (_cups_cmap_t *)calloc(1, sizeof(_cups_cmap_t))) == NULL)
1253 {
1254 cupsFileClose(fp);
1255 DEBUG_puts(" Unable to allocate memory!");
1256
1257 return (NULL);
1258 }
1259
1260 cmap->used ++;
1261 cmap->encoding = encoding;
1262
1263 /*
1264 * Save SBCS charset map into memory for transcoding...
1265 */
1266
1267 while (cupsFileGets(fp, line, sizeof(line)))
1268 {
1269 if (line[0] != '0')
1270 continue;
1271
1272 legchar = strtol(line, &s, 16);
1273 if (legchar < 0 || legchar > 0xff)
1274 goto sbcs_error;
1275
1276 unichar = strtol(s, NULL, 16);
1277 if (unichar < 0 || unichar > 0xffff)
1278 goto sbcs_error;
1279
1280 /*
1281 * Save legacy to Unicode mapping in direct lookup table...
1282 */
1283
1284 crow = cmap->char2uni + legchar;
1285 *crow = (cups_ucs2_t)(unichar & 0xffff);
1286
1287 /*
1288 * Save Unicode to legacy mapping in indirect lookup table...
1289 */
1290
1291 srow = cmap->uni2char[(unichar >> 8) & 0xff];
1292 if (!srow)
1293 {
1294 srow = (cups_sbcs_t *)calloc(256, sizeof(cups_sbcs_t));
1295 if (!srow)
1296 goto sbcs_error;
1297
1298 cmap->uni2char[(unichar >> 8) & 0xff] = srow;
1299 }
1300
1301 srow += unichar & 0xff;
1302
1303 /*
1304 * Convert Replacement Character to visible replacement...
1305 */
1306
1307 if (unichar == 0xfffd)
1308 legchar = (unsigned long)'?';
1309
1310 /*
1311 * First (oldest) legacy character uses Unicode mapping cell...
1312 */
1313
1314 if (!*srow)
1315 *srow = (cups_sbcs_t)legchar;
1316 }
1317
1318 cupsFileClose(fp);
1319
1320 /*
1321 * Add it to the cache and return...
1322 */
1323
1324 cmap->next = cmap_cache;
1325 cmap_cache = cmap;
1326
1327 DEBUG_printf((" returning new cmap=%p\n", cmap));
1328
1329 return (cmap);
1330
1331 /*
1332 * If we get here, there was an error in the cmap file...
1333 */
1334
1335 sbcs_error:
1336
1337 free_sbcs_charmap(cmap);
1338
1339 cupsFileClose(fp);
1340
1341 DEBUG_puts(" Error, returning NULL!");
1342
1343 return (NULL);
1344 }
1345
1346
1347 /*
1348 * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
1349 */
1350
1351 static _cups_vmap_t * /* O - Charmap or 0 on error */
1352 get_vbcs_charmap(
1353 const cups_encoding_t encoding, /* I - Charmap Encoding */
1354 const char *filename) /* I - Charmap Filename */
1355 {
1356 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
1357 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1358 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
1359 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
1360 cups_sbcs_t leadchar; /* Lead char of 2-byte legacy char */
1361 unsigned long legchar; /* Legacy character value */
1362 cups_utf32_t unichar; /* Unicode character value */
1363 int mapcount; /* Count of lines in charmap file */
1364 cups_file_t *fp; /* Charset map file pointer */
1365 char *s; /* Line parsing pointer */
1366 char line[256]; /* Line from charset map file */
1367 int i; /* Loop variable */
1368 int wide; /* 32-bit legacy char */
1369
1370
1371 DEBUG_printf(("get_vbcs_charmap(encoding=%d, filename=\"%s\")\n",
1372 encoding, filename));
1373
1374 /*
1375 * See if we already have this DBCS/VBCS charset map loaded...
1376 */
1377
1378 for (vmap = vmap_cache; vmap; vmap = vmap->next)
1379 {
1380 if (vmap->encoding == encoding)
1381 {
1382 vmap->used ++;
1383 DEBUG_printf((" returning existing vmap=%p\n", vmap));
1384
1385 return ((void *)vmap);
1386 }
1387 }
1388
1389 /*
1390 * Open VBCS charset map input file...
1391 */
1392
1393 if ((fp = cupsFileOpen(filename, "r")) == NULL)
1394 {
1395 DEBUG_printf((" Unable to open file: %s\n", strerror(errno)));
1396
1397 return (NULL);
1398 }
1399
1400 /*
1401 * Count lines in charmap file...
1402 */
1403
1404 if ((mapcount = get_charmap_count(fp)) <= 0)
1405 {
1406 DEBUG_puts(" Unable to get charmap count!");
1407
1408 return (NULL);
1409 }
1410
1411 DEBUG_printf((" mapcount=%d\n", mapcount));
1412
1413 /*
1414 * Allocate memory for DBCS/VBCS charset map...
1415 */
1416
1417 if ((vmap = (_cups_vmap_t *)calloc(1, sizeof(_cups_vmap_t))) == NULL)
1418 {
1419 cupsFileClose(fp);
1420 DEBUG_puts(" Unable to allocate memory!");
1421
1422 return (NULL);
1423 }
1424
1425 vmap->used ++;
1426 vmap->encoding = encoding;
1427
1428 /*
1429 * Save DBCS/VBCS charset map into memory for transcoding...
1430 */
1431
1432 leadchar = 0;
1433 wide2uni = NULL;
1434
1435 cupsFileRewind(fp);
1436
1437 i = 0;
1438 wide = 0;
1439
1440 while (cupsFileGets(fp, line, sizeof(line)))
1441 {
1442 if (line[0] != '0')
1443 continue;
1444
1445 legchar = strtoul(line, &s, 16);
1446 if (legchar == ULONG_MAX)
1447 goto vbcs_error;
1448
1449 unichar = strtol(s, NULL, 16);
1450 if (unichar < 0 || unichar > 0xffff)
1451 goto vbcs_error;
1452
1453 i ++;
1454
1455 /* DEBUG_printf((" i=%d, legchar=0x%08lx, unichar=0x%04x\n", i,
1456 legchar, (unsigned)unichar)); */
1457
1458 /*
1459 * Save lead char of 2/3/4-byte legacy char...
1460 */
1461
1462 if (legchar > 0xff && legchar <= 0xffff)
1463 {
1464 leadchar = (cups_sbcs_t)(legchar >> 8);
1465 vmap->lead2char[leadchar] = leadchar;
1466 }
1467
1468 if (legchar > 0xffff && legchar <= 0xffffff)
1469 {
1470 leadchar = (cups_sbcs_t)(legchar >> 16);
1471 vmap->lead3char[leadchar] = leadchar;
1472 }
1473
1474 if (legchar > 0xffffff)
1475 {
1476 leadchar = (cups_sbcs_t)(legchar >> 24);
1477 vmap->lead4char[leadchar] = leadchar;
1478 }
1479
1480 /*
1481 * Save Legacy to Unicode mapping...
1482 */
1483
1484 if (legchar <= 0xffff)
1485 {
1486 /*
1487 * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
1488 */
1489
1490 crow = vmap->char2uni[(int)leadchar];
1491 if (!crow)
1492 {
1493 crow = (cups_ucs2_t *)calloc(256, sizeof(cups_ucs2_t));
1494 if (!crow)
1495 goto vbcs_error;
1496
1497 vmap->char2uni[(int)leadchar] = crow;
1498 }
1499
1500 crow[(int)(legchar & 0xff)] = (cups_ucs2_t)unichar;
1501 }
1502 else
1503 {
1504 /*
1505 * Save VBCS 32-bit to Unicode mapping in sorted list table...
1506 */
1507
1508 if (!wide)
1509 {
1510 wide = 1;
1511 vmap->widecount = (mapcount - i + 1);
1512 wide2uni = (_cups_wide2uni_t *)calloc(vmap->widecount,
1513 sizeof(_cups_wide2uni_t));
1514 if (!wide2uni)
1515 goto vbcs_error;
1516
1517 vmap->wide2uni = wide2uni;
1518 }
1519
1520 wide2uni->widechar = (cups_vbcs_t)legchar;
1521 wide2uni->unichar = (cups_ucs2_t)unichar;
1522 wide2uni ++;
1523 }
1524
1525 /*
1526 * Save Unicode to legacy mapping in indirect lookup table...
1527 */
1528
1529 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
1530 if (!vrow)
1531 {
1532 vrow = (cups_vbcs_t *)calloc(256, sizeof(cups_vbcs_t));
1533 if (!vrow)
1534 goto vbcs_error;
1535
1536 vmap->uni2char[(int) ((unichar >> 8) & 0xff)] = vrow;
1537 }
1538
1539 vrow += (int)(unichar & 0xff);
1540
1541 /*
1542 * Convert Replacement Character to visible replacement...
1543 */
1544
1545 if (unichar == 0xfffd)
1546 legchar = (unsigned long)'?';
1547
1548 /*
1549 * First (oldest) legacy character uses Unicode mapping cell...
1550 */
1551
1552 if (!*vrow)
1553 *vrow = (cups_vbcs_t)legchar;
1554 }
1555
1556 vmap->charcount = (i - vmap->widecount);
1557
1558 cupsFileClose(fp);
1559
1560 /*
1561 * Add it to the cache and return...
1562 */
1563
1564 vmap->next = vmap_cache;
1565 vmap_cache = vmap;
1566
1567 DEBUG_printf((" returning new vmap=%p\n", vmap));
1568
1569 return (vmap);
1570
1571 /*
1572 * If we get here, the file contains errors...
1573 */
1574
1575 vbcs_error:
1576
1577 free_vbcs_charmap(vmap);
1578
1579 cupsFileClose(fp);
1580
1581 DEBUG_puts(" Error, returning NULL!");
1582
1583 return (NULL);
1584 }
1585
1586
1587 /*
1588 * End of "$Id$"
1589 */