2 * "$Id: transcode.c 6649 2007-07-11 21:46:42Z mike $"
4 * Transcoding support for the Common UNIX Printing System (CUPS).
6 * Copyright 2007 by Apple Inc.
7 * Copyright 1997-2007 by Easy Software Products.
9 * These coded instructions, statements, and computer programs are the
10 * property of Apple Inc. and are protected by Federal copyright
11 * law. Distribution and use rights are outlined in the file "LICENSE.txt"
12 * which should have been included with this file. If this file is
13 * missing or damaged, see the license at "http://www.cups.org/".
15 * This file is subject to the Apple OS-Developed Software exception.
19 * _cupsCharmapFlush() - Flush all character set maps out of cache.
20 * _cupsCharmapFree() - Free a character set map.
21 * _cupsCharmapGet() - Get a character set map.
22 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
23 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
24 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
25 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
26 * compare_wide() - Compare key for wide (VBCS) match.
27 * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
28 * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
29 * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
30 * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
31 * free_sbcs_charmap() - Free memory used by a single byte character set.
32 * free_vbcs_charmap() - Free memory used by a variable byte character set.
33 * get_charmap() - Lookup or get a character set map (private).
34 * get_charmap_count() - Count lines in a charmap file.
35 * get_sbcs_charmap() - Get SBCS Charmap.
36 * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
40 * Include necessary headers...
56 static pthread_mutex_t map_mutex
= PTHREAD_MUTEX_INITIALIZER
;
57 /* Mutex to control access to maps */
58 #endif /* HAVE_PTHREAD_H */
59 static _cups_cmap_t
*cmap_cache
= NULL
;
60 /* SBCS Charmap Cache - head of the list of loaded SBCS maps, linked via ->next */
61 static _cups_vmap_t
*vmap_cache
= NULL
;
62 /* VBCS Charmap Cache - head of the list of loaded DBCS/VBCS maps, linked via ->next */
69 static int compare_wide(const void *k1
, const void *k2
);
70 static int conv_sbcs_to_utf8(cups_utf8_t
*dest
,
71 const cups_sbcs_t
*src
,
73 const cups_encoding_t encoding
);
74 static int conv_utf8_to_sbcs(cups_sbcs_t
*dest
,
75 const cups_utf8_t
*src
,
77 const cups_encoding_t encoding
);
78 static int conv_utf8_to_vbcs(cups_sbcs_t
*dest
,
79 const cups_utf8_t
*src
,
81 const cups_encoding_t encoding
);
82 static int conv_vbcs_to_utf8(cups_utf8_t
*dest
,
83 const cups_sbcs_t
*src
,
85 const cups_encoding_t encoding
);
86 static void free_sbcs_charmap(_cups_cmap_t
*sbcs
);
87 static void free_vbcs_charmap(_cups_vmap_t
*vbcs
);
88 static void *get_charmap(const cups_encoding_t encoding
);
89 static int get_charmap_count(cups_file_t
*fp
);
90 static _cups_cmap_t
*get_sbcs_charmap(const cups_encoding_t encoding
,
91 const char *filename
);
92 static _cups_vmap_t
*get_vbcs_charmap(const cups_encoding_t encoding
,
93 const char *filename
);
97 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
101 _cupsCharmapFlush(void)
103 _cups_cmap_t
*cmap
, /* Legacy SBCS / Unicode Charset Map */
104 *cnext
; /* Next Legacy SBCS Charset Map */
105 _cups_vmap_t
*vmap
, /* Legacy VBCS / Unicode Charset Map */
106 *vnext
; /* Next Legacy VBCS Charset Map */
109 #ifdef HAVE_PTHREAD_H
110 pthread_mutex_lock(&map_mutex
);
111 #endif /* HAVE_PTHREAD_H */
114 * Loop through SBCS charset map cache, free all memory...
117 for (cmap
= cmap_cache
; cmap
; cmap
= cnext
)
121 free_sbcs_charmap(cmap
);
127 * Loop through DBCS/VBCS charset map cache, free all memory...
130 for (vmap
= vmap_cache
; vmap
; vmap
= vnext
)
134 free_vbcs_charmap(vmap
);
141 #ifdef HAVE_PTHREAD_H
142 pthread_mutex_unlock(&map_mutex
);
143 #endif /* HAVE_PTHREAD_H */
148 * '_cupsCharmapFree()' - Free a character set map.
150 * This does not actually free; use '_cupsCharmapFlush()' for that.
155 const cups_encoding_t encoding
) /* I - Encoding */
157 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
158 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
162 * See if we already have this SBCS charset map loaded...
165 #ifdef HAVE_PTHREAD_H
166 pthread_mutex_lock(&map_mutex
);
167 #endif /* HAVE_PTHREAD_H */
169 for (cmap
= cmap_cache
; cmap
; cmap
= cmap
->next
)
171 if (cmap
->encoding
== encoding
)
180 * See if we already have this DBCS/VBCS charset map loaded...
183 for (vmap
= vmap_cache
; vmap
; vmap
= vmap
->next
)
185 if (vmap
->encoding
== encoding
)
193 #ifdef HAVE_PTHREAD_H
194 pthread_mutex_unlock(&map_mutex
);
195 #endif /* HAVE_PTHREAD_H */
200 * '_cupsCharmapGet()' - Get a character set map.
202 * This code handles single-byte (SBCS), double-byte (DBCS), and
203 * variable-byte (VBCS) character sets _without_ charset escapes...
204 * This code does not handle multiple-byte character sets (MBCS)
205 * (such as ISO-2022-JP) with charset switching via escapes...
208 void * /* O - Charset map pointer */
210 const cups_encoding_t encoding
) /* I - Encoding */
212 void *charmap
; /* Charset map pointer */
215 DEBUG_printf(("_cupsCharmapGet(encoding=%d)\n", encoding
));
218 * Check for valid arguments...
221 if (encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
223 DEBUG_puts(" Bad encoding, returning NULL!");
228 * Lookup or get the charset map pointer and return...
231 #ifdef HAVE_PTHREAD_H
232 pthread_mutex_lock(&map_mutex
);
233 #endif /* HAVE_PTHREAD_H */
235 charmap
= get_charmap(encoding
);
237 #ifdef HAVE_PTHREAD_H
238 pthread_mutex_unlock(&map_mutex
);
239 #endif /* HAVE_PTHREAD_H */
246 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
248 * This code handles single-byte (SBCS), double-byte (DBCS), and
249 * variable-byte (VBCS) character sets _without_ charset escapes...
250 * This code does not handle multiple-byte character sets (MBCS)
251 * (such as ISO-2022-JP) with charset switching via escapes...
254 int /* O - Count or -1 on error */
256 cups_utf8_t
*dest
, /* O - Target string */
257 const char *src
, /* I - Source string */
258 const int maxout
, /* I - Max output */
259 const cups_encoding_t encoding
) /* I - Encoding */
261 int bytes
; /* Number of bytes converted */
265 * Check for valid arguments...
268 DEBUG_printf(("cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)\n",
269 dest
, src
, maxout
, encoding
));
274 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
276 DEBUG_puts(" Bad arguments, returning -1");
281 * Handle identity conversions...
284 if (encoding
== CUPS_UTF8
||
285 encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
287 strlcpy((char *)dest
, src
, maxout
);
288 return ((int)strlen((char *)dest
));
292 * Handle ISO-8859-1 to UTF-8 directly...
295 if (encoding
== CUPS_ISO8859_1
)
297 int ch
; /* Character from string */
298 cups_utf8_t
*destptr
, /* Pointer into UTF-8 buffer */
299 *destend
; /* End of UTF-8 buffer */
303 destend
= dest
+ maxout
- 2;
305 while (*src
&& destptr
< destend
)
311 *destptr
++ = 0xc0 | (ch
>> 6);
312 *destptr
++ = 0x80 | (ch
& 0x3f);
320 return ((int)(destptr
- dest
));
324 * Convert input legacy charset to UTF-8...
327 #ifdef HAVE_PTHREAD_H
328 pthread_mutex_lock(&map_mutex
);
329 #endif /* HAVE_PTHREAD_H */
331 if (encoding
< CUPS_ENCODING_SBCS_END
)
332 bytes
= conv_sbcs_to_utf8(dest
, (cups_sbcs_t
*)src
, maxout
, encoding
);
333 else if (encoding
< CUPS_ENCODING_VBCS_END
)
334 bytes
= conv_vbcs_to_utf8(dest
, (cups_sbcs_t
*)src
, maxout
, encoding
);
337 DEBUG_puts(" Bad encoding, returning -1");
341 #ifdef HAVE_PTHREAD_H
342 pthread_mutex_unlock(&map_mutex
);
343 #endif /* HAVE_PTHREAD_H */
350 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
352 * This code handles single-byte (SBCS), double-byte (DBCS), and
353 * variable-byte (VBCS) character sets _without_ charset escapes...
354 * This code does not handle multiple-byte character sets (MBCS)
355 * (such as ISO-2022-JP) with charset switching via escapes...
358 int /* O - Count or -1 on error */
360 char *dest
, /* O - Target string */
361 const cups_utf8_t
*src
, /* I - Source string */
362 const int maxout
, /* I - Max output */
363 const cups_encoding_t encoding
) /* I - Encoding */
365 int bytes
; /* Number of bytes converted */
369 * Check for valid arguments...
372 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
381 * Handle identity conversions...
384 if (encoding
== CUPS_UTF8
||
385 encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
387 strlcpy(dest
, (char *)src
, maxout
);
388 return ((int)strlen(dest
));
392 * Handle UTF-8 to ISO-8859-1 directly...
395 if (encoding
== CUPS_ISO8859_1
)
397 int ch
; /* Character from string */
398 char *destptr
, /* Pointer into ISO-8859-1 buffer */
399 *destend
; /* End of ISO-8859-1 buffer */
403 destend
= dest
+ maxout
- 1;
405 while (*src
&& destptr
< destend
)
409 if ((ch
& 0xe0) == 0xc0)
411 ch
= ((ch
& 0x1f) << 6) | (*src
++ & 0x3f);
418 else if ((ch
& 0xf0) == 0xe0 ||
421 else if (!(ch
& 0x80))
427 return ((int)(destptr
- dest
));
431 * Convert input UTF-8 to legacy charset...
434 #ifdef HAVE_PTHREAD_H
435 pthread_mutex_lock(&map_mutex
);
436 #endif /* HAVE_PTHREAD_H */
438 if (encoding
< CUPS_ENCODING_SBCS_END
)
439 bytes
= conv_utf8_to_sbcs((cups_sbcs_t
*)dest
, src
, maxout
, encoding
);
440 else if (encoding
< CUPS_ENCODING_VBCS_END
)
441 bytes
= conv_utf8_to_vbcs((cups_sbcs_t
*)dest
, src
, maxout
, encoding
);
445 #ifdef HAVE_PTHREAD_H
446 pthread_mutex_unlock(&map_mutex
);
447 #endif /* HAVE_PTHREAD_H */
454 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
456 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
458 * UTF-32 char UTF-8 char(s)
459 * --------------------------------------------------
460 * 0 to 127 = 0xxxxxxx (US-ASCII)
461 * 128 to 2047 = 110xxxxx 10yyyyyy
462 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
463 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10wwwwww
465 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
466 * which would convert to five- or six-octet UTF-8 sequences...
469 int /* O - Count or -1 on error */
471 cups_utf32_t
*dest
, /* O - Target string */
472 const cups_utf8_t
*src
, /* I - Source string */
473 const int maxout
) /* I - Max output */
475 int i
; /* Looping variable */
476 cups_utf8_t ch
; /* Character value */
477 cups_utf8_t next
; /* Next character value */
478 cups_utf32_t ch32
; /* UTF-32 character value */
482 * Check for valid arguments and clear output...
488 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
492 * Convert input UTF-8 to output UTF-32 (and insert BOM)...
497 for (i
= maxout
- 1; *src
&& i
> 0; i
--)
502 * Convert UTF-8 character(s) to UTF-32 character...
508 * One-octet UTF-8 <= 127 (US-ASCII)...
514 else if ((ch
& 0xe0) == 0xc0)
517 * Two-octet UTF-8 <= 2047 (Latin-x)...
524 ch32
= ((ch
& 0x1f) << 6) | (next
& 0x3f);
527 * Check for non-shortest form (invalid UTF-8)...
535 else if ((ch
& 0xf0) == 0xe0)
538 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
545 ch32
= ((ch
& 0x0f) << 6) | (next
& 0x3f);
551 ch32
= (ch32
<< 6) | (next
& 0x3f);
554 * Check for non-shortest form (invalid UTF-8)...
562 else if ((ch
& 0xf8) == 0xf0)
565 * Four-octet UTF-8...
572 ch32
= ((ch
& 0x07) << 6) | (next
& 0x3f);
578 ch32
= (ch32
<< 6) | (next
& 0x3f);
584 ch32
= (ch32
<< 6) | (next
& 0x3f);
587 * Check for non-shortest form (invalid UTF-8)...
598 * More than 4-octet (invalid UTF-8 sequence)...
605 * Check for UTF-16 surrogate (illegal UTF-8)...
608 if (ch32
>= 0xd800 && ch32
<= 0xdfff)
619 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
621 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
623 * UTF-32 char UTF-8 char(s)
624 * --------------------------------------------------
625 * 0 to 127 = 0xxxxxxx (US-ASCII)
626 * 128 to 2047 = 110xxxxx 10yyyyyy
627 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
628 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10wwwwww
630 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
631 * which would convert to five- or six-octet UTF-8 sequences...
634 int /* O - Count or -1 on error */
636 cups_utf8_t
*dest
, /* O - Target string */
637 const cups_utf32_t
*src
, /* I - Source string */
638 const int maxout
) /* I - Max output */
640 cups_utf8_t
*start
; /* Start of destination string */
641 int i
; /* Looping variable */
642 int swap
; /* Byte-swap input to output */
643 cups_utf32_t ch
; /* Character value */
647 * Check for valid arguments and clear output...
653 if (!dest
|| !src
|| maxout
< 1)
657 * Check for leading BOM in UTF-32 and inverted BOM...
661 swap
= *src
== 0xfffe0000;
663 if (*src
== 0xfffe0000 || *src
== 0xfeff)
667 * Convert input UTF-32 to output UTF-8...
670 for (i
= maxout
- 1; *src
&& i
> 0;)
675 * Byte swap input UTF-32, if necessary...
676 * (only byte-swapping 24 of 32 bits)
680 ch
= ((ch
>> 24) | ((ch
>> 8) & 0xff00) | ((ch
<< 8) & 0xff0000));
683 * Check for beyond Plane 16 (invalid UTF-32)...
690 * Convert UTF-32 character to UTF-8 character(s)...
696 * One-octet UTF-8 <= 127 (US-ASCII)...
699 *dest
++ = (cups_utf8_t
)ch
;
705 * Two-octet UTF-8 <= 2047 (Latin-x)...
711 *dest
++ = (cups_utf8_t
)(0xc0 | ((ch
>> 6) & 0x1f));
712 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
715 else if (ch
< 0x10000)
718 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
724 *dest
++ = (cups_utf8_t
)(0xe0 | ((ch
>> 12) & 0x0f));
725 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
726 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
732 * Four-octet UTF-8...
738 *dest
++ = (cups_utf8_t
)(0xf0 | ((ch
>> 18) & 0x07));
739 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 12) & 0x3f));
740 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
741 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
748 return ((int)(dest
- start
));
753 * 'compare_wide()' - Compare key for wide (VBCS) match.
757 compare_wide(const void *k1
, /* I - Key char */
758 const void *k2
) /* I - Map char */
760 cups_vbcs_t key
; /* Legacy key character */
761 cups_vbcs_t map
; /* Legacy map character */
764 key
= *((cups_vbcs_t
*)k1
);
765 map
= ((_cups_wide2uni_t
*)k2
)->widechar
;
767 return ((int)(key
- map
));
772 * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
775 static int /* O - Count or -1 on error */
777 cups_utf8_t
*dest
, /* O - Target string */
778 const cups_sbcs_t
*src
, /* I - Source string */
779 int maxout
, /* I - Max output */
780 const cups_encoding_t encoding
) /* I - Encoding */
782 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
783 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
784 cups_sbcs_t legchar
; /* Legacy character value */
785 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
786 *workptr
; /* Pointer into string */
790 * Find legacy charset map in cache...
793 if ((cmap
= (_cups_cmap_t
*)get_charmap(encoding
)) == NULL
)
797 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
801 for (workptr
= work
+ 1; *src
&& workptr
< (work
+ CUPS_MAX_USTRING
- 1);)
806 * Convert ASCII verbatim (optimization)...
810 *workptr
++ = (cups_utf32_t
)legchar
;
814 * Convert unknown character to Replacement Character...
817 crow
= cmap
->char2uni
+ legchar
;
822 *workptr
++ = (cups_utf32_t
)*crow
;
829 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
834 return (cupsUTF32ToUTF8(dest
, work
, maxout
));
839 * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
842 static int /* O - Count or -1 on error */
844 cups_sbcs_t
*dest
, /* O - Target string */
845 const cups_utf8_t
*src
, /* I - Source string */
846 int maxout
, /* I - Max output */
847 const cups_encoding_t encoding
) /* I - Encoding */
849 cups_sbcs_t
*start
; /* Start of destination string */
850 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
851 cups_sbcs_t
*srow
; /* Pointer to SBCS row in 'uni2char' */
852 cups_utf32_t unichar
; /* Character value */
853 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
854 *workptr
; /* Pointer into string */
858 * Find legacy charset map in cache...
861 if ((cmap
= (_cups_cmap_t
*)get_charmap(encoding
)) == NULL
)
865 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
868 if (cupsUTF8ToUTF32(work
, src
, CUPS_MAX_USTRING
) < 0)
872 * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
875 for (workptr
= work
+ 1, start
= dest
; *workptr
&& maxout
> 1; maxout
--)
877 unichar
= *workptr
++;
882 * Convert ASCII verbatim (optimization)...
887 *dest
++ = (cups_sbcs_t
)unichar
;
892 * Convert unknown character to visible replacement...
895 srow
= cmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
898 srow
+= (int)(unichar
& 0xff);
910 return ((int)(dest
- start
));
915 * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
918 static int /* O - Count or -1 on error */
920 cups_sbcs_t
*dest
, /* O - Target string */
921 const cups_utf8_t
*src
, /* I - Source string */
922 int maxout
, /* I - Max output */
923 const cups_encoding_t encoding
) /* I - Encoding */
925 cups_sbcs_t
*start
; /* Start of destination string */
926 _cups_vmap_t
*vmap
; /* Legacy DBCS / Unicode Charset Map */
927 cups_vbcs_t
*vrow
; /* Pointer to VBCS row in 'uni2char' */
928 cups_utf32_t unichar
; /* Character value */
929 cups_vbcs_t legchar
; /* Legacy character value */
930 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
931 *workptr
; /* Pointer into string */
935 * Find legacy charset map in cache...
938 if ((vmap
= (_cups_vmap_t
*)get_charmap(encoding
)) == NULL
)
942 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
945 if (cupsUTF8ToUTF32(work
, src
, CUPS_MAX_USTRING
) < 0)
949 * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
952 for (start
= dest
, workptr
= work
+ 1; *workptr
&& maxout
> 1; maxout
--)
954 unichar
= *workptr
++;
959 * Convert ASCII verbatim (optimization)...
964 *dest
++ = (cups_sbcs_t
)unichar
;
969 * Convert unknown character to visible replacement...
972 vrow
= vmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
975 vrow
+= (int)(unichar
& 0xff);
978 legchar
= (cups_vbcs_t
)'?';
980 legchar
= (cups_vbcs_t
)*vrow
;
983 * Save n-byte legacy character...
986 if (legchar
> 0xffffff)
991 *dest
++ = (cups_sbcs_t
)(legchar
>> 24);
992 *dest
++ = (cups_sbcs_t
)(legchar
>> 16);
993 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
994 *dest
++ = (cups_sbcs_t
)legchar
;
998 else if (legchar
> 0xffff)
1003 *dest
++ = (cups_sbcs_t
)(legchar
>> 16);
1004 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
1005 *dest
++ = (cups_sbcs_t
)legchar
;
1009 else if (legchar
> 0xff)
1011 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
1012 *dest
++ = (cups_sbcs_t
)legchar
;
1022 return ((int)(dest
- start
));
1027 * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
1030 static int /* O - Count or -1 on error */
1032 cups_utf8_t
*dest
, /* O - Target string */
1033 const cups_sbcs_t
*src
, /* I - Source string */
1034 int maxout
, /* I - Max output */
1035 const cups_encoding_t encoding
) /* I - Encoding */
1037 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
1038 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1039 _cups_wide2uni_t
*wide2uni
; /* Pointer to row in 'wide2uni' */
1040 cups_sbcs_t leadchar
; /* Lead char of n-byte legacy char */
1041 cups_vbcs_t legchar
; /* Legacy character value */
1042 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
1043 *workptr
; /* Pointer into string */
1047 * Find legacy charset map in cache...
1050 if ((vmap
= (_cups_vmap_t
*)get_charmap(encoding
)) == NULL
)
1054 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
1058 for (workptr
= work
+ 1; *src
&& workptr
< (work
+ CUPS_MAX_USTRING
- 1);)
1061 leadchar
= (cups_sbcs_t
)legchar
;
1064 * Convert ASCII verbatim (optimization)...
1069 *workptr
++ = (cups_utf32_t
)legchar
;
1074 * Convert 2-byte legacy character...
1077 if (vmap
->lead2char
[(int)leadchar
] == leadchar
)
1082 legchar
= (legchar
<< 8) | *src
++;
1085 * Convert unknown character to Replacement Character...
1088 crow
= vmap
->char2uni
[(int)((legchar
>> 8) & 0xff)];
1090 crow
+= (int) (legchar
& 0xff);
1092 if (!crow
|| !*crow
)
1093 *workptr
++ = 0xfffd;
1095 *workptr
++ = (cups_utf32_t
)*crow
;
1100 * Fetch 3-byte or 4-byte legacy character...
1103 if (vmap
->lead3char
[(int)leadchar
] == leadchar
)
1105 if (!*src
|| !src
[1])
1108 legchar
= (legchar
<< 8) | *src
++;
1109 legchar
= (legchar
<< 8) | *src
++;
1111 else if (vmap
->lead4char
[(int)leadchar
] == leadchar
)
1113 if (!*src
|| !src
[1] || !src
[2])
1116 legchar
= (legchar
<< 8) | *src
++;
1117 legchar
= (legchar
<< 8) | *src
++;
1118 legchar
= (legchar
<< 8) | *src
++;
1124 * Find 3-byte or 4-byte legacy character...
1127 wide2uni
= (_cups_wide2uni_t
*)bsearch(&legchar
,
1130 sizeof(_cups_wide2uni_t
),
1134 * Convert unknown character to Replacement Character...
1137 if (!wide2uni
|| !wide2uni
->unichar
)
1138 *workptr
++ = 0xfffd;
1140 *workptr
++ = wide2uni
->unichar
;
1148 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
1151 return (cupsUTF32ToUTF8(dest
, work
, maxout
));
1156 * 'free_sbcs_charmap()' - Free memory used by a single byte character set.
1160 free_sbcs_charmap(_cups_cmap_t
*cmap
) /* I - Character set */
1162 int i
; /* Looping variable */
1165 for (i
= 0; i
< 256; i
++)
1166 if (cmap
->uni2char
[i
])
1167 free(cmap
->uni2char
[i
]);
1174 * 'free_vbcs_charmap()' - Free memory used by a variable byte character set.
1178 free_vbcs_charmap(_cups_vmap_t
*vmap
) /* I - Character set */
1180 int i
; /* Looping variable */
1183 for (i
= 0; i
< 256; i
++)
1184 if (vmap
->char2uni
[i
])
1185 free(vmap
->char2uni
[i
]);
1187 for (i
= 0; i
< 256; i
++)
1188 if (vmap
->uni2char
[i
])
1189 free(vmap
->uni2char
[i
]);
1192 free(vmap
->wide2uni
);
1199 * 'get_charmap()' - Lookup or get a character set map (private).
1201 * This code handles single-byte (SBCS), double-byte (DBCS), and
1202 * variable-byte (VBCS) character sets _without_ charset escapes...
1203 * This code does not handle multiple-byte character sets (MBCS)
1204 * (such as ISO-2022-JP) with charset switching via escapes...
1208 static void * /* O - Charset map pointer */
1210 const cups_encoding_t encoding
) /* I - Encoding */
1212 char filename
[1024]; /* Filename for charset map file */
1213 _cups_globals_t
*cg
= _cupsGlobals(); /* Global data */
1217 * Get the data directory and charset map name...
1220 snprintf(filename
, sizeof(filename
), "%s/charmaps/%s.txt",
1221 cg
->cups_datadir
, _cupsEncodingName(encoding
));
1223 DEBUG_printf((" filename=\"%s\"\n", filename
));
1226 * Read charset map input file into cache...
1229 if (encoding
< CUPS_ENCODING_SBCS_END
)
1230 return (get_sbcs_charmap(encoding
, filename
));
1231 else if (encoding
< CUPS_ENCODING_VBCS_END
)
1232 return (get_vbcs_charmap(encoding
, filename
));
1239 * 'get_charmap_count()' - Count lines in a charmap file.
1242 static int /* O - Count or -1 on error */
1243 get_charmap_count(cups_file_t
*fp
) /* I - File to read from */
1245 int count
; /* Number of lines */
1246 char line
[256]; /* Line from input map file */
1250 * Count lines in map input file...
1255 while (cupsFileGets(fp
, line
, sizeof(line
)))
1260 * Return the number of lines...
1271 * 'get_sbcs_charmap()' - Get SBCS Charmap.
1274 static _cups_cmap_t
* /* O - Charmap or 0 on error */
1276 const cups_encoding_t encoding
, /* I - Charmap Encoding */
1277 const char *filename
) /* I - Charmap Filename */
1279 unsigned long legchar
; /* Legacy character value */
1280 cups_utf32_t unichar
; /* Unicode character value */
1281 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
1282 cups_file_t
*fp
; /* Charset map file pointer */
1283 char *s
; /* Line parsing pointer */
1284 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1285 cups_sbcs_t
*srow
; /* Pointer to SBCS row in 'uni2char' */
1286 char line
[256]; /* Line from charset map file */
1290 * See if we already have this SBCS charset map loaded...
1293 for (cmap
= cmap_cache
; cmap
; cmap
= cmap
->next
)
1295 if (cmap
->encoding
== encoding
)
1298 DEBUG_printf((" returning existing cmap=%p\n", cmap
));
1300 return ((void *)cmap
);
1305 * Open SBCS charset map input file...
1308 if ((fp
= cupsFileOpen(filename
, "r")) == NULL
)
1312 * Allocate memory for SBCS charset map...
1315 if ((cmap
= (_cups_cmap_t
*)calloc(1, sizeof(_cups_cmap_t
))) == NULL
)
1318 DEBUG_puts(" Unable to allocate memory!");
1324 cmap
->encoding
= encoding
;
1327 * Save SBCS charset map into memory for transcoding...
1330 while (cupsFileGets(fp
, line
, sizeof(line
)))
1335 legchar
= strtol(line
, &s
, 16);
1336 if (legchar
< 0 || legchar
> 0xff)
1339 unichar
= strtol(s
, NULL
, 16);
1340 if (unichar
< 0 || unichar
> 0xffff)
1344 * Save legacy to Unicode mapping in direct lookup table...
1347 crow
= cmap
->char2uni
+ legchar
;
1348 *crow
= (cups_ucs2_t
)(unichar
& 0xffff);
1351 * Save Unicode to legacy mapping in indirect lookup table...
1354 srow
= cmap
->uni2char
[(unichar
>> 8) & 0xff];
1357 srow
= (cups_sbcs_t
*)calloc(256, sizeof(cups_sbcs_t
));
1361 cmap
->uni2char
[(unichar
>> 8) & 0xff] = srow
;
1364 srow
+= unichar
& 0xff;
1367 * Convert Replacement Character to visible replacement...
1370 if (unichar
== 0xfffd)
1371 legchar
= (unsigned long)'?';
1374 * First (oldest) legacy character uses Unicode mapping cell...
1378 *srow
= (cups_sbcs_t
)legchar
;
1384 * Add it to the cache and return...
1387 cmap
->next
= cmap_cache
;
1390 DEBUG_printf((" returning new cmap=%p\n", cmap
));
1395 * If we get here, there was an error in the cmap file...
1400 free_sbcs_charmap(cmap
);
1404 DEBUG_puts(" Error, returning NULL!");
1411 * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
1414 static _cups_vmap_t
* /* O - Charmap or 0 on error */
1416 const cups_encoding_t encoding
, /* I - Charmap Encoding */
1417 const char *filename
) /* I - Charmap Filename */
1419 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
1420 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1421 cups_vbcs_t
*vrow
; /* Pointer to VBCS row in 'uni2char' */
1422 _cups_wide2uni_t
*wide2uni
; /* Pointer to row in 'wide2uni' */
1423 cups_sbcs_t leadchar
; /* Lead char of 2-byte legacy char */
1424 unsigned long legchar
; /* Legacy character value */
1425 cups_utf32_t unichar
; /* Unicode character value */
1426 int mapcount
; /* Count of lines in charmap file */
1427 cups_file_t
*fp
; /* Charset map file pointer */
1428 char *s
; /* Line parsing pointer */
1429 char line
[256]; /* Line from charset map file */
1430 int i
; /* Loop variable */
1431 int legacy
; /* 32-bit legacy char */
1434 DEBUG_printf(("get_vbcs_charmap(encoding=%d, filename=\"%s\")\n",
1435 encoding
, filename
));
1438 * See if we already have this DBCS/VBCS charset map loaded...
1441 for (vmap
= vmap_cache
; vmap
; vmap
= vmap
->next
)
1443 if (vmap
->encoding
== encoding
)
1446 DEBUG_printf((" returning existing vmap=%p\n", vmap
));
1448 return ((void *)vmap
);
1453 * Open VBCS charset map input file...
1456 if ((fp
= cupsFileOpen(filename
, "r")) == NULL
)
1458 DEBUG_printf((" Unable to open file: %s\n", strerror(errno
)));
1464 * Count lines in charmap file...
1467 if ((mapcount
= get_charmap_count(fp
)) <= 0)
1469 DEBUG_puts(" Unable to get charmap count!");
1474 DEBUG_printf((" mapcount=%d\n", mapcount
));
1477 * Allocate memory for DBCS/VBCS charset map...
1480 if ((vmap
= (_cups_vmap_t
*)calloc(1, sizeof(_cups_vmap_t
))) == NULL
)
1483 DEBUG_puts(" Unable to allocate memory!");
1489 vmap
->encoding
= encoding
;
1492 * Save DBCS/VBCS charset map into memory for transcoding...
1503 while (cupsFileGets(fp
, line
, sizeof(line
)))
1508 legchar
= strtoul(line
, &s
, 16);
1509 if (legchar
== ULONG_MAX
)
1512 unichar
= strtol(s
, NULL
, 16);
1513 if (unichar
< 0 || unichar
> 0xffff)
1518 /* DEBUG_printf((" i=%d, legchar=0x%08lx, unichar=0x%04x\n", i,
1519 legchar, (unsigned)unichar)); */
1522 * Save lead char of 2/3/4-byte legacy char...
1525 if (legchar
> 0xff && legchar
<= 0xffff)
1527 leadchar
= (cups_sbcs_t
)(legchar
>> 8);
1528 vmap
->lead2char
[leadchar
] = leadchar
;
1531 if (legchar
> 0xffff && legchar
<= 0xffffff)
1533 leadchar
= (cups_sbcs_t
)(legchar
>> 16);
1534 vmap
->lead3char
[leadchar
] = leadchar
;
1537 if (legchar
> 0xffffff)
1539 leadchar
= (cups_sbcs_t
)(legchar
>> 24);
1540 vmap
->lead4char
[leadchar
] = leadchar
;
1544 * Save Legacy to Unicode mapping...
1547 if (legchar
<= 0xffff)
1550 * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
1553 crow
= vmap
->char2uni
[(int)leadchar
];
1556 crow
= (cups_ucs2_t
*)calloc(256, sizeof(cups_ucs2_t
));
1560 vmap
->char2uni
[(int)leadchar
] = crow
;
1563 crow
[(int)(legchar
& 0xff)] = (cups_ucs2_t
)unichar
;
1568 * Save VBCS 32-bit to Unicode mapping in sorted list table...
1574 vmap
->widecount
= (mapcount
- i
+ 1);
1575 wide2uni
= (_cups_wide2uni_t
*)calloc(vmap
->widecount
,
1576 sizeof(_cups_wide2uni_t
));
1580 vmap
->wide2uni
= wide2uni
;
1583 wide2uni
->widechar
= (cups_vbcs_t
)legchar
;
1584 wide2uni
->unichar
= (cups_ucs2_t
)unichar
;
1589 * Save Unicode to legacy mapping in indirect lookup table...
1592 vrow
= vmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
1595 vrow
= (cups_vbcs_t
*)calloc(256, sizeof(cups_vbcs_t
));
1599 vmap
->uni2char
[(int) ((unichar
>> 8) & 0xff)] = vrow
;
1602 vrow
+= (int)(unichar
& 0xff);
1605 * Convert Replacement Character to visible replacement...
1608 if (unichar
== 0xfffd)
1609 legchar
= (unsigned long)'?';
1612 * First (oldest) legacy character uses Unicode mapping cell...
1616 *vrow
= (cups_vbcs_t
)legchar
;
1619 vmap
->charcount
= (i
- vmap
->widecount
);
1624 * Add it to the cache and return...
1627 vmap
->next
= vmap_cache
;
1630 DEBUG_printf((" returning new vmap=%p\n", vmap
));
1635 * If we get here, the file contains errors...
1640 free_vbcs_charmap(vmap
);
1644 DEBUG_puts(" Error, returning NULL!");
1651 * End of "$Id: transcode.c 6649 2007-07-11 21:46:42Z mike $"