2 * "$Id: transcode.c 6188 2007-01-10 16:23:06Z mike $"
4 * Transcoding support for the Common UNIX Printing System (CUPS).
6 * Copyright 1997-2007 by Easy Software Products.
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
26 * _cupsCharmapFlush() - Flush all character set maps out of cache.
27 * _cupsCharmapFree() - Free a character set map.
28 * _cupsCharmapGet() - Get a character set map.
29 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
30 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
31 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
32 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
33 * compare_wide() - Compare key for wide (VBCS) match.
34 * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
35 * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
36 * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
37 * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
38 * free_sbcs_charmap() - Free memory used by a single byte character set.
39 * free_vbcs_charmap() - Free memory used by a variable byte character set.
40 * get_charmap() - Lookup or get a character set map (private).
41 * get_charmap_count() - Count lines in a charmap file.
42 * get_sbcs_charmap() - Get SBCS Charmap.
43 * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
47 * Include necessary headers...
63 static pthread_mutex_t map_mutex
= PTHREAD_MUTEX_INITIALIZER
;
64 /* Mutex to control access to maps */
65 #endif /* HAVE_PTHREAD_H */
66 static _cups_cmap_t
*cmap_cache
= NULL
;
67 /* SBCS Charmap Cache */
68 static _cups_vmap_t
*vmap_cache
= NULL
;
69 /* VBCS Charmap Cache */
76 static int compare_wide(const void *k1
, const void *k2
);
77 static int conv_sbcs_to_utf8(cups_utf8_t
*dest
,
78 const cups_sbcs_t
*src
,
80 const cups_encoding_t encoding
);
81 static int conv_utf8_to_sbcs(cups_sbcs_t
*dest
,
82 const cups_utf8_t
*src
,
84 const cups_encoding_t encoding
);
85 static int conv_utf8_to_vbcs(cups_sbcs_t
*dest
,
86 const cups_utf8_t
*src
,
88 const cups_encoding_t encoding
);
89 static int conv_vbcs_to_utf8(cups_utf8_t
*dest
,
90 const cups_sbcs_t
*src
,
92 const cups_encoding_t encoding
);
93 static void free_sbcs_charmap(_cups_cmap_t
*sbcs
);
94 static void free_vbcs_charmap(_cups_vmap_t
*vbcs
);
95 static void *get_charmap(const cups_encoding_t encoding
);
96 static int get_charmap_count(cups_file_t
*fp
);
97 static _cups_cmap_t
*get_sbcs_charmap(const cups_encoding_t encoding
,
98 const char *filename
);
99 static _cups_vmap_t
*get_vbcs_charmap(const cups_encoding_t encoding
,
100 const char *filename
);
104 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
108 _cupsCharmapFlush(void)
110 _cups_cmap_t
*cmap
, /* Legacy SBCS / Unicode Charset Map */
111 *cnext
; /* Next Legacy SBCS Charset Map */
112 _cups_vmap_t
*vmap
, /* Legacy VBCS / Unicode Charset Map */
113 *vnext
; /* Next Legacy VBCS Charset Map */
116 #ifdef HAVE_PTHREAD_H
117 pthread_mutex_lock(&map_mutex
);
118 #endif /* HAVE_PTHREAD_H */
121 * Loop through SBCS charset map cache, free all memory...
124 for (cmap
= cmap_cache
; cmap
; cmap
= cnext
)
128 free_sbcs_charmap(cmap
);
134 * Loop through DBCS/VBCS charset map cache, free all memory...
137 for (vmap
= vmap_cache
; vmap
; vmap
= vnext
)
141 free_vbcs_charmap(vmap
);
148 #ifdef HAVE_PTHREAD_H
149 pthread_mutex_unlock(&map_mutex
);
150 #endif /* HAVE_PTHREAD_H */
155 * '_cupsCharmapFree()' - Free a character set map.
157 * This does not actually free; use '_cupsCharmapFlush()' for that.
162 const cups_encoding_t encoding
) /* I - Encoding */
164 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
165 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
169 * See if we already have this SBCS charset map loaded...
172 #ifdef HAVE_PTHREAD_H
173 pthread_mutex_lock(&map_mutex
);
174 #endif /* HAVE_PTHREAD_H */
176 for (cmap
= cmap_cache
; cmap
; cmap
= cmap
->next
)
178 if (cmap
->encoding
== encoding
)
187 * See if we already have this DBCS/VBCS charset map loaded...
190 for (vmap
= vmap_cache
; vmap
; vmap
= vmap
->next
)
192 if (vmap
->encoding
== encoding
)
200 #ifdef HAVE_PTHREAD_H
201 pthread_mutex_unlock(&map_mutex
);
202 #endif /* HAVE_PTHREAD_H */
207 * '_cupsCharmapGet()' - Get a character set map.
209 * This code handles single-byte (SBCS), double-byte (DBCS), and
210 * variable-byte (VBCS) character sets _without_ charset escapes...
211 * This code does not handle multiple-byte character sets (MBCS)
212 * (such as ISO-2022-JP) with charset switching via escapes...
215 void * /* O - Charset map pointer */
217 const cups_encoding_t encoding
) /* I - Encoding */
219 void *charmap
; /* Charset map pointer */
222 DEBUG_printf(("_cupsCharmapGet(encoding=%d)\n", encoding
));
225 * Check for valid arguments...
228 if (encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
230 DEBUG_puts(" Bad encoding, returning NULL!");
235 * Lookup or get the charset map pointer and return...
238 #ifdef HAVE_PTHREAD_H
239 pthread_mutex_lock(&map_mutex
);
240 #endif /* HAVE_PTHREAD_H */
242 charmap
= get_charmap(encoding
);
244 #ifdef HAVE_PTHREAD_H
245 pthread_mutex_unlock(&map_mutex
);
246 #endif /* HAVE_PTHREAD_H */
253 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
255 * This code handles single-byte (SBCS), double-byte (DBCS), and
256 * variable-byte (VBCS) character sets _without_ charset escapes...
257 * This code does not handle multiple-byte character sets (MBCS)
258 * (such as ISO-2022-JP) with charset switching via escapes...
261 int /* O - Count or -1 on error */
263 cups_utf8_t
*dest
, /* O - Target string */
264 const char *src
, /* I - Source string */
265 const int maxout
, /* I - Max output */
266 const cups_encoding_t encoding
) /* I - Encoding */
268 int bytes
; /* Number of bytes converted */
272 * Check for valid arguments...
275 DEBUG_printf(("cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)\n",
276 dest
, src
, maxout
, encoding
));
281 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
283 DEBUG_puts(" Bad arguments, returning -1");
288 * Handle identity conversions...
291 if (encoding
== CUPS_UTF8
||
292 encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
294 strlcpy((char *)dest
, src
, maxout
);
295 return ((int)strlen((char *)dest
));
299 * Handle ISO-8859-1 to UTF-8 directly...
302 if (encoding
== CUPS_ISO8859_1
)
304 int ch
; /* Character from string */
305 cups_utf8_t
*destptr
, /* Pointer into UTF-8 buffer */
306 *destend
; /* End of UTF-8 buffer */
310 destend
= dest
+ maxout
- 2;
312 while (*src
&& destptr
< destend
)
318 *destptr
++ = 0xc0 | (ch
>> 6);
319 *destptr
++ = 0x80 | (ch
& 0x3f);
327 return ((int)(destptr
- dest
));
331 * Convert input legacy charset to UTF-8...
334 #ifdef HAVE_PTHREAD_H
335 pthread_mutex_lock(&map_mutex
);
336 #endif /* HAVE_PTHREAD_H */
338 if (encoding
< CUPS_ENCODING_SBCS_END
)
339 bytes
= conv_sbcs_to_utf8(dest
, (cups_sbcs_t
*)src
, maxout
, encoding
);
340 else if (encoding
< CUPS_ENCODING_VBCS_END
)
341 bytes
= conv_vbcs_to_utf8(dest
, (cups_sbcs_t
*)src
, maxout
, encoding
);
344 DEBUG_puts(" Bad encoding, returning -1");
348 #ifdef HAVE_PTHREAD_H
349 pthread_mutex_unlock(&map_mutex
);
350 #endif /* HAVE_PTHREAD_H */
357 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
359 * This code handles single-byte (SBCS), double-byte (DBCS), and
360 * variable-byte (VBCS) character sets _without_ charset escapes...
361 * This code does not handle multiple-byte character sets (MBCS)
362 * (such as ISO-2022-JP) with charset switching via escapes...
365 int /* O - Count or -1 on error */
367 char *dest
, /* O - Target string */
368 const cups_utf8_t
*src
, /* I - Source string */
369 const int maxout
, /* I - Max output */
370 const cups_encoding_t encoding
) /* I - Encoding */
372 int bytes
; /* Number of bytes converted */
376 * Check for valid arguments...
379 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
388 * Handle identity conversions...
391 if (encoding
== CUPS_UTF8
||
392 encoding
< 0 || encoding
>= CUPS_ENCODING_VBCS_END
)
394 strlcpy(dest
, (char *)src
, maxout
);
395 return ((int)strlen(dest
));
399 * Handle UTF-8 to ISO-8859-1 directly...
402 if (encoding
== CUPS_ISO8859_1
)
404 int ch
; /* Character from string */
405 char *destptr
, /* Pointer into ISO-8859-1 buffer */
406 *destend
; /* End of ISO-8859-1 buffer */
410 destend
= dest
+ maxout
- 1;
412 while (*src
&& destptr
< destend
)
416 if ((ch
& 0xe0) == 0xc0)
418 ch
= ((ch
& 0x1f) << 6) | (*src
++ & 0x3f);
425 else if ((ch
& 0xf0) == 0xe0 ||
428 else if (!(ch
& 0x80))
434 return ((int)(destptr
- dest
));
438 * Convert input UTF-8 to legacy charset...
441 #ifdef HAVE_PTHREAD_H
442 pthread_mutex_lock(&map_mutex
);
443 #endif /* HAVE_PTHREAD_H */
445 if (encoding
< CUPS_ENCODING_SBCS_END
)
446 bytes
= conv_utf8_to_sbcs((cups_sbcs_t
*)dest
, src
, maxout
, encoding
);
447 else if (encoding
< CUPS_ENCODING_VBCS_END
)
448 bytes
= conv_utf8_to_vbcs((cups_sbcs_t
*)dest
, src
, maxout
, encoding
);
452 #ifdef HAVE_PTHREAD_H
453 pthread_mutex_unlock(&map_mutex
);
454 #endif /* HAVE_PTHREAD_H */
461 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
463 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
465 * UTF-32 char UTF-8 char(s)
466 * --------------------------------------------------
467 * 0 to 127 = 0xxxxxxx (US-ASCII)
468 * 128 to 2047 = 110xxxxx 10yyyyyy
469 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
470 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
472 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
473 * which would convert to five- or six-octet UTF-8 sequences...
476 int /* O - Count or -1 on error */
478 cups_utf32_t
*dest
, /* O - Target string */
479 const cups_utf8_t
*src
, /* I - Source string */
480 const int maxout
) /* I - Max output */
482 int i
; /* Looping variable */
483 cups_utf8_t ch
; /* Character value */
484 cups_utf8_t next
; /* Next character value */
485 cups_utf32_t ch32
; /* UTF-32 character value */
489 * Check for valid arguments and clear output...
495 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
499 * Convert input UTF-8 to output UTF-32 (and insert BOM)...
504 for (i
= maxout
- 1; *src
&& i
> 0; i
--)
509 * Convert UTF-8 character(s) to UTF-32 character...
515 * One-octet UTF-8 <= 127 (US-ASCII)...
521 else if ((ch
& 0xe0) == 0xc0)
524 * Two-octet UTF-8 <= 2047 (Latin-x)...
531 ch32
= ((ch
& 0x1f) << 6) | (next
& 0x3f);
534 * Check for non-shortest form (invalid UTF-8)...
542 else if ((ch
& 0xf0) == 0xe0)
545 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
552 ch32
= ((ch
& 0x0f) << 6) | (next
& 0x3f);
558 ch32
= (ch32
<< 6) | (next
& 0x3f);
561 * Check for non-shortest form (invalid UTF-8)...
569 else if ((ch
& 0xf8) == 0xf0)
572 * Four-octet UTF-8...
579 ch32
= ((ch
& 0x07) << 6) | (next
& 0x3f);
585 ch32
= (ch32
<< 6) | (next
& 0x3f);
591 ch32
= (ch32
<< 6) | (next
& 0x3f);
594 * Check for non-shortest form (invalid UTF-8)...
605 * More than 4-octet (invalid UTF-8 sequence)...
612 * Check for UTF-16 surrogate (illegal UTF-8)...
615 if (ch32
>= 0xd800 && ch32
<= 0xdfff)
626 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
628 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
630 * UTF-32 char UTF-8 char(s)
631 * --------------------------------------------------
632 * 0 to 127 = 0xxxxxxx (US-ASCII)
633 * 128 to 2047 = 110xxxxx 10yyyyyy
634 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
635 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
637 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
638 * which would convert to five- or six-octet UTF-8 sequences...
641 int /* O - Count or -1 on error */
643 cups_utf8_t
*dest
, /* O - Target string */
644 const cups_utf32_t
*src
, /* I - Source string */
645 const int maxout
) /* I - Max output */
647 cups_utf8_t
*start
; /* Start of destination string */
648 int i
; /* Looping variable */
649 int swap
; /* Byte-swap input to output */
650 cups_utf32_t ch
; /* Character value */
654 * Check for valid arguments and clear output...
660 if (!dest
|| !src
|| maxout
< 1)
664 * Check for leading BOM in UTF-32 and inverted BOM...
668 swap
= *src
== 0xfffe0000;
670 if (*src
== 0xfffe0000 || *src
== 0xfeff)
674 * Convert input UTF-32 to output UTF-8...
677 for (i
= maxout
- 1; *src
&& i
> 0;)
682 * Byte swap input UTF-32, if necessary...
683 * (only byte-swapping 24 of 32 bits)
687 ch
= ((ch
>> 24) | ((ch
>> 8) & 0xff00) | ((ch
<< 8) & 0xff0000));
690 * Check for beyond Plane 16 (invalid UTF-32)...
697 * Convert UTF-32 character to UTF-8 character(s)...
703 * One-octet UTF-8 <= 127 (US-ASCII)...
706 *dest
++ = (cups_utf8_t
)ch
;
712 * Two-octet UTF-8 <= 2047 (Latin-x)...
718 *dest
++ = (cups_utf8_t
)(0xc0 | ((ch
>> 6) & 0x1f));
719 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
722 else if (ch
< 0x10000)
725 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
731 *dest
++ = (cups_utf8_t
)(0xe0 | ((ch
>> 12) & 0x0f));
732 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
733 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
739 * Four-octet UTF-8...
745 *dest
++ = (cups_utf8_t
)(0xf0 | ((ch
>> 18) & 0x07));
746 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 12) & 0x3f));
747 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
748 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
755 return ((int)(dest
- start
));
760 * 'compare_wide()' - Compare key for wide (VBCS) match.
764 compare_wide(const void *k1
, /* I - Key char */
765 const void *k2
) /* I - Map char */
767 cups_vbcs_t key
; /* Legacy key character */
768 cups_vbcs_t map
; /* Legacy map character */
771 key
= *((cups_vbcs_t
*)k1
);
772 map
= ((_cups_wide2uni_t
*)k2
)->widechar
;
774 return ((int)(key
- map
));
779 * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
782 static int /* O - Count or -1 on error */
784 cups_utf8_t
*dest
, /* O - Target string */
785 const cups_sbcs_t
*src
, /* I - Source string */
786 int maxout
, /* I - Max output */
787 const cups_encoding_t encoding
) /* I - Encoding */
789 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
790 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
791 cups_sbcs_t legchar
; /* Legacy character value */
792 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
793 *workptr
; /* Pointer into string */
797 * Find legacy charset map in cache...
800 if ((cmap
= (_cups_cmap_t
*)get_charmap(encoding
)) == NULL
)
804 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
808 for (workptr
= work
+ 1; *src
&& workptr
< (work
+ CUPS_MAX_USTRING
- 1);)
813 * Convert ASCII verbatim (optimization)...
817 *workptr
++ = (cups_utf32_t
)legchar
;
821 * Convert unknown character to Replacement Character...
824 crow
= cmap
->char2uni
+ legchar
;
829 *workptr
++ = (cups_utf32_t
)*crow
;
836 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
841 return (cupsUTF32ToUTF8(dest
, work
, maxout
));
846 * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
849 static int /* O - Count or -1 on error */
851 cups_sbcs_t
*dest
, /* O - Target string */
852 const cups_utf8_t
*src
, /* I - Source string */
853 int maxout
, /* I - Max output */
854 const cups_encoding_t encoding
) /* I - Encoding */
856 cups_sbcs_t
*start
; /* Start of destination string */
857 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
858 cups_sbcs_t
*srow
; /* Pointer to SBCS row in 'uni2char' */
859 cups_utf32_t unichar
; /* Character value */
860 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
861 *workptr
; /* Pointer into string */
865 * Find legacy charset map in cache...
868 if ((cmap
= (_cups_cmap_t
*)get_charmap(encoding
)) == NULL
)
872 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
875 if (cupsUTF8ToUTF32(work
, src
, CUPS_MAX_USTRING
) < 0)
879 * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
882 for (workptr
= work
+ 1, start
= dest
; *workptr
&& maxout
> 1; maxout
--)
884 unichar
= *workptr
++;
889 * Convert ASCII verbatim (optimization)...
894 *dest
++ = (cups_sbcs_t
)unichar
;
899 * Convert unknown character to visible replacement...
902 srow
= cmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
905 srow
+= (int)(unichar
& 0xff);
917 return ((int)(dest
- start
));
922 * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
925 static int /* O - Count or -1 on error */
927 cups_sbcs_t
*dest
, /* O - Target string */
928 const cups_utf8_t
*src
, /* I - Source string */
929 int maxout
, /* I - Max output */
930 const cups_encoding_t encoding
) /* I - Encoding */
932 cups_sbcs_t
*start
; /* Start of destination string */
933 _cups_vmap_t
*vmap
; /* Legacy DBCS / Unicode Charset Map */
934 cups_vbcs_t
*vrow
; /* Pointer to VBCS row in 'uni2char' */
935 cups_utf32_t unichar
; /* Character value */
936 cups_vbcs_t legchar
; /* Legacy character value */
937 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
938 *workptr
; /* Pointer into string */
942 * Find legacy charset map in cache...
945 if ((vmap
= (_cups_vmap_t
*)get_charmap(encoding
)) == NULL
)
949 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
952 if (cupsUTF8ToUTF32(work
, src
, CUPS_MAX_USTRING
) < 0)
956 * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
959 for (start
= dest
, workptr
= work
+ 1; *workptr
&& maxout
> 1; maxout
--)
961 unichar
= *workptr
++;
966 * Convert ASCII verbatim (optimization)...
971 *dest
++ = (cups_sbcs_t
)unichar
;
976 * Convert unknown character to visible replacement...
979 vrow
= vmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
982 vrow
+= (int)(unichar
& 0xff);
985 legchar
= (cups_vbcs_t
)'?';
987 legchar
= (cups_vbcs_t
)*vrow
;
990 * Save n-byte legacy character...
993 if (legchar
> 0xffffff)
998 *dest
++ = (cups_sbcs_t
)(legchar
>> 24);
999 *dest
++ = (cups_sbcs_t
)(legchar
>> 16);
1000 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
1001 *dest
++ = (cups_sbcs_t
)legchar
;
1005 else if (legchar
> 0xffff)
1010 *dest
++ = (cups_sbcs_t
)(legchar
>> 16);
1011 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
1012 *dest
++ = (cups_sbcs_t
)legchar
;
1016 else if (legchar
> 0xff)
1018 *dest
++ = (cups_sbcs_t
)(legchar
>> 8);
1019 *dest
++ = (cups_sbcs_t
)legchar
;
1029 return ((int)(dest
- start
));
1034 * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
1037 static int /* O - Count or -1 on error */
1039 cups_utf8_t
*dest
, /* O - Target string */
1040 const cups_sbcs_t
*src
, /* I - Source string */
1041 int maxout
, /* I - Max output */
1042 const cups_encoding_t encoding
) /* I - Encoding */
1044 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
1045 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1046 _cups_wide2uni_t
*wide2uni
; /* Pointer to row in 'wide2uni' */
1047 cups_sbcs_t leadchar
; /* Lead char of n-byte legacy char */
1048 cups_vbcs_t legchar
; /* Legacy character value */
1049 cups_utf32_t work
[CUPS_MAX_USTRING
], /* Internal UCS-4 string */
1050 *workptr
; /* Pointer into string */
1054 * Find legacy charset map in cache...
1057 if ((vmap
= (_cups_vmap_t
*)get_charmap(encoding
)) == NULL
)
1061 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
1065 for (workptr
= work
+ 1; *src
&& workptr
< (work
+ CUPS_MAX_USTRING
- 1);)
1068 leadchar
= (cups_sbcs_t
)legchar
;
1071 * Convert ASCII verbatim (optimization)...
1076 *workptr
++ = (cups_utf32_t
)legchar
;
1081 * Convert 2-byte legacy character...
1084 if (vmap
->lead2char
[(int)leadchar
] == leadchar
)
1089 legchar
= (legchar
<< 8) | *src
++;
1092 * Convert unknown character to Replacement Character...
1095 crow
= vmap
->char2uni
[(int)((legchar
>> 8) & 0xff)];
1097 crow
+= (int) (legchar
& 0xff);
1099 if (!crow
|| !*crow
)
1100 *workptr
++ = 0xfffd;
1102 *workptr
++ = (cups_utf32_t
)*crow
;
1107 * Fetch 3-byte or 4-byte legacy character...
1110 if (vmap
->lead3char
[(int)leadchar
] == leadchar
)
1112 if (!*src
|| !src
[1])
1115 legchar
= (legchar
<< 8) | *src
++;
1116 legchar
= (legchar
<< 8) | *src
++;
1118 else if (vmap
->lead4char
[(int)leadchar
] == leadchar
)
1120 if (!*src
|| !src
[1] || !src
[2])
1123 legchar
= (legchar
<< 8) | *src
++;
1124 legchar
= (legchar
<< 8) | *src
++;
1125 legchar
= (legchar
<< 8) | *src
++;
1131 * Find 3-byte or 4-byte legacy character...
1134 wide2uni
= (_cups_wide2uni_t
*)bsearch(&legchar
,
1137 sizeof(_cups_wide2uni_t
),
1141 * Convert unknown character to Replacement Character...
1144 if (!wide2uni
|| !wide2uni
->unichar
)
1145 *workptr
++ = 0xfffd;
1147 *workptr
++ = wide2uni
->unichar
;
1155 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
1158 return (cupsUTF32ToUTF8(dest
, work
, maxout
));
1163 * 'free_sbcs_charmap()' - Free memory used by a single byte character set.
1167 free_sbcs_charmap(_cups_cmap_t
*cmap
) /* I - Character set */
1169 int i
; /* Looping variable */
1172 for (i
= 0; i
< 256; i
++)
1173 if (cmap
->uni2char
[i
])
1174 free(cmap
->uni2char
[i
]);
1181 * 'free_vbcs_charmap()' - Free memory used by a variable byte character set.
1185 free_vbcs_charmap(_cups_vmap_t
*vmap
) /* I - Character set */
1187 int i
; /* Looping variable */
1190 for (i
= 0; i
< 256; i
++)
1191 if (vmap
->char2uni
[i
])
1192 free(vmap
->char2uni
[i
]);
1194 for (i
= 0; i
< 256; i
++)
1195 if (vmap
->uni2char
[i
])
1196 free(vmap
->uni2char
[i
]);
1199 free(vmap
->wide2uni
);
1206 * 'get_charmap()' - Lookup or get a character set map (private).
1208 * This code handles single-byte (SBCS), double-byte (DBCS), and
1209 * variable-byte (VBCS) character sets _without_ charset escapes...
1210 * This code does not handle multiple-byte character sets (MBCS)
1211 * (such as ISO-2022-JP) with charset switching via escapes...
1215 static void * /* O - Charset map pointer */
1217 const cups_encoding_t encoding
) /* I - Encoding */
1219 char filename
[1024]; /* Filename for charset map file */
1220 _cups_globals_t
*cg
= _cupsGlobals(); /* Global data */
1224 * Get the data directory and charset map name...
1227 snprintf(filename
, sizeof(filename
), "%s/charmaps/%s.txt",
1228 cg
->cups_datadir
, _cupsEncodingName(encoding
));
1230 DEBUG_printf((" filename=\"%s\"\n", filename
));
1233 * Read charset map input file into cache...
1236 if (encoding
< CUPS_ENCODING_SBCS_END
)
1237 return (get_sbcs_charmap(encoding
, filename
));
1238 else if (encoding
< CUPS_ENCODING_VBCS_END
)
1239 return (get_vbcs_charmap(encoding
, filename
));
1246 * 'get_charmap_count()' - Count lines in a charmap file.
1249 static int /* O - Count or -1 on error */
1250 get_charmap_count(cups_file_t
*fp
) /* I - File to read from */
1252 int count
; /* Number of lines */
1253 char line
[256]; /* Line from input map file */
1257 * Count lines in map input file...
1262 while (cupsFileGets(fp
, line
, sizeof(line
)))
1267 * Return the number of lines...
1278 * 'get_sbcs_charmap()' - Get SBCS Charmap.
1281 static _cups_cmap_t
* /* O - Charmap or 0 on error */
1283 const cups_encoding_t encoding
, /* I - Charmap Encoding */
1284 const char *filename
) /* I - Charmap Filename */
1286 unsigned long legchar
; /* Legacy character value */
1287 cups_utf32_t unichar
; /* Unicode character value */
1288 _cups_cmap_t
*cmap
; /* Legacy SBCS / Unicode Charset Map */
1289 cups_file_t
*fp
; /* Charset map file pointer */
1290 char *s
; /* Line parsing pointer */
1291 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1292 cups_sbcs_t
*srow
; /* Pointer to SBCS row in 'uni2char' */
1293 char line
[256]; /* Line from charset map file */
1297 * See if we already have this SBCS charset map loaded...
1300 for (cmap
= cmap_cache
; cmap
; cmap
= cmap
->next
)
1302 if (cmap
->encoding
== encoding
)
1305 DEBUG_printf((" returning existing cmap=%p\n", cmap
));
1307 return ((void *)cmap
);
1312 * Open SBCS charset map input file...
1315 if ((fp
= cupsFileOpen(filename
, "r")) == NULL
)
1319 * Allocate memory for SBCS charset map...
1322 if ((cmap
= (_cups_cmap_t
*)calloc(1, sizeof(_cups_cmap_t
))) == NULL
)
1325 DEBUG_puts(" Unable to allocate memory!");
1331 cmap
->encoding
= encoding
;
1334 * Save SBCS charset map into memory for transcoding...
1337 while (cupsFileGets(fp
, line
, sizeof(line
)))
1342 legchar
= strtol(line
, &s
, 16);
1343 if (legchar
< 0 || legchar
> 0xff)
1346 unichar
= strtol(s
, NULL
, 16);
1347 if (unichar
< 0 || unichar
> 0xffff)
1351 * Save legacy to Unicode mapping in direct lookup table...
1354 crow
= cmap
->char2uni
+ legchar
;
1355 *crow
= (cups_ucs2_t
)(unichar
& 0xffff);
1358 * Save Unicode to legacy mapping in indirect lookup table...
1361 srow
= cmap
->uni2char
[(unichar
>> 8) & 0xff];
1364 srow
= (cups_sbcs_t
*)calloc(256, sizeof(cups_sbcs_t
));
1368 cmap
->uni2char
[(unichar
>> 8) & 0xff] = srow
;
1371 srow
+= unichar
& 0xff;
1374 * Convert Replacement Character to visible replacement...
1377 if (unichar
== 0xfffd)
1378 legchar
= (unsigned long)'?';
1381 * First (oldest) legacy character uses Unicode mapping cell...
1385 *srow
= (cups_sbcs_t
)legchar
;
1391 * Add it to the cache and return...
1394 cmap
->next
= cmap_cache
;
1397 DEBUG_printf((" returning new cmap=%p\n", cmap
));
1402 * If we get here, there was an error in the cmap file...
1407 free_sbcs_charmap(cmap
);
1411 DEBUG_puts(" Error, returning NULL!");
1418 * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
1421 static _cups_vmap_t
* /* O - Charmap or 0 on error */
1423 const cups_encoding_t encoding
, /* I - Charmap Encoding */
1424 const char *filename
) /* I - Charmap Filename */
1426 _cups_vmap_t
*vmap
; /* Legacy VBCS / Unicode Charset Map */
1427 cups_ucs2_t
*crow
; /* Pointer to UCS-2 row in 'char2uni' */
1428 cups_vbcs_t
*vrow
; /* Pointer to VBCS row in 'uni2char' */
1429 _cups_wide2uni_t
*wide2uni
; /* Pointer to row in 'wide2uni' */
1430 cups_sbcs_t leadchar
; /* Lead char of 2-byte legacy char */
1431 unsigned long legchar
; /* Legacy character value */
1432 cups_utf32_t unichar
; /* Unicode character value */
1433 int mapcount
; /* Count of lines in charmap file */
1434 cups_file_t
*fp
; /* Charset map file pointer */
1435 char *s
; /* Line parsing pointer */
1436 char line
[256]; /* Line from charset map file */
1437 int i
; /* Loop variable */
1438 int wide
; /* 32-bit legacy char */
1441 DEBUG_printf(("get_vbcs_charmap(encoding=%d, filename=\"%s\")\n",
1442 encoding
, filename
));
1445 * See if we already have this DBCS/VBCS charset map loaded...
1448 for (vmap
= vmap_cache
; vmap
; vmap
= vmap
->next
)
1450 if (vmap
->encoding
== encoding
)
1453 DEBUG_printf((" returning existing vmap=%p\n", vmap
));
1455 return ((void *)vmap
);
1460 * Open VBCS charset map input file...
1463 if ((fp
= cupsFileOpen(filename
, "r")) == NULL
)
1465 DEBUG_printf((" Unable to open file: %s\n", strerror(errno
)));
1471 * Count lines in charmap file...
1474 if ((mapcount
= get_charmap_count(fp
)) <= 0)
1476 DEBUG_puts(" Unable to get charmap count!");
1481 DEBUG_printf((" mapcount=%d\n", mapcount
));
1484 * Allocate memory for DBCS/VBCS charset map...
1487 if ((vmap
= (_cups_vmap_t
*)calloc(1, sizeof(_cups_vmap_t
))) == NULL
)
1490 DEBUG_puts(" Unable to allocate memory!");
1496 vmap
->encoding
= encoding
;
1499 * Save DBCS/VBCS charset map into memory for transcoding...
1510 while (cupsFileGets(fp
, line
, sizeof(line
)))
1515 legchar
= strtoul(line
, &s
, 16);
1516 if (legchar
== ULONG_MAX
)
1519 unichar
= strtol(s
, NULL
, 16);
1520 if (unichar
< 0 || unichar
> 0xffff)
1525 /* DEBUG_printf((" i=%d, legchar=0x%08lx, unichar=0x%04x\n", i,
1526 legchar, (unsigned)unichar)); */
1529 * Save lead char of 2/3/4-byte legacy char...
1532 if (legchar
> 0xff && legchar
<= 0xffff)
1534 leadchar
= (cups_sbcs_t
)(legchar
>> 8);
1535 vmap
->lead2char
[leadchar
] = leadchar
;
1538 if (legchar
> 0xffff && legchar
<= 0xffffff)
1540 leadchar
= (cups_sbcs_t
)(legchar
>> 16);
1541 vmap
->lead3char
[leadchar
] = leadchar
;
1544 if (legchar
> 0xffffff)
1546 leadchar
= (cups_sbcs_t
)(legchar
>> 24);
1547 vmap
->lead4char
[leadchar
] = leadchar
;
1551 * Save Legacy to Unicode mapping...
1554 if (legchar
<= 0xffff)
1557 * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
1560 crow
= vmap
->char2uni
[(int)leadchar
];
1563 crow
= (cups_ucs2_t
*)calloc(256, sizeof(cups_ucs2_t
));
1567 vmap
->char2uni
[(int)leadchar
] = crow
;
1570 crow
[(int)(legchar
& 0xff)] = (cups_ucs2_t
)unichar
;
1575 * Save VBCS 32-bit to Unicode mapping in sorted list table...
1581 vmap
->widecount
= (mapcount
- i
+ 1);
1582 wide2uni
= (_cups_wide2uni_t
*)calloc(vmap
->widecount
,
1583 sizeof(_cups_wide2uni_t
));
1587 vmap
->wide2uni
= wide2uni
;
1590 wide2uni
->widechar
= (cups_vbcs_t
)legchar
;
1591 wide2uni
->unichar
= (cups_ucs2_t
)unichar
;
1596 * Save Unicode to legacy mapping in indirect lookup table...
1599 vrow
= vmap
->uni2char
[(int)((unichar
>> 8) & 0xff)];
1602 vrow
= (cups_vbcs_t
*)calloc(256, sizeof(cups_vbcs_t
));
1606 vmap
->uni2char
[(int) ((unichar
>> 8) & 0xff)] = vrow
;
1609 vrow
+= (int)(unichar
& 0xff);
1612 * Convert Replacement Character to visible replacement...
1615 if (unichar
== 0xfffd)
1616 legchar
= (unsigned long)'?';
1619 * First (oldest) legacy character uses Unicode mapping cell...
1623 *vrow
= (cups_vbcs_t
)legchar
;
1626 vmap
->charcount
= (i
- vmap
->widecount
);
1631 * Add it to the cache and return...
1634 vmap
->next
= vmap_cache
;
1637 DEBUG_printf((" returning new vmap=%p\n", vmap
));
1642 * If we get here, the file contains errors...
1647 free_vbcs_charmap(vmap
);
1651 DEBUG_puts(" Error, returning NULL!");
1658 * End of "$Id: transcode.c 6188 2007-01-10 16:23:06Z mike $"