2 * Transcoding support for CUPS.
4 * Copyright © 2020-2024 by OpenPrinting.
5 * Copyright 2007-2014 by Apple Inc.
6 * Copyright 1997-2007 by Easy Software Products.
8 * Licensed under Apache License v2.0. See the file "LICENSE" for more information.
12 * Include necessary headers...
15 #include "cups-private.h"
16 #include "debug-internal.h"
21 #endif /* HAVE_ICONV_H */
#ifdef HAVE_ICONV_H
/*
 * Cached iconv state shared by the charset conversion functions below.
 */

/* Mutex to control access to maps */
static cups_mutex_t	map_mutex = CUPS_MUTEX_INITIALIZER;

/* Convert from UTF-8 to charset; (iconv_t)-1 when not open */
static iconv_t		map_from_utf8 = (iconv_t)-1;

/* Convert from charset to UTF-8; (iconv_t)-1 when not open */
static iconv_t		map_to_utf8 = (iconv_t)-1;

/* Which charset is cached */
static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
/*
 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
 *
 * Closes whichever cached iconv descriptors are currently open, marks them
 * as closed, and resets the cached encoding to CUPS_AUTO_ENCODING.  No
 * locking is done here.  Without iconv support this is a no-op.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  size_t	n;			/* Looping var */
  iconv_t	*cached[2];		/* Cached conversion descriptors */


  cached[0] = &map_from_utf8;
  cached[1] = &map_to_utf8;

  for (n = 0; n < (sizeof(cached) / sizeof(cached[0])); n ++)
  {
    if (*cached[n] == (iconv_t)-1)
      continue;				/* Not open, nothing to close */

    iconv_close(*cached[n]);
    *cached[n] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
66 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
69 int /* O - Count or -1 on error */
71 cups_utf8_t
*dest
, /* O - Target string */
72 const char *src
, /* I - Source string */
73 const int maxout
, /* I - Max output */
74 const cups_encoding_t encoding
) /* I - Encoding */
76 cups_utf8_t
*destptr
; /* Pointer into UTF-8 buffer */
78 size_t srclen
, /* Length of source string */
79 outBytesLeft
; /* Bytes remaining in output buffer */
80 #endif /* HAVE_ICONV_H */
84 * Check for valid arguments...
87 DEBUG_printf("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest
, src
, maxout
, encoding
);
89 if (!dest
|| !src
|| maxout
< 1)
94 DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
99 * Handle identity conversions...
102 if (encoding
== CUPS_UTF8
|| encoding
<= CUPS_US_ASCII
||
103 encoding
>= CUPS_ENCODING_VBCS_END
)
105 cupsCopyString((char *)dest
, src
, (size_t)maxout
);
106 return ((int)strlen((char *)dest
));
110 * Handle ISO-8859-1 to UTF-8 directly...
115 if (encoding
== CUPS_ISO8859_1
)
117 int ch
; /* Character from string */
118 cups_utf8_t
*destend
; /* End of UTF-8 buffer */
121 destend
= dest
+ maxout
- 2;
123 while (*src
&& destptr
< destend
)
129 *destptr
++ = (cups_utf8_t
)(0xc0 | (ch
>> 6));
130 *destptr
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
133 *destptr
++ = (cups_utf8_t
)ch
;
138 return ((int)(destptr
- dest
));
142 * Convert input legacy charset to UTF-8...
146 cupsMutexLock(&map_mutex
);
148 if (map_encoding
!= encoding
)
150 char toset
[1024]; /* Destination character set */
154 snprintf(toset
, sizeof(toset
), "%s//IGNORE", _cupsEncodingName(encoding
));
156 map_encoding
= encoding
;
157 map_from_utf8
= iconv_open(_cupsEncodingName(encoding
), "UTF-8");
158 map_to_utf8
= iconv_open("UTF-8", toset
);
161 if (map_to_utf8
!= (iconv_t
)-1)
163 char *altdestptr
= (char *)dest
; /* Silence bogus GCC type-punned */
165 srclen
= strlen(src
);
166 outBytesLeft
= (size_t)maxout
- 1;
168 iconv(map_to_utf8
, (char **)&src
, &srclen
, &altdestptr
, &outBytesLeft
);
171 cupsMutexUnlock(&map_mutex
);
173 return ((int)(altdestptr
- (char *)dest
));
176 cupsMutexUnlock(&map_mutex
);
177 #endif /* HAVE_ICONV_H */
180 * No iconv() support, so error out...
190 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
193 int /* O - Count or -1 on error */
195 char *dest
, /* O - Target string */
196 const cups_utf8_t
*src
, /* I - Source string */
197 const int maxout
, /* I - Max output */
198 const cups_encoding_t encoding
) /* I - Encoding */
200 char *destptr
; /* Pointer into destination */
202 size_t srclen
, /* Length of source string */
203 outBytesLeft
; /* Bytes remaining in output buffer */
204 #endif /* HAVE_ICONV_H */
208 * Check for valid arguments...
211 if (!dest
|| !src
|| maxout
< 1)
220 * Handle identity conversions...
223 if (encoding
== CUPS_UTF8
||
224 encoding
>= CUPS_ENCODING_VBCS_END
)
226 cupsCopyString(dest
, (char *)src
, (size_t)maxout
);
227 return ((int)strlen(dest
));
231 * Handle UTF-8 to ISO-8859-1 directly...
236 if (encoding
== CUPS_ISO8859_1
|| encoding
<= CUPS_US_ASCII
)
238 int ch
, /* Character from string */
239 maxch
; /* Maximum character for charset */
240 char *destend
; /* End of ISO-8859-1 buffer */
242 maxch
= encoding
== CUPS_ISO8859_1
? 256 : 128;
243 destend
= dest
+ maxout
- 1;
245 while (*src
&& destptr
< destend
)
249 if ((ch
& 0xe0) == 0xc0)
251 ch
= ((ch
& 0x1f) << 6) | (*src
++ & 0x3f);
254 *destptr
++ = (char)ch
;
258 else if ((ch
& 0xf0) == 0xe0 ||
261 else if (!(ch
& 0x80))
262 *destptr
++ = (char)ch
;
267 return ((int)(destptr
- dest
));
272 * Convert input UTF-8 to legacy charset...
275 cupsMutexLock(&map_mutex
);
277 if (map_encoding
!= encoding
)
279 char toset
[1024]; /* Destination character set */
283 snprintf(toset
, sizeof(toset
), "%s//IGNORE", _cupsEncodingName(encoding
));
285 map_encoding
= encoding
;
286 map_from_utf8
= iconv_open(_cupsEncodingName(encoding
), "UTF-8");
287 map_to_utf8
= iconv_open("UTF-8", toset
);
290 if (map_from_utf8
!= (iconv_t
)-1)
292 char *altsrc
= (char *)src
; /* Silence bogus GCC type-punned */
294 srclen
= strlen((char *)src
);
295 outBytesLeft
= (size_t)maxout
- 1;
297 iconv(map_from_utf8
, &altsrc
, &srclen
, &destptr
, &outBytesLeft
);
300 cupsMutexUnlock(&map_mutex
);
302 return ((int)(destptr
- dest
));
305 cupsMutexUnlock(&map_mutex
);
306 #endif /* HAVE_ICONV_H */
309 * No iconv() support, so error out...
319 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
321 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
323 * UTF-32 char UTF-8 char(s)
324 * --------------------------------------------------
325 * 0 to 127 = 0xxxxxxx (US-ASCII)
326 * 128 to 2047 = 110xxxxx 10yyyyyy
327 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
328 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
330 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
331 * which would convert to five- or six-octet UTF-8 sequences...
334 int /* O - Count or -1 on error */
336 cups_utf32_t
*dest
, /* O - Target string */
337 const cups_utf8_t
*src
, /* I - Source string */
338 const int maxout
) /* I - Max output */
340 int i
; /* Looping variable */
341 cups_utf8_t ch
; /* Character value */
342 cups_utf8_t next
; /* Next character value */
343 cups_utf32_t ch32
; /* UTF-32 character value */
347 * Check for valid arguments and clear output...
350 DEBUG_printf("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest
, src
, maxout
);
355 if (!dest
|| !src
|| maxout
< 1 || maxout
> CUPS_MAX_USTRING
)
357 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
363 * Convert input UTF-8 to output UTF-32...
366 for (i
= maxout
- 1; *src
&& i
> 0; i
--)
371 * Convert UTF-8 character(s) to UTF-32 character...
377 * One-octet UTF-8 <= 127 (US-ASCII)...
382 DEBUG_printf("4cupsUTF8ToUTF32: %02x => %08X", src
[-1], ch
);
385 else if ((ch
& 0xe0) == 0xc0)
388 * Two-octet UTF-8 <= 2047 (Latin-x)...
392 if ((next
& 0xc0) != 0x80)
394 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
399 ch32
= (cups_utf32_t
)((ch
& 0x1f) << 6) | (cups_utf32_t
)(next
& 0x3f);
402 * Check for non-shortest form (invalid UTF-8)...
407 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
414 DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x => %08X", src
[-2], src
[-1], (unsigned)ch32
);
416 else if ((ch
& 0xf0) == 0xe0)
419 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
423 if ((next
& 0xc0) != 0x80)
425 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
430 ch32
= (cups_utf32_t
)((ch
& 0x0f) << 6) | (cups_utf32_t
)(next
& 0x3f);
433 if ((next
& 0xc0) != 0x80)
435 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
440 ch32
= (ch32
<< 6) | (cups_utf32_t
)(next
& 0x3f);
443 * Check for non-shortest form (invalid UTF-8)...
448 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
455 DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x %02x => %08X", src
[-3], src
[-2], src
[-1], (unsigned)ch32
);
457 else if ((ch
& 0xf8) == 0xf0)
460 * Four-octet UTF-8...
464 if ((next
& 0xc0) != 0x80)
466 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
471 ch32
= (cups_utf32_t
)((ch
& 0x07) << 6) | (cups_utf32_t
)(next
& 0x3f);
474 if ((next
& 0xc0) != 0x80)
476 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
481 ch32
= (ch32
<< 6) | (cups_utf32_t
)(next
& 0x3f);
484 if ((next
& 0xc0) != 0x80)
486 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
491 ch32
= (ch32
<< 6) | (cups_utf32_t
)(next
& 0x3f);
494 * Check for non-shortest form (invalid UTF-8)...
499 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
506 DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X", src
[-4], src
[-3], src
[-2], src
[-1], (unsigned)ch32
);
511 * More than 4-octet (invalid UTF-8 sequence)...
514 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
520 * Check for UTF-16 surrogate (illegal UTF-8)...
523 if (ch32
>= 0xd800 && ch32
<= 0xdfff)
529 DEBUG_printf("3cupsUTF8ToUTF32: Returning %d characters", maxout
- 1 - i
);
531 return (maxout
- 1 - i
);
536 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
538 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
540 * UTF-32 char UTF-8 char(s)
541 * --------------------------------------------------
542 * 0 to 127 = 0xxxxxxx (US-ASCII)
543 * 128 to 2047 = 110xxxxx 10yyyyyy
544 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
545 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
547 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
548 * which would convert to five- or six-octet UTF-8 sequences...
551 int /* O - Count or -1 on error */
553 cups_utf8_t
*dest
, /* O - Target string */
554 const cups_utf32_t
*src
, /* I - Source string */
555 const int maxout
) /* I - Max output */
557 cups_utf8_t
*start
; /* Start of destination string */
558 int i
; /* Looping variable */
559 int swap
; /* Byte-swap input to output */
560 cups_utf32_t ch
; /* Character value */
564 * Check for valid arguments and clear output...
567 DEBUG_printf("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest
, (void *)src
, maxout
);
572 if (!dest
|| !src
|| maxout
< 1)
574 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
580 * Check for leading BOM in UTF-32 and inverted BOM...
584 swap
= *src
== 0xfffe0000;
586 DEBUG_printf("4cupsUTF32ToUTF8: swap=%d", swap
);
588 if (*src
== 0xfffe0000 || *src
== 0xfeff)
592 * Convert input UTF-32 to output UTF-8...
595 for (i
= maxout
- 1; *src
&& i
> 0;)
600 * Byte swap input UTF-32, if necessary...
601 * (only byte-swapping 24 of 32 bits)
605 ch
= ((ch
>> 24) | ((ch
>> 8) & 0xff00) | ((ch
<< 8) & 0xff0000));
608 * Check for beyond Plane 16 (invalid UTF-32)...
613 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
619 * Convert UTF-32 character to UTF-8 character(s)...
625 * One-octet UTF-8 <= 127 (US-ASCII)...
628 *dest
++ = (cups_utf8_t
)ch
;
631 DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch
, dest
[-1]);
636 * Two-octet UTF-8 <= 2047 (Latin-x)...
641 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
646 *dest
++ = (cups_utf8_t
)(0xc0 | ((ch
>> 6) & 0x1f));
647 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
650 DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch
, dest
[-2], dest
[-1]);
652 else if (ch
< 0x10000)
655 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
660 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
665 *dest
++ = (cups_utf8_t
)(0xe0 | ((ch
>> 12) & 0x0f));
666 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
667 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
670 DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch
, dest
[-3], dest
[-2], dest
[-1]);
675 * Four-octet UTF-8...
680 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
685 *dest
++ = (cups_utf8_t
)(0xf0 | ((ch
>> 18) & 0x07));
686 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 12) & 0x3f));
687 *dest
++ = (cups_utf8_t
)(0x80 | ((ch
>> 6) & 0x3f));
688 *dest
++ = (cups_utf8_t
)(0x80 | (ch
& 0x3f));
691 DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x", (unsigned)ch
, dest
[-4], dest
[-3], dest
[-2], dest
[-1]);
697 DEBUG_printf("3cupsUTF32ToUTF8: Returning %d", (int)(dest
- start
));
699 return ((int)(dest
- start
));