]>
Commit | Line | Data |
---|---|---|
ef416fc2 | 1 | /* |
7e86f2f6 | 2 | * Transcoding support for CUPS. |
ef416fc2 | 3 | * |
76b6aade | 4 | * Copyright © 2020-2024 by OpenPrinting. |
7e86f2f6 MS |
5 | * Copyright 2007-2014 by Apple Inc. |
6 | * Copyright 1997-2007 by Easy Software Products. | |
ef416fc2 | 7 | * |
e3101897 | 8 | * Licensed under Apache License v2.0. See the file "LICENSE" for more information. |
ef416fc2 | 9 | */ |
10 | ||
11 | /* | |
12 | * Include necessary headers... | |
13 | */ | |
14 | ||
71e16022 | 15 | #include "cups-private.h" |
fb863569 | 16 | #include "debug-internal.h" |
e53920b9 | 17 | #include <limits.h> |
ef416fc2 | 18 | #include <time.h> |
cc754834 MS |
19 | #ifdef HAVE_ICONV_H |
20 | # include <iconv.h> | |
21 | #endif /* HAVE_ICONV_H */ | |
ef416fc2 | 22 | |
23 | ||
/*
 * Local globals...
 *
 * These hold a single cached pair of iconv conversion descriptors for one
 * legacy character set.  A descriptor value of (iconv_t)-1 means "not open",
 * and a map_encoding of CUPS_AUTO_ENCODING means nothing is cached; both are
 * the states that _cupsCharmapFlush() restores.  The conversion functions in
 * this file lock map_mutex before reading or replacing these values.
 */

#ifdef HAVE_ICONV_H
static cups_mutex_t	map_mutex = CUPS_MUTEX_INITIALIZER;
					/* Mutex to control access to maps */
static iconv_t		map_from_utf8 = (iconv_t)-1;
					/* Convert from UTF-8 to charset */
static iconv_t		map_to_utf8 = (iconv_t)-1;
					/* Convert from charset to UTF-8 */
static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
					/* Which charset is cached */
#endif /* HAVE_ICONV_H */
e1d6a774 | 38 | |
ef416fc2 | 39 | |
/*
 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
 *
 * Closes whichever of the two cached iconv descriptors is currently open,
 * marks each as closed with (iconv_t)-1, and resets the cached encoding to
 * CUPS_AUTO_ENCODING.  When HAVE_ICONV_H is not defined this does nothing.
 *
 * This function does not lock map_mutex itself; the conversion functions in
 * this file call it while already holding that mutex.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  size_t	idx;			/* Index into descriptor list */
  iconv_t	*cached[2];		/* Cached descriptors, in close order */


  cached[0] = &map_from_utf8;
  cached[1] = &map_to_utf8;

  for (idx = 0; idx < sizeof(cached) / sizeof(cached[0]); idx ++)
  {
    if (*cached[idx] == (iconv_t)-1)
      continue;				/* Nothing open in this slot */

    iconv_close(*cached[idx]);
    *cached[idx] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
63 | ||
e1d6a774 | 64 | |
ef416fc2 | 65 | /* |
e1d6a774 | 66 | * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8. |
ef416fc2 | 67 | */ |
e1d6a774 | 68 | |
69 | int /* O - Count or -1 on error */ | |
70 | cupsCharsetToUTF8( | |
cc754834 MS |
71 | cups_utf8_t *dest, /* O - Target string */ |
72 | const char *src, /* I - Source string */ | |
73 | const int maxout, /* I - Max output */ | |
e1d6a774 | 74 | const cups_encoding_t encoding) /* I - Encoding */ |
ef416fc2 | 75 | { |
cc754834 | 76 | cups_utf8_t *destptr; /* Pointer into UTF-8 buffer */ |
84315f46 | 77 | #ifdef HAVE_ICONV_H |
cc754834 MS |
78 | size_t srclen, /* Length of source string */ |
79 | outBytesLeft; /* Bytes remaining in output buffer */ | |
7cf5915e | 80 | #endif /* HAVE_ICONV_H */ |
d6ae789d | 81 | |
82 | ||
ef416fc2 | 83 | /* |
84 | * Check for valid arguments... | |
85 | */ | |
86 | ||
e3952d3e | 87 | DEBUG_printf("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding); |
e1d6a774 | 88 | |
cc754834 | 89 | if (!dest || !src || maxout < 1) |
e1d6a774 | 90 | { |
cc754834 MS |
91 | if (dest) |
92 | *dest = '\0'; | |
93 | ||
f11a948a | 94 | DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1"); |
ef416fc2 | 95 | return (-1); |
e1d6a774 | 96 | } |
ef416fc2 | 97 | |
98 | /* | |
99 | * Handle identity conversions... | |
100 | */ | |
101 | ||
cc754834 MS |
102 | if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII || |
103 | encoding >= CUPS_ENCODING_VBCS_END) | |
ef416fc2 | 104 | { |
6ac4da6b | 105 | cupsCopyString((char *)dest, src, (size_t)maxout); |
b86bc4cf | 106 | return ((int)strlen((char *)dest)); |
ef416fc2 | 107 | } |
108 | ||
411affcf | 109 | /* |
110 | * Handle ISO-8859-1 to UTF-8 directly... | |
111 | */ | |
112 | ||
cc754834 MS |
113 | destptr = dest; |
114 | ||
411affcf | 115 | if (encoding == CUPS_ISO8859_1) |
116 | { | |
117 | int ch; /* Character from string */ | |
cc754834 | 118 | cups_utf8_t *destend; /* End of UTF-8 buffer */ |
411affcf | 119 | |
120 | ||
411affcf | 121 | destend = dest + maxout - 2; |
122 | ||
123 | while (*src && destptr < destend) | |
124 | { | |
125 | ch = *src++ & 255; | |
126 | ||
127 | if (ch & 128) | |
128 | { | |
7e86f2f6 MS |
129 | *destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6)); |
130 | *destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f)); | |
411affcf | 131 | } |
132 | else | |
7e86f2f6 | 133 | *destptr++ = (cups_utf8_t)ch; |
411affcf | 134 | } |
135 | ||
136 | *destptr = '\0'; | |
137 | ||
b86bc4cf | 138 | return ((int)(destptr - dest)); |
411affcf | 139 | } |
140 | ||
ef416fc2 | 141 | /* |
e1d6a774 | 142 | * Convert input legacy charset to UTF-8... |
ef416fc2 | 143 | */ |
e1d6a774 | 144 | |
cc754834 | 145 | #ifdef HAVE_ICONV_H |
3a4a4db2 | 146 | cupsMutexLock(&map_mutex); |
d6ae789d | 147 | |
cc754834 MS |
148 | if (map_encoding != encoding) |
149 | { | |
52958fdb MS |
150 | char toset[1024]; /* Destination character set */ |
151 | ||
cc754834 MS |
152 | _cupsCharmapFlush(); |
153 | ||
52958fdb MS |
154 | snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding)); |
155 | ||
156 | map_encoding = encoding; | |
cc754834 | 157 | map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8"); |
52958fdb | 158 | map_to_utf8 = iconv_open("UTF-8", toset); |
cc754834 MS |
159 | } |
160 | ||
161 | if (map_to_utf8 != (iconv_t)-1) | |
162 | { | |
f99f3698 MS |
163 | char *altdestptr = (char *)dest; /* Silence bogus GCC type-punned */ |
164 | ||
cc754834 | 165 | srclen = strlen(src); |
7e86f2f6 | 166 | outBytesLeft = (size_t)maxout - 1; |
4220952d | 167 | |
f99f3698 MS |
168 | iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft); |
169 | *altdestptr = '\0'; | |
cc754834 | 170 | |
3a4a4db2 | 171 | cupsMutexUnlock(&map_mutex); |
cc754834 | 172 | |
f99f3698 | 173 | return ((int)(altdestptr - (char *)dest)); |
cc754834 | 174 | } |
d6ae789d | 175 | |
3a4a4db2 | 176 | cupsMutexUnlock(&map_mutex); |
cc754834 | 177 | #endif /* HAVE_ICONV_H */ |
d6ae789d | 178 | |
cc754834 MS |
179 | /* |
180 | * No iconv() support, so error out... | |
181 | */ | |
182 | ||
183 | *destptr = '\0'; | |
184 | ||
185 | return (-1); | |
ef416fc2 | 186 | } |
187 | ||
e1d6a774 | 188 | |
ef416fc2 | 189 | /* |
e1d6a774 | 190 | * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set. |
ef416fc2 | 191 | */ |
e1d6a774 | 192 | |
193 | int /* O - Count or -1 on error */ | |
194 | cupsUTF8ToCharset( | |
195 | char *dest, /* O - Target string */ | |
196 | const cups_utf8_t *src, /* I - Source string */ | |
197 | const int maxout, /* I - Max output */ | |
198 | const cups_encoding_t encoding) /* I - Encoding */ | |
ef416fc2 | 199 | { |
cc754834 | 200 | char *destptr; /* Pointer into destination */ |
84315f46 | 201 | #ifdef HAVE_ICONV_H |
cc754834 MS |
202 | size_t srclen, /* Length of source string */ |
203 | outBytesLeft; /* Bytes remaining in output buffer */ | |
7cf5915e | 204 | #endif /* HAVE_ICONV_H */ |
d6ae789d | 205 | |
206 | ||
ef416fc2 | 207 | /* |
208 | * Check for valid arguments... | |
209 | */ | |
210 | ||
cc754834 | 211 | if (!dest || !src || maxout < 1) |
e1d6a774 | 212 | { |
213 | if (dest) | |
214 | *dest = '\0'; | |
215 | ||
ef416fc2 | 216 | return (-1); |
e1d6a774 | 217 | } |
ef416fc2 | 218 | |
219 | /* | |
220 | * Handle identity conversions... | |
221 | */ | |
222 | ||
22c9029b | 223 | if (encoding == CUPS_UTF8 || |
cc754834 | 224 | encoding >= CUPS_ENCODING_VBCS_END) |
ef416fc2 | 225 | { |
6ac4da6b | 226 | cupsCopyString(dest, (char *)src, (size_t)maxout); |
b86bc4cf | 227 | return ((int)strlen(dest)); |
ef416fc2 | 228 | } |
229 | ||
411affcf | 230 | /* |
231 | * Handle UTF-8 to ISO-8859-1 directly... | |
232 | */ | |
233 | ||
cc754834 MS |
234 | destptr = dest; |
235 | ||
22c9029b | 236 | if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII) |
411affcf | 237 | { |
22c9029b MS |
238 | int ch, /* Character from string */ |
239 | maxch; /* Maximum character for charset */ | |
cc754834 | 240 | char *destend; /* End of ISO-8859-1 buffer */ |
411affcf | 241 | |
22c9029b | 242 | maxch = encoding == CUPS_ISO8859_1 ? 256 : 128; |
411affcf | 243 | destend = dest + maxout - 1; |
244 | ||
245 | while (*src && destptr < destend) | |
246 | { | |
247 | ch = *src++; | |
248 | ||
249 | if ((ch & 0xe0) == 0xc0) | |
250 | { | |
251 | ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f); | |
252 | ||
22c9029b | 253 | if (ch < maxch) |
7e86f2f6 | 254 | *destptr++ = (char)ch; |
411affcf | 255 | else |
256 | *destptr++ = '?'; | |
257 | } | |
258 | else if ((ch & 0xf0) == 0xe0 || | |
259 | (ch & 0xf8) == 0xf0) | |
260 | *destptr++ = '?'; | |
261 | else if (!(ch & 0x80)) | |
7e86f2f6 | 262 | *destptr++ = (char)ch; |
411affcf | 263 | } |
264 | ||
265 | *destptr = '\0'; | |
266 | ||
b86bc4cf | 267 | return ((int)(destptr - dest)); |
411affcf | 268 | } |
269 | ||
cc754834 | 270 | #ifdef HAVE_ICONV_H |
ef416fc2 | 271 | /* |
e1d6a774 | 272 | * Convert input UTF-8 to legacy charset... |
ef416fc2 | 273 | */ |
e1d6a774 | 274 | |
3a4a4db2 | 275 | cupsMutexLock(&map_mutex); |
d6ae789d | 276 | |
cc754834 MS |
277 | if (map_encoding != encoding) |
278 | { | |
52958fdb MS |
279 | char toset[1024]; /* Destination character set */ |
280 | ||
cc754834 MS |
281 | _cupsCharmapFlush(); |
282 | ||
52958fdb MS |
283 | snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding)); |
284 | ||
cc754834 | 285 | map_encoding = encoding; |
52958fdb MS |
286 | map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8"); |
287 | map_to_utf8 = iconv_open("UTF-8", toset); | |
cc754834 MS |
288 | } |
289 | ||
290 | if (map_from_utf8 != (iconv_t)-1) | |
291 | { | |
f99f3698 MS |
292 | char *altsrc = (char *)src; /* Silence bogus GCC type-punned */ |
293 | ||
cc754834 | 294 | srclen = strlen((char *)src); |
7e86f2f6 | 295 | outBytesLeft = (size_t)maxout - 1; |
4220952d | 296 | |
f99f3698 | 297 | iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft); |
4220952d | 298 | *destptr = '\0'; |
cc754834 | 299 | |
3a4a4db2 | 300 | cupsMutexUnlock(&map_mutex); |
cc754834 MS |
301 | |
302 | return ((int)(destptr - dest)); | |
303 | } | |
d6ae789d | 304 | |
3a4a4db2 | 305 | cupsMutexUnlock(&map_mutex); |
cc754834 MS |
306 | #endif /* HAVE_ICONV_H */ |
307 | ||
308 | /* | |
309 | * No iconv() support, so error out... | |
310 | */ | |
311 | ||
312 | *destptr = '\0'; | |
d6ae789d | 313 | |
cc754834 | 314 | return (-1); |
ef416fc2 | 315 | } |
316 | ||
ef416fc2 | 317 | |
318 | /* | |
319 | * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32. | |
320 | * | |
321 | * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows... | |
322 | * | |
323 | * UTF-32 char UTF-8 char(s) | |
324 | * -------------------------------------------------- | |
e1d6a774 | 325 | * 0 to 127 = 0xxxxxxx (US-ASCII) |
ef416fc2 | 326 | * 128 to 2047 = 110xxxxx 10yyyyyy |
327 | * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz | |
e1d6a774 | 328 | * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx |
ef416fc2 | 329 | * |
330 | * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4, | |
331 | * which would convert to five- or six-octet UTF-8 sequences... | |
ef416fc2 | 332 | */ |
e1d6a774 | 333 | |
334 | int /* O - Count or -1 on error */ | |
335 | cupsUTF8ToUTF32( | |
336 | cups_utf32_t *dest, /* O - Target string */ | |
337 | const cups_utf8_t *src, /* I - Source string */ | |
338 | const int maxout) /* I - Max output */ | |
ef416fc2 | 339 | { |
e1d6a774 | 340 | int i; /* Looping variable */ |
341 | cups_utf8_t ch; /* Character value */ | |
342 | cups_utf8_t next; /* Next character value */ | |
343 | cups_utf32_t ch32; /* UTF-32 character value */ | |
344 | ||
ef416fc2 | 345 | |
346 | /* | |
347 | * Check for valid arguments and clear output... | |
348 | */ | |
e1d6a774 | 349 | |
e3952d3e | 350 | DEBUG_printf("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout); |
c9fc04c6 | 351 | |
e1d6a774 | 352 | if (dest) |
353 | *dest = 0; | |
354 | ||
355 | if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING) | |
c9fc04c6 | 356 | { |
e07d4801 | 357 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)"); |
c9fc04c6 | 358 | |
ef416fc2 | 359 | return (-1); |
c9fc04c6 | 360 | } |
ef416fc2 | 361 | |
362 | /* | |
cda47a96 | 363 | * Convert input UTF-8 to output UTF-32... |
ef416fc2 | 364 | */ |
e1d6a774 | 365 | |
e1d6a774 | 366 | for (i = maxout - 1; *src && i > 0; i --) |
ef416fc2 | 367 | { |
e1d6a774 | 368 | ch = *src++; |
ef416fc2 | 369 | |
370 | /* | |
371 | * Convert UTF-8 character(s) to UTF-32 character... | |
372 | */ | |
e1d6a774 | 373 | |
374 | if (!(ch & 0x80)) | |
ef416fc2 | 375 | { |
376 | /* | |
377 | * One-octet UTF-8 <= 127 (US-ASCII)... | |
378 | */ | |
e1d6a774 | 379 | |
380 | *dest++ = ch; | |
c9fc04c6 | 381 | |
e3952d3e | 382 | DEBUG_printf("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch); |
2abf387c | 383 | continue; |
ef416fc2 | 384 | } |
385 | else if ((ch & 0xe0) == 0xc0) | |
386 | { | |
387 | /* | |
388 | * Two-octet UTF-8 <= 2047 (Latin-x)... | |
389 | */ | |
e1d6a774 | 390 | |
391 | next = *src++; | |
c9fc04c6 MS |
392 | if ((next & 0xc0) != 0x80) |
393 | { | |
e07d4801 | 394 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 395 | |
ef416fc2 | 396 | return (-1); |
c9fc04c6 | 397 | } |
e1d6a774 | 398 | |
7e86f2f6 | 399 | ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f); |
ef416fc2 | 400 | |
401 | /* | |
402 | * Check for non-shortest form (invalid UTF-8)... | |
403 | */ | |
e1d6a774 | 404 | |
405 | if (ch32 < 0x80) | |
c9fc04c6 | 406 | { |
e07d4801 | 407 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 408 | |
ef416fc2 | 409 | return (-1); |
c9fc04c6 | 410 | } |
e1d6a774 | 411 | |
412 | *dest++ = ch32; | |
c9fc04c6 | 413 | |
fd2496a6 | 414 | DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x => %08X", src[-2], src[-1], (unsigned)ch32); |
ef416fc2 | 415 | } |
416 | else if ((ch & 0xf0) == 0xe0) | |
417 | { | |
418 | /* | |
419 | * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)... | |
420 | */ | |
e1d6a774 | 421 | |
422 | next = *src++; | |
c9fc04c6 MS |
423 | if ((next & 0xc0) != 0x80) |
424 | { | |
e07d4801 | 425 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 426 | |
ef416fc2 | 427 | return (-1); |
c9fc04c6 | 428 | } |
e1d6a774 | 429 | |
7e86f2f6 | 430 | ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f); |
e1d6a774 | 431 | |
432 | next = *src++; | |
c9fc04c6 MS |
433 | if ((next & 0xc0) != 0x80) |
434 | { | |
e07d4801 | 435 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 436 | |
ef416fc2 | 437 | return (-1); |
c9fc04c6 | 438 | } |
e1d6a774 | 439 | |
7e86f2f6 | 440 | ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f); |
ef416fc2 | 441 | |
442 | /* | |
443 | * Check for non-shortest form (invalid UTF-8)... | |
444 | */ | |
e1d6a774 | 445 | |
446 | if (ch32 < 0x800) | |
c9fc04c6 | 447 | { |
e07d4801 | 448 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 449 | |
ef416fc2 | 450 | return (-1); |
c9fc04c6 | 451 | } |
e1d6a774 | 452 | |
453 | *dest++ = ch32; | |
c9fc04c6 | 454 | |
fd2496a6 | 455 | DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x %02x => %08X", src[-3], src[-2], src[-1], (unsigned)ch32); |
ef416fc2 | 456 | } |
457 | else if ((ch & 0xf8) == 0xf0) | |
458 | { | |
459 | /* | |
e1d6a774 | 460 | * Four-octet UTF-8... |
ef416fc2 | 461 | */ |
e1d6a774 | 462 | |
463 | next = *src++; | |
c9fc04c6 MS |
464 | if ((next & 0xc0) != 0x80) |
465 | { | |
e07d4801 | 466 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 467 | |
ef416fc2 | 468 | return (-1); |
c9fc04c6 | 469 | } |
e1d6a774 | 470 | |
7e86f2f6 | 471 | ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f); |
e1d6a774 | 472 | |
473 | next = *src++; | |
c9fc04c6 MS |
474 | if ((next & 0xc0) != 0x80) |
475 | { | |
e07d4801 | 476 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 477 | |
e1d6a774 | 478 | return (-1); |
c9fc04c6 | 479 | } |
e1d6a774 | 480 | |
7e86f2f6 | 481 | ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f); |
e1d6a774 | 482 | |
483 | next = *src++; | |
c9fc04c6 MS |
484 | if ((next & 0xc0) != 0x80) |
485 | { | |
e07d4801 | 486 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 487 | |
e1d6a774 | 488 | return (-1); |
c9fc04c6 | 489 | } |
e1d6a774 | 490 | |
7e86f2f6 | 491 | ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f); |
e1d6a774 | 492 | |
ef416fc2 | 493 | /* |
e1d6a774 | 494 | * Check for non-shortest form (invalid UTF-8)... |
ef416fc2 | 495 | */ |
e1d6a774 | 496 | |
497 | if (ch32 < 0x10000) | |
c9fc04c6 | 498 | { |
e07d4801 | 499 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 500 | |
e1d6a774 | 501 | return (-1); |
c9fc04c6 | 502 | } |
e1d6a774 | 503 | |
504 | *dest++ = ch32; | |
c9fc04c6 | 505 | |
fd2496a6 | 506 | DEBUG_printf("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X", src[-4], src[-3], src[-2], src[-1], (unsigned)ch32); |
ef416fc2 | 507 | } |
508 | else | |
509 | { | |
510 | /* | |
e1d6a774 | 511 | * More than 4-octet (invalid UTF-8 sequence)... |
ef416fc2 | 512 | */ |
e1d6a774 | 513 | |
e07d4801 | 514 | DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)"); |
c9fc04c6 | 515 | |
ef416fc2 | 516 | return (-1); |
517 | } | |
518 | ||
519 | /* | |
520 | * Check for UTF-16 surrogate (illegal UTF-8)... | |
521 | */ | |
ef416fc2 | 522 | |
2abf387c | 523 | if (ch32 >= 0xd800 && ch32 <= 0xdfff) |
ef416fc2 | 524 | return (-1); |
525 | } | |
e1d6a774 | 526 | |
ef416fc2 | 527 | *dest = 0; |
e1d6a774 | 528 | |
e3952d3e | 529 | DEBUG_printf("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i); |
c9fc04c6 MS |
530 | |
531 | return (maxout - 1 - i); | |
ef416fc2 | 532 | } |
533 | ||
e1d6a774 | 534 | |
ef416fc2 | 535 | /* |
536 | * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8. | |
537 | * | |
538 | * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows... | |
539 | * | |
540 | * UTF-32 char UTF-8 char(s) | |
541 | * -------------------------------------------------- | |
e1d6a774 | 542 | * 0 to 127 = 0xxxxxxx (US-ASCII) |
ef416fc2 | 543 | * 128 to 2047 = 110xxxxx 10yyyyyy |
544 | * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz | |
e1d6a774 | 545 | * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx |
ef416fc2 | 546 | * |
547 | * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4, | |
548 | * which would convert to five- or six-octet UTF-8 sequences... | |
ef416fc2 | 549 | */ |
e1d6a774 | 550 | |
551 | int /* O - Count or -1 on error */ | |
552 | cupsUTF32ToUTF8( | |
553 | cups_utf8_t *dest, /* O - Target string */ | |
554 | const cups_utf32_t *src, /* I - Source string */ | |
555 | const int maxout) /* I - Max output */ | |
ef416fc2 | 556 | { |
e1d6a774 | 557 | cups_utf8_t *start; /* Start of destination string */ |
558 | int i; /* Looping variable */ | |
559 | int swap; /* Byte-swap input to output */ | |
560 | cups_utf32_t ch; /* Character value */ | |
561 | ||
ef416fc2 | 562 | |
563 | /* | |
564 | * Check for valid arguments and clear output... | |
565 | */ | |
e1d6a774 | 566 | |
e3952d3e | 567 | DEBUG_printf("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout); |
c9fc04c6 | 568 | |
e1d6a774 | 569 | if (dest) |
570 | *dest = '\0'; | |
571 | ||
572 | if (!dest || !src || maxout < 1) | |
c9fc04c6 | 573 | { |
e07d4801 | 574 | DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)"); |
c9fc04c6 | 575 | |
ef416fc2 | 576 | return (-1); |
c9fc04c6 | 577 | } |
ef416fc2 | 578 | |
579 | /* | |
580 | * Check for leading BOM in UTF-32 and inverted BOM... | |
581 | */ | |
e1d6a774 | 582 | |
583 | start = dest; | |
584 | swap = *src == 0xfffe0000; | |
585 | ||
e3952d3e | 586 | DEBUG_printf("4cupsUTF32ToUTF8: swap=%d", swap); |
c9fc04c6 | 587 | |
e1d6a774 | 588 | if (*src == 0xfffe0000 || *src == 0xfeff) |
589 | src ++; | |
ef416fc2 | 590 | |
591 | /* | |
592 | * Convert input UTF-32 to output UTF-8... | |
593 | */ | |
e1d6a774 | 594 | |
595 | for (i = maxout - 1; *src && i > 0;) | |
ef416fc2 | 596 | { |
e1d6a774 | 597 | ch = *src++; |
ef416fc2 | 598 | |
599 | /* | |
600 | * Byte swap input UTF-32, if necessary... | |
e1d6a774 | 601 | * (only byte-swapping 24 of 32 bits) |
ef416fc2 | 602 | */ |
e1d6a774 | 603 | |
ef416fc2 | 604 | if (swap) |
605 | ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000)); | |
606 | ||
607 | /* | |
e1d6a774 | 608 | * Check for beyond Plane 16 (invalid UTF-32)... |
ef416fc2 | 609 | */ |
ef416fc2 | 610 | |
ef416fc2 | 611 | if (ch > 0x10ffff) |
c9fc04c6 | 612 | { |
e07d4801 | 613 | DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)"); |
c9fc04c6 | 614 | |
ef416fc2 | 615 | return (-1); |
c9fc04c6 | 616 | } |
ef416fc2 | 617 | |
ef416fc2 | 618 | /* |
619 | * Convert UTF-32 character to UTF-8 character(s)... | |
620 | */ | |
e1d6a774 | 621 | |
622 | if (ch < 0x80) | |
ef416fc2 | 623 | { |
624 | /* | |
625 | * One-octet UTF-8 <= 127 (US-ASCII)... | |
626 | */ | |
e1d6a774 | 627 | |
628 | *dest++ = (cups_utf8_t)ch; | |
629 | i --; | |
c9fc04c6 | 630 | |
e3952d3e | 631 | DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]); |
ef416fc2 | 632 | } |
e1d6a774 | 633 | else if (ch < 0x800) |
ef416fc2 | 634 | { |
635 | /* | |
636 | * Two-octet UTF-8 <= 2047 (Latin-x)... | |
637 | */ | |
e1d6a774 | 638 | |
639 | if (i < 2) | |
c9fc04c6 | 640 | { |
e07d4801 | 641 | DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)"); |
c9fc04c6 | 642 | |
e1d6a774 | 643 | return (-1); |
c9fc04c6 | 644 | } |
e1d6a774 | 645 | |
646 | *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f)); | |
647 | *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f)); | |
648 | i -= 2; | |
c9fc04c6 | 649 | |
fd2496a6 | 650 | DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch, dest[-2], dest[-1]); |
ef416fc2 | 651 | } |
e1d6a774 | 652 | else if (ch < 0x10000) |
ef416fc2 | 653 | { |
654 | /* | |
655 | * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)... | |
656 | */ | |
e1d6a774 | 657 | |
658 | if (i < 3) | |
c9fc04c6 | 659 | { |
e07d4801 | 660 | DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)"); |
c9fc04c6 | 661 | |
e1d6a774 | 662 | return (-1); |
c9fc04c6 | 663 | } |
e1d6a774 | 664 | |
665 | *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f)); | |
666 | *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f)); | |
667 | *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f)); | |
668 | i -= 3; | |
c9fc04c6 | 669 | |
fd2496a6 | 670 | DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch, dest[-3], dest[-2], dest[-1]); |
e1d6a774 | 671 | } |
672 | else | |
673 | { | |
674 | /* | |
675 | * Four-octet UTF-8... | |
676 | */ | |
677 | ||
678 | if (i < 4) | |
e07d4801 MS |
679 | { |
680 | DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)"); | |
681 | ||
e1d6a774 | 682 | return (-1); |
e07d4801 | 683 | } |
e1d6a774 | 684 | |
685 | *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07)); | |
686 | *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f)); | |
687 | *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f)); | |
688 | *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f)); | |
689 | i -= 4; | |
c9fc04c6 | 690 | |
fd2496a6 | 691 | DEBUG_printf("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x", (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]); |
ef416fc2 | 692 | } |
693 | } | |
e1d6a774 | 694 | |
ef416fc2 | 695 | *dest = '\0'; |
e1d6a774 | 696 | |
e3952d3e | 697 | DEBUG_printf("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)); |
c9fc04c6 | 698 | |
e1d6a774 | 699 | return ((int)(dest - start)); |
ef416fc2 | 700 | } |