]> git.ipfire.org Git - thirdparty/cups.git/blob - cups/transcode.c
f9736029a850afa6816803633f9d62c8e643dbbc
[thirdparty/cups.git] / cups / transcode.c
1 /*
2 * "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
3 *
4 * Transcoding support for CUPS.
5 *
6 * Copyright 2007-2010 by Apple Inc.
7 * Copyright 1997-2007 by Easy Software Products.
8 *
9 * These coded instructions, statements, and computer programs are the
10 * property of Apple Inc. and are protected by Federal copyright
11 * law. Distribution and use rights are outlined in the file "LICENSE.txt"
12 * which should have been included with this file. If this
13 * file is missing or damaged, see the license at "http://www.cups.org/".
14 *
15 * This file is subject to the Apple OS-Developed Software exception.
16 *
17 * Contents:
18 *
19 * _cupsCharmapFlush() - Flush all character set maps out of cache.
20 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
21 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
22 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
23 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
24 */
25
26 /*
27 * Include necessary headers...
28 */
29
30 #include "cups-private.h"
31 #include <limits.h>
32 #include <time.h>
33 #ifdef HAVE_ICONV_H
34 # include <iconv.h>
35 #endif /* HAVE_ICONV_H */
36
37
/*
 * Local globals...
 *
 * One pair of iconv descriptors is cached for the most recently requested
 * legacy character set.  A descriptor value of (iconv_t)-1 means "not open".
 */

#ifdef HAVE_ICONV_H
/* Mutex to control access to maps */
static _cups_mutex_t	map_mutex = _CUPS_MUTEX_INITIALIZER;

/* Convert from UTF-8 to charset */
static iconv_t		map_from_utf8 = (iconv_t)-1;

/* Convert from charset to UTF-8 */
static iconv_t		map_to_utf8 = (iconv_t)-1;

/* Which charset is cached */
static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
52
53
/*
 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
 *
 * Closes whichever cached iconv descriptors are currently open and marks the
 * cache as holding no encoding.  When built without iconv support this is a
 * no-op.  The function does not take "map_mutex" itself; the converters in
 * this file call it with the mutex already held.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  const iconv_t	unopened = (iconv_t)-1;	/* "No descriptor" marker */


 /*
  * Release the UTF-8 -> charset converter, then the charset -> UTF-8 one...
  */

  if (map_from_utf8 != unopened)
    iconv_close(map_from_utf8);

  map_from_utf8 = unopened;

  if (map_to_utf8 != unopened)
    iconv_close(map_to_utf8);

  map_to_utf8 = unopened;

 /*
  * Forget which charset was cached so the next conversion reopens it...
  */

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
77
78
79 /*
80 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
81 */
82
83 int /* O - Count or -1 on error */
84 cupsCharsetToUTF8(
85 cups_utf8_t *dest, /* O - Target string */
86 const char *src, /* I - Source string */
87 const int maxout, /* I - Max output */
88 const cups_encoding_t encoding) /* I - Encoding */
89 {
90 cups_utf8_t *destptr; /* Pointer into UTF-8 buffer */
91 #ifdef HAVE_ICONV_H
92 size_t srclen, /* Length of source string */
93 outBytesLeft; /* Bytes remaining in output buffer */
94 #endif /* HAVE_ICONV_H */
95
96
97 /*
98 * Check for valid arguments...
99 */
100
101 DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
102 dest, src, maxout, encoding));
103
104 if (!dest || !src || maxout < 1)
105 {
106 if (dest)
107 *dest = '\0';
108
109 DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
110 return (-1);
111 }
112
113 /*
114 * Handle identity conversions...
115 */
116
117 if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
118 encoding >= CUPS_ENCODING_VBCS_END)
119 {
120 strlcpy((char *)dest, src, maxout);
121 return ((int)strlen((char *)dest));
122 }
123
124 /*
125 * Handle ISO-8859-1 to UTF-8 directly...
126 */
127
128 destptr = dest;
129
130 if (encoding == CUPS_ISO8859_1)
131 {
132 int ch; /* Character from string */
133 cups_utf8_t *destend; /* End of UTF-8 buffer */
134
135
136 destend = dest + maxout - 2;
137
138 while (*src && destptr < destend)
139 {
140 ch = *src++ & 255;
141
142 if (ch & 128)
143 {
144 *destptr++ = 0xc0 | (ch >> 6);
145 *destptr++ = 0x80 | (ch & 0x3f);
146 }
147 else
148 *destptr++ = ch;
149 }
150
151 *destptr = '\0';
152
153 return ((int)(destptr - dest));
154 }
155
156 /*
157 * Convert input legacy charset to UTF-8...
158 */
159
160 #ifdef HAVE_ICONV_H
161 _cupsMutexLock(&map_mutex);
162
163 if (map_encoding != encoding)
164 {
165 _cupsCharmapFlush();
166
167 map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
168 map_to_utf8 = iconv_open("UTF-8", _cupsEncodingName(encoding));
169 map_encoding = encoding;
170 }
171
172 if (map_to_utf8 != (iconv_t)-1)
173 {
174 srclen = strlen(src);
175 outBytesLeft = maxout - 1;
176
177 iconv(map_to_utf8, (char **)&src, &srclen, (char **)&destptr,
178 &outBytesLeft);
179 *destptr = '\0';
180
181 _cupsMutexUnlock(&map_mutex);
182
183 return ((int)(destptr - dest));
184 }
185
186 _cupsMutexUnlock(&map_mutex);
187 #endif /* HAVE_ICONV_H */
188
189 /*
190 * No iconv() support, so error out...
191 */
192
193 *destptr = '\0';
194
195 return (-1);
196 }
197
198
199 /*
200 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
201 */
202
203 int /* O - Count or -1 on error */
204 cupsUTF8ToCharset(
205 char *dest, /* O - Target string */
206 const cups_utf8_t *src, /* I - Source string */
207 const int maxout, /* I - Max output */
208 const cups_encoding_t encoding) /* I - Encoding */
209 {
210 char *destptr; /* Pointer into destination */
211 #ifdef HAVE_ICONV_H
212 size_t srclen, /* Length of source string */
213 outBytesLeft; /* Bytes remaining in output buffer */
214 #endif /* HAVE_ICONV_H */
215
216
217 /*
218 * Check for valid arguments...
219 */
220
221 if (!dest || !src || maxout < 1)
222 {
223 if (dest)
224 *dest = '\0';
225
226 return (-1);
227 }
228
229 /*
230 * Handle identity conversions...
231 */
232
233 if (encoding == CUPS_UTF8 ||
234 encoding >= CUPS_ENCODING_VBCS_END)
235 {
236 strlcpy(dest, (char *)src, maxout);
237 return ((int)strlen(dest));
238 }
239
240 /*
241 * Handle UTF-8 to ISO-8859-1 directly...
242 */
243
244 destptr = dest;
245
246 if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
247 {
248 int ch, /* Character from string */
249 maxch; /* Maximum character for charset */
250 char *destend; /* End of ISO-8859-1 buffer */
251
252 maxch = encoding == CUPS_ISO8859_1 ? 256 : 128;
253 destend = dest + maxout - 1;
254
255 while (*src && destptr < destend)
256 {
257 ch = *src++;
258
259 if ((ch & 0xe0) == 0xc0)
260 {
261 ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
262
263 if (ch < maxch)
264 *destptr++ = ch;
265 else
266 *destptr++ = '?';
267 }
268 else if ((ch & 0xf0) == 0xe0 ||
269 (ch & 0xf8) == 0xf0)
270 *destptr++ = '?';
271 else if (!(ch & 0x80))
272 *destptr++ = ch;
273 }
274
275 *destptr = '\0';
276
277 return ((int)(destptr - dest));
278 }
279
280 #ifdef HAVE_ICONV_H
281 /*
282 * Convert input UTF-8 to legacy charset...
283 */
284
285 _cupsMutexLock(&map_mutex);
286
287 if (map_encoding != encoding)
288 {
289 _cupsCharmapFlush();
290
291 map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
292 map_to_utf8 = iconv_open("UTF-8", _cupsEncodingName(encoding));
293 map_encoding = encoding;
294 }
295
296 if (map_from_utf8 != (iconv_t)-1)
297 {
298 srclen = strlen((char *)src);
299 outBytesLeft = maxout - 1;
300
301 iconv(map_from_utf8, (char **)&src, &srclen, &destptr, &outBytesLeft);
302 *destptr = '\0';
303
304 _cupsMutexUnlock(&map_mutex);
305
306 return ((int)(destptr - dest));
307 }
308
309 _cupsMutexUnlock(&map_mutex);
310 #endif /* HAVE_ICONV_H */
311
312 /*
313 * No iconv() support, so error out...
314 */
315
316 *destptr = '\0';
317
318 return (-1);
319 }
320
321
322 /*
323 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
324 *
325 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
326 *
327 * UTF-32 char UTF-8 char(s)
328 * --------------------------------------------------
329 * 0 to 127 = 0xxxxxxx (US-ASCII)
330 * 128 to 2047 = 110xxxxx 10yyyyyy
331 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
332 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
333 *
334 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
335 * which would convert to five- or six-octet UTF-8 sequences...
336 */
337
338 int /* O - Count or -1 on error */
339 cupsUTF8ToUTF32(
340 cups_utf32_t *dest, /* O - Target string */
341 const cups_utf8_t *src, /* I - Source string */
342 const int maxout) /* I - Max output */
343 {
344 int i; /* Looping variable */
345 cups_utf8_t ch; /* Character value */
346 cups_utf8_t next; /* Next character value */
347 cups_utf32_t ch32; /* UTF-32 character value */
348
349
350 /*
351 * Check for valid arguments and clear output...
352 */
353
354 DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
355 src, maxout));
356
357 if (dest)
358 *dest = 0;
359
360 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
361 {
362 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
363
364 return (-1);
365 }
366
367 /*
368 * Convert input UTF-8 to output UTF-32...
369 */
370
371 for (i = maxout - 1; *src && i > 0; i --)
372 {
373 ch = *src++;
374
375 /*
376 * Convert UTF-8 character(s) to UTF-32 character...
377 */
378
379 if (!(ch & 0x80))
380 {
381 /*
382 * One-octet UTF-8 <= 127 (US-ASCII)...
383 */
384
385 *dest++ = ch;
386
387 DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
388 continue;
389 }
390 else if ((ch & 0xe0) == 0xc0)
391 {
392 /*
393 * Two-octet UTF-8 <= 2047 (Latin-x)...
394 */
395
396 next = *src++;
397 if ((next & 0xc0) != 0x80)
398 {
399 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
400
401 return (-1);
402 }
403
404 ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
405
406 /*
407 * Check for non-shortest form (invalid UTF-8)...
408 */
409
410 if (ch32 < 0x80)
411 {
412 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
413
414 return (-1);
415 }
416
417 *dest++ = ch32;
418
419 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
420 src[-2], src[-1], (unsigned)ch32));
421 }
422 else if ((ch & 0xf0) == 0xe0)
423 {
424 /*
425 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
426 */
427
428 next = *src++;
429 if ((next & 0xc0) != 0x80)
430 {
431 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
432
433 return (-1);
434 }
435
436 ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
437
438 next = *src++;
439 if ((next & 0xc0) != 0x80)
440 {
441 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
442
443 return (-1);
444 }
445
446 ch32 = (ch32 << 6) | (next & 0x3f);
447
448 /*
449 * Check for non-shortest form (invalid UTF-8)...
450 */
451
452 if (ch32 < 0x800)
453 {
454 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
455
456 return (-1);
457 }
458
459 *dest++ = ch32;
460
461 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
462 src[-3], src[-2], src[-1], (unsigned)ch32));
463 }
464 else if ((ch & 0xf8) == 0xf0)
465 {
466 /*
467 * Four-octet UTF-8...
468 */
469
470 next = *src++;
471 if ((next & 0xc0) != 0x80)
472 {
473 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
474
475 return (-1);
476 }
477
478 ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
479
480 next = *src++;
481 if ((next & 0xc0) != 0x80)
482 {
483 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
484
485 return (-1);
486 }
487
488 ch32 = (ch32 << 6) | (next & 0x3f);
489
490 next = *src++;
491 if ((next & 0xc0) != 0x80)
492 {
493 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
494
495 return (-1);
496 }
497
498 ch32 = (ch32 << 6) | (next & 0x3f);
499
500 /*
501 * Check for non-shortest form (invalid UTF-8)...
502 */
503
504 if (ch32 < 0x10000)
505 {
506 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
507
508 return (-1);
509 }
510
511 *dest++ = ch32;
512
513 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
514 src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
515 }
516 else
517 {
518 /*
519 * More than 4-octet (invalid UTF-8 sequence)...
520 */
521
522 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
523
524 return (-1);
525 }
526
527 /*
528 * Check for UTF-16 surrogate (illegal UTF-8)...
529 */
530
531 if (ch32 >= 0xd800 && ch32 <= 0xdfff)
532 return (-1);
533 }
534
535 *dest = 0;
536
537 DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
538
539 return (maxout - 1 - i);
540 }
541
542
543 /*
544 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
545 *
546 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
547 *
548 * UTF-32 char UTF-8 char(s)
549 * --------------------------------------------------
550 * 0 to 127 = 0xxxxxxx (US-ASCII)
551 * 128 to 2047 = 110xxxxx 10yyyyyy
552 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
553 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
554 *
555 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
556 * which would convert to five- or six-octet UTF-8 sequences...
557 */
558
559 int /* O - Count or -1 on error */
560 cupsUTF32ToUTF8(
561 cups_utf8_t *dest, /* O - Target string */
562 const cups_utf32_t *src, /* I - Source string */
563 const int maxout) /* I - Max output */
564 {
565 cups_utf8_t *start; /* Start of destination string */
566 int i; /* Looping variable */
567 int swap; /* Byte-swap input to output */
568 cups_utf32_t ch; /* Character value */
569
570
571 /*
572 * Check for valid arguments and clear output...
573 */
574
575 DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
576 maxout));
577
578 if (dest)
579 *dest = '\0';
580
581 if (!dest || !src || maxout < 1)
582 {
583 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
584
585 return (-1);
586 }
587
588 /*
589 * Check for leading BOM in UTF-32 and inverted BOM...
590 */
591
592 start = dest;
593 swap = *src == 0xfffe0000;
594
595 DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
596
597 if (*src == 0xfffe0000 || *src == 0xfeff)
598 src ++;
599
600 /*
601 * Convert input UTF-32 to output UTF-8...
602 */
603
604 for (i = maxout - 1; *src && i > 0;)
605 {
606 ch = *src++;
607
608 /*
609 * Byte swap input UTF-32, if necessary...
610 * (only byte-swapping 24 of 32 bits)
611 */
612
613 if (swap)
614 ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
615
616 /*
617 * Check for beyond Plane 16 (invalid UTF-32)...
618 */
619
620 if (ch > 0x10ffff)
621 {
622 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
623
624 return (-1);
625 }
626
627 /*
628 * Convert UTF-32 character to UTF-8 character(s)...
629 */
630
631 if (ch < 0x80)
632 {
633 /*
634 * One-octet UTF-8 <= 127 (US-ASCII)...
635 */
636
637 *dest++ = (cups_utf8_t)ch;
638 i --;
639
640 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
641 }
642 else if (ch < 0x800)
643 {
644 /*
645 * Two-octet UTF-8 <= 2047 (Latin-x)...
646 */
647
648 if (i < 2)
649 {
650 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
651
652 return (-1);
653 }
654
655 *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
656 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
657 i -= 2;
658
659 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
660 dest[-2], dest[-1]));
661 }
662 else if (ch < 0x10000)
663 {
664 /*
665 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
666 */
667
668 if (i < 3)
669 {
670 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
671
672 return (-1);
673 }
674
675 *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
676 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
677 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
678 i -= 3;
679
680 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
681 dest[-3], dest[-2], dest[-1]));
682 }
683 else
684 {
685 /*
686 * Four-octet UTF-8...
687 */
688
689 if (i < 4)
690 {
691 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
692
693 return (-1);
694 }
695
696 *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
697 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
698 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
699 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
700 i -= 4;
701
702 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
703 (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
704 }
705 }
706
707 *dest = '\0';
708
709 DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
710
711 return ((int)(dest - start));
712 }
713
714
715 /*
716 * End of "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
717 */