]> git.ipfire.org Git - thirdparty/cups.git/blame - cups/transcode.c
Merge changes from CUPS 1.5svn-r9105.
[thirdparty/cups.git] / cups / transcode.c
CommitLineData
ef416fc2 1/*
75bd9771 2 * "$Id: transcode.c 7560 2008-05-13 06:34:04Z mike $"
ef416fc2 3 *
71e16022 4 * Transcoding support for CUPS.
ef416fc2 5 *
71e16022 6 * Copyright 2007-2010 by Apple Inc.
b86bc4cf 7 * Copyright 1997-2007 by Easy Software Products.
ef416fc2 8 *
bc44d920 9 * These coded instructions, statements, and computer programs are the
10 * property of Apple Inc. and are protected by Federal copyright
11 * law. Distribution and use rights are outlined in the file "LICENSE.txt"
12 * which should have been included with this file. If this file is
13 * missing or damaged, see the license at "http://www.cups.org/".
ef416fc2 14 *
bc44d920 15 * This file is subject to the Apple OS-Developed Software exception.
ef416fc2 16 *
17 * Contents:
18 *
fa73b229 19 * _cupsCharmapFlush() - Flush all character set maps out of cache.
e1d6a774 20 * _cupsCharmapFree() - Free a character set map.
21 * _cupsCharmapGet() - Get a character set map.
ef416fc2 22 * cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
e1d6a774 23 * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
ef416fc2 24 * cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
25 * cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
e1d6a774 26 * compare_wide() - Compare key for wide (VBCS) match.
27 * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
ef416fc2 28 * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
29 * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
ef416fc2 30 * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
e1d6a774 31 * free_sbcs_charmap() - Free memory used by a single byte character set.
32 * free_vbcs_charmap() - Free memory used by a variable byte character set.
d6ae789d 33 * get_charmap() - Lookup or get a character set map (private).
e1d6a774 34 * get_charmap_count() - Count lines in a charmap file.
35 * get_sbcs_charmap() - Get SBCS Charmap.
36 * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
ef416fc2 37 */
38
39/*
40 * Include necessary headers...
41 */
42
71e16022 43#include "cups-private.h"
e53920b9 44#include <limits.h>
ef416fc2 45#include <time.h>
46
47
d6ae789d 48/*
49 * Local globals...
50 */
51
6d2f911b 52static _cups_mutex_t map_mutex = _CUPS_MUTEX_INITIALIZER;
d6ae789d 53 /* Mutex to control access to maps */
d6ae789d 54static _cups_cmap_t *cmap_cache = NULL;
55 /* SBCS Charmap Cache */
56static _cups_vmap_t *vmap_cache = NULL;
57 /* VBCS Charmap Cache */
58
59
ef416fc2 60/*
e1d6a774 61 * Local functions...
ef416fc2 62 */
63
e1d6a774 64static int compare_wide(const void *k1, const void *k2);
65static int conv_sbcs_to_utf8(cups_utf8_t *dest,
66 const cups_sbcs_t *src,
67 int maxout,
68 const cups_encoding_t encoding);
69static int conv_utf8_to_sbcs(cups_sbcs_t *dest,
70 const cups_utf8_t *src,
71 int maxout,
72 const cups_encoding_t encoding);
73static int conv_utf8_to_vbcs(cups_sbcs_t *dest,
74 const cups_utf8_t *src,
75 int maxout,
76 const cups_encoding_t encoding);
77static int conv_vbcs_to_utf8(cups_utf8_t *dest,
78 const cups_sbcs_t *src,
79 int maxout,
80 const cups_encoding_t encoding);
81static void free_sbcs_charmap(_cups_cmap_t *sbcs);
82static void free_vbcs_charmap(_cups_vmap_t *vbcs);
d6ae789d 83static void *get_charmap(const cups_encoding_t encoding);
e1d6a774 84static int get_charmap_count(cups_file_t *fp);
85static _cups_cmap_t *get_sbcs_charmap(const cups_encoding_t encoding,
86 const char *filename);
87static _cups_vmap_t *get_vbcs_charmap(const cups_encoding_t encoding,
88 const char *filename);
89
ef416fc2 90
91/*
e1d6a774 92 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
ef416fc2 93 */
94
e1d6a774 95void
d6ae789d 96_cupsCharmapFlush(void)
ef416fc2 97{
e1d6a774 98 _cups_cmap_t *cmap, /* Legacy SBCS / Unicode Charset Map */
99 *cnext; /* Next Legacy SBCS Charset Map */
100 _cups_vmap_t *vmap, /* Legacy VBCS / Unicode Charset Map */
101 *vnext; /* Next Legacy VBCS Charset Map */
ef416fc2 102
103
6d2f911b 104 _cupsMutexLock(&map_mutex);
d6ae789d 105
ef416fc2 106 /*
e1d6a774 107 * Loop through SBCS charset map cache, free all memory...
ef416fc2 108 */
109
d6ae789d 110 for (cmap = cmap_cache; cmap; cmap = cnext)
e1d6a774 111 {
112 cnext = cmap->next;
ef416fc2 113
e1d6a774 114 free_sbcs_charmap(cmap);
115 }
ef416fc2 116
d6ae789d 117 cmap_cache = NULL;
ef416fc2 118
119 /*
e1d6a774 120 * Loop through DBCS/VBCS charset map cache, free all memory...
ef416fc2 121 */
122
d6ae789d 123 for (vmap = vmap_cache; vmap; vmap = vnext)
e1d6a774 124 {
125 vnext = vmap->next;
126
127 free_vbcs_charmap(vmap);
e1d6a774 128 }
129
d6ae789d 130 vmap_cache = NULL;
131
6d2f911b 132 _cupsMutexUnlock(&map_mutex);
ef416fc2 133}
134
e1d6a774 135
ef416fc2 136/*
e1d6a774 137 * '_cupsCharmapFree()' - Free a character set map.
ef416fc2 138 *
e1d6a774 139 * This does not actually free; use '_cupsCharmapFlush()' for that.
ef416fc2 140 */
e1d6a774 141
ef416fc2 142void
e1d6a774 143_cupsCharmapFree(
144 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 145{
e1d6a774 146 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
147 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
e1d6a774 148
ef416fc2 149
150 /*
151 * See if we already have this SBCS charset map loaded...
152 */
e1d6a774 153
6d2f911b 154 _cupsMutexLock(&map_mutex);
d6ae789d 155
156 for (cmap = cmap_cache; cmap; cmap = cmap->next)
ef416fc2 157 {
158 if (cmap->encoding == encoding)
159 {
160 if (cmap->used > 0)
161 cmap->used --;
d6ae789d 162 break;
ef416fc2 163 }
164 }
165
166 /*
167 * See if we already have this DBCS/VBCS charset map loaded...
168 */
e1d6a774 169
d6ae789d 170 for (vmap = vmap_cache; vmap; vmap = vmap->next)
ef416fc2 171 {
172 if (vmap->encoding == encoding)
173 {
174 if (vmap->used > 0)
175 vmap->used --;
d6ae789d 176 break;
ef416fc2 177 }
178 }
d6ae789d 179
6d2f911b 180 _cupsMutexUnlock(&map_mutex);
fa73b229 181}
182
183
184/*
e1d6a774 185 * '_cupsCharmapGet()' - Get a character set map.
186 *
187 * This code handles single-byte (SBCS), double-byte (DBCS), and
188 * variable-byte (VBCS) character sets _without_ charset escapes...
189 * This code does not handle multiple-byte character sets (MBCS)
190 * (such as ISO-2022-JP) with charset switching via escapes...
fa73b229 191 */
192
e1d6a774 193void * /* O - Charset map pointer */
194_cupsCharmapGet(
195 const cups_encoding_t encoding) /* I - Encoding */
fa73b229 196{
d6ae789d 197 void *charmap; /* Charset map pointer */
e1d6a774 198
fa73b229 199
e07d4801 200 DEBUG_printf(("7_cupsCharmapGet(encoding=%d)", encoding));
ef416fc2 201
202 /*
e1d6a774 203 * Check for valid arguments...
ef416fc2 204 */
e1d6a774 205
206 if (encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
ef416fc2 207 {
e07d4801 208 DEBUG_puts("8_cupsCharmapGet: Bad encoding, returning NULL!");
e1d6a774 209 return (NULL);
ef416fc2 210 }
ef416fc2 211
212 /*
d6ae789d 213 * Lookup or get the charset map pointer and return...
ef416fc2 214 */
e1d6a774 215
6d2f911b 216 _cupsMutexLock(&map_mutex);
e1d6a774 217
d6ae789d 218 charmap = get_charmap(encoding);
e1d6a774 219
6d2f911b 220 _cupsMutexUnlock(&map_mutex);
e1d6a774 221
d6ae789d 222 return (charmap);
ef416fc2 223}
224
e1d6a774 225
ef416fc2 226/*
e1d6a774 227 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
ef416fc2 228 *
229 * This code handles single-byte (SBCS), double-byte (DBCS), and
230 * variable-byte (VBCS) character sets _without_ charset escapes...
231 * This code does not handle multiple-byte character sets (MBCS)
232 * (such as ISO-2022-JP) with charset switching via escapes...
233 */
e1d6a774 234
235int /* O - Count or -1 on error */
236cupsCharsetToUTF8(
237 cups_utf8_t *dest, /* O - Target string */
238 const char *src, /* I - Source string */
239 const int maxout, /* I - Max output */
240 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 241{
d6ae789d 242 int bytes; /* Number of bytes converted */
243
244
ef416fc2 245 /*
246 * Check for valid arguments...
247 */
248
f11a948a 249 DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
e1d6a774 250 dest, src, maxout, encoding));
251
252 if (dest)
253 *dest = '\0';
254
ef416fc2 255 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
e1d6a774 256 {
f11a948a 257 DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
ef416fc2 258 return (-1);
e1d6a774 259 }
ef416fc2 260
261 /*
262 * Handle identity conversions...
263 */
264
265 if (encoding == CUPS_UTF8 ||
266 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
267 {
e1d6a774 268 strlcpy((char *)dest, src, maxout);
b86bc4cf 269 return ((int)strlen((char *)dest));
ef416fc2 270 }
271
411affcf 272 /*
273 * Handle ISO-8859-1 to UTF-8 directly...
274 */
275
276 if (encoding == CUPS_ISO8859_1)
277 {
278 int ch; /* Character from string */
279 cups_utf8_t *destptr, /* Pointer into UTF-8 buffer */
280 *destend; /* End of UTF-8 buffer */
281
282
283 destptr = dest;
284 destend = dest + maxout - 2;
285
286 while (*src && destptr < destend)
287 {
288 ch = *src++ & 255;
289
290 if (ch & 128)
291 {
292 *destptr++ = 0xc0 | (ch >> 6);
293 *destptr++ = 0x80 | (ch & 0x3f);
294 }
295 else
296 *destptr++ = ch;
297 }
298
299 *destptr = '\0';
300
b86bc4cf 301 return ((int)(destptr - dest));
411affcf 302 }
303
ef416fc2 304 /*
e1d6a774 305 * Convert input legacy charset to UTF-8...
ef416fc2 306 */
e1d6a774 307
6d2f911b 308 _cupsMutexLock(&map_mutex);
d6ae789d 309
ef416fc2 310 if (encoding < CUPS_ENCODING_SBCS_END)
d6ae789d 311 bytes = conv_sbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
ef416fc2 312 else
91c84a35 313 bytes = conv_vbcs_to_utf8(dest, (cups_sbcs_t *)src, maxout, encoding);
d6ae789d 314
6d2f911b 315 _cupsMutexUnlock(&map_mutex);
d6ae789d 316
317 return (bytes);
ef416fc2 318}
319
e1d6a774 320
ef416fc2 321/*
e1d6a774 322 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
ef416fc2 323 *
324 * This code handles single-byte (SBCS), double-byte (DBCS), and
325 * variable-byte (VBCS) character sets _without_ charset escapes...
326 * This code does not handle multiple-byte character sets (MBCS)
327 * (such as ISO-2022-JP) with charset switching via escapes...
328 */
e1d6a774 329
330int /* O - Count or -1 on error */
331cupsUTF8ToCharset(
332 char *dest, /* O - Target string */
333 const cups_utf8_t *src, /* I - Source string */
334 const int maxout, /* I - Max output */
335 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 336{
d6ae789d 337 int bytes; /* Number of bytes converted */
338
339
ef416fc2 340 /*
341 * Check for valid arguments...
342 */
343
344 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
e1d6a774 345 {
346 if (dest)
347 *dest = '\0';
348
ef416fc2 349 return (-1);
e1d6a774 350 }
ef416fc2 351
352 /*
353 * Handle identity conversions...
354 */
355
356 if (encoding == CUPS_UTF8 ||
357 encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
358 {
e1d6a774 359 strlcpy(dest, (char *)src, maxout);
b86bc4cf 360 return ((int)strlen(dest));
ef416fc2 361 }
362
411affcf 363 /*
364 * Handle UTF-8 to ISO-8859-1 directly...
365 */
366
367 if (encoding == CUPS_ISO8859_1)
368 {
369 int ch; /* Character from string */
370 char *destptr, /* Pointer into ISO-8859-1 buffer */
371 *destend; /* End of ISO-8859-1 buffer */
372
373
374 destptr = dest;
375 destend = dest + maxout - 1;
376
377 while (*src && destptr < destend)
378 {
379 ch = *src++;
380
381 if ((ch & 0xe0) == 0xc0)
382 {
383 ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
384
385 if (ch < 256)
386 *destptr++ = ch;
387 else
388 *destptr++ = '?';
389 }
390 else if ((ch & 0xf0) == 0xe0 ||
391 (ch & 0xf8) == 0xf0)
392 *destptr++ = '?';
393 else if (!(ch & 0x80))
394 *destptr++ = ch;
395 }
396
397 *destptr = '\0';
398
b86bc4cf 399 return ((int)(destptr - dest));
411affcf 400 }
401
ef416fc2 402 /*
e1d6a774 403 * Convert input UTF-8 to legacy charset...
ef416fc2 404 */
e1d6a774 405
6d2f911b 406 _cupsMutexLock(&map_mutex);
d6ae789d 407
ef416fc2 408 if (encoding < CUPS_ENCODING_SBCS_END)
d6ae789d 409 bytes = conv_utf8_to_sbcs((cups_sbcs_t *)dest, src, maxout, encoding);
ef416fc2 410 else
91c84a35 411 bytes = conv_utf8_to_vbcs((cups_sbcs_t *)dest, src, maxout, encoding);
d6ae789d 412
6d2f911b 413 _cupsMutexUnlock(&map_mutex);
d6ae789d 414
415 return (bytes);
ef416fc2 416}
417
ef416fc2 418
419/*
420 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
421 *
422 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
423 *
424 * UTF-32 char UTF-8 char(s)
425 * --------------------------------------------------
e1d6a774 426 * 0 to 127 = 0xxxxxxx (US-ASCII)
ef416fc2 427 * 128 to 2047 = 110xxxxx 10yyyyyy
428 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
e1d6a774 429 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
ef416fc2 430 *
431 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
432 * which would convert to five- or six-octet UTF-8 sequences...
ef416fc2 433 */
e1d6a774 434
435int /* O - Count or -1 on error */
436cupsUTF8ToUTF32(
437 cups_utf32_t *dest, /* O - Target string */
438 const cups_utf8_t *src, /* I - Source string */
439 const int maxout) /* I - Max output */
ef416fc2 440{
e1d6a774 441 int i; /* Looping variable */
442 cups_utf8_t ch; /* Character value */
443 cups_utf8_t next; /* Next character value */
444 cups_utf32_t ch32; /* UTF-32 character value */
445
ef416fc2 446
447 /*
448 * Check for valid arguments and clear output...
449 */
e1d6a774 450
e07d4801
MS
451 DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
452 src, maxout));
c9fc04c6 453
e1d6a774 454 if (dest)
455 *dest = 0;
456
457 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
c9fc04c6 458 {
e07d4801 459 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
c9fc04c6 460
ef416fc2 461 return (-1);
c9fc04c6 462 }
ef416fc2 463
464 /*
cda47a96 465 * Convert input UTF-8 to output UTF-32...
ef416fc2 466 */
e1d6a774 467
e1d6a774 468 for (i = maxout - 1; *src && i > 0; i --)
ef416fc2 469 {
e1d6a774 470 ch = *src++;
ef416fc2 471
472 /*
473 * Convert UTF-8 character(s) to UTF-32 character...
474 */
e1d6a774 475
476 if (!(ch & 0x80))
ef416fc2 477 {
478 /*
479 * One-octet UTF-8 <= 127 (US-ASCII)...
480 */
e1d6a774 481
482 *dest++ = ch;
c9fc04c6 483
e07d4801 484 DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
2abf387c 485 continue;
ef416fc2 486 }
487 else if ((ch & 0xe0) == 0xc0)
488 {
489 /*
490 * Two-octet UTF-8 <= 2047 (Latin-x)...
491 */
e1d6a774 492
493 next = *src++;
c9fc04c6
MS
494 if ((next & 0xc0) != 0x80)
495 {
e07d4801 496 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 497
ef416fc2 498 return (-1);
c9fc04c6 499 }
e1d6a774 500
ef416fc2 501 ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
502
503 /*
504 * Check for non-shortest form (invalid UTF-8)...
505 */
e1d6a774 506
507 if (ch32 < 0x80)
c9fc04c6 508 {
e07d4801 509 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 510
ef416fc2 511 return (-1);
c9fc04c6 512 }
e1d6a774 513
514 *dest++ = ch32;
c9fc04c6 515
e07d4801 516 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
c9fc04c6 517 src[-2], src[-1], (unsigned)ch32));
ef416fc2 518 }
519 else if ((ch & 0xf0) == 0xe0)
520 {
521 /*
522 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
523 */
e1d6a774 524
525 next = *src++;
c9fc04c6
MS
526 if ((next & 0xc0) != 0x80)
527 {
e07d4801 528 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 529
ef416fc2 530 return (-1);
c9fc04c6 531 }
e1d6a774 532
533 ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
534
535 next = *src++;
c9fc04c6
MS
536 if ((next & 0xc0) != 0x80)
537 {
e07d4801 538 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 539
ef416fc2 540 return (-1);
c9fc04c6 541 }
e1d6a774 542
543 ch32 = (ch32 << 6) | (next & 0x3f);
ef416fc2 544
545 /*
546 * Check for non-shortest form (invalid UTF-8)...
547 */
e1d6a774 548
549 if (ch32 < 0x800)
c9fc04c6 550 {
e07d4801 551 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 552
ef416fc2 553 return (-1);
c9fc04c6 554 }
e1d6a774 555
556 *dest++ = ch32;
c9fc04c6 557
e07d4801 558 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
c9fc04c6 559 src[-3], src[-2], src[-1], (unsigned)ch32));
ef416fc2 560 }
561 else if ((ch & 0xf8) == 0xf0)
562 {
563 /*
e1d6a774 564 * Four-octet UTF-8...
ef416fc2 565 */
e1d6a774 566
567 next = *src++;
c9fc04c6
MS
568 if ((next & 0xc0) != 0x80)
569 {
e07d4801 570 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 571
ef416fc2 572 return (-1);
c9fc04c6 573 }
e1d6a774 574
575 ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
576
577 next = *src++;
c9fc04c6
MS
578 if ((next & 0xc0) != 0x80)
579 {
e07d4801 580 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 581
e1d6a774 582 return (-1);
c9fc04c6 583 }
e1d6a774 584
585 ch32 = (ch32 << 6) | (next & 0x3f);
586
587 next = *src++;
c9fc04c6
MS
588 if ((next & 0xc0) != 0x80)
589 {
e07d4801 590 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 591
e1d6a774 592 return (-1);
c9fc04c6 593 }
e1d6a774 594
595 ch32 = (ch32 << 6) | (next & 0x3f);
596
ef416fc2 597 /*
e1d6a774 598 * Check for non-shortest form (invalid UTF-8)...
ef416fc2 599 */
e1d6a774 600
601 if (ch32 < 0x10000)
c9fc04c6 602 {
e07d4801 603 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 604
e1d6a774 605 return (-1);
c9fc04c6 606 }
e1d6a774 607
608 *dest++ = ch32;
c9fc04c6 609
e07d4801 610 DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
c9fc04c6 611 src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
ef416fc2 612 }
613 else
614 {
615 /*
e1d6a774 616 * More than 4-octet (invalid UTF-8 sequence)...
ef416fc2 617 */
e1d6a774 618
e07d4801 619 DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
c9fc04c6 620
ef416fc2 621 return (-1);
622 }
623
624 /*
625 * Check for UTF-16 surrogate (illegal UTF-8)...
626 */
ef416fc2 627
2abf387c 628 if (ch32 >= 0xd800 && ch32 <= 0xdfff)
ef416fc2 629 return (-1);
630 }
e1d6a774 631
ef416fc2 632 *dest = 0;
e1d6a774 633
e07d4801 634 DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
c9fc04c6
MS
635
636 return (maxout - 1 - i);
ef416fc2 637}
638
e1d6a774 639
ef416fc2 640/*
641 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
642 *
643 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
644 *
645 * UTF-32 char UTF-8 char(s)
646 * --------------------------------------------------
e1d6a774 647 * 0 to 127 = 0xxxxxxx (US-ASCII)
ef416fc2 648 * 128 to 2047 = 110xxxxx 10yyyyyy
649 * 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
e1d6a774 650 * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
ef416fc2 651 *
652 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
653 * which would convert to five- or six-octet UTF-8 sequences...
ef416fc2 654 */
e1d6a774 655
656int /* O - Count or -1 on error */
657cupsUTF32ToUTF8(
658 cups_utf8_t *dest, /* O - Target string */
659 const cups_utf32_t *src, /* I - Source string */
660 const int maxout) /* I - Max output */
ef416fc2 661{
e1d6a774 662 cups_utf8_t *start; /* Start of destination string */
663 int i; /* Looping variable */
664 int swap; /* Byte-swap input to output */
665 cups_utf32_t ch; /* Character value */
666
ef416fc2 667
668 /*
669 * Check for valid arguments and clear output...
670 */
e1d6a774 671
e07d4801 672 DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
c9fc04c6
MS
673 maxout));
674
e1d6a774 675 if (dest)
676 *dest = '\0';
677
678 if (!dest || !src || maxout < 1)
c9fc04c6 679 {
e07d4801 680 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
c9fc04c6 681
ef416fc2 682 return (-1);
c9fc04c6 683 }
ef416fc2 684
685 /*
686 * Check for leading BOM in UTF-32 and inverted BOM...
687 */
e1d6a774 688
689 start = dest;
690 swap = *src == 0xfffe0000;
691
e07d4801 692 DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
c9fc04c6 693
e1d6a774 694 if (*src == 0xfffe0000 || *src == 0xfeff)
695 src ++;
ef416fc2 696
697 /*
698 * Convert input UTF-32 to output UTF-8...
699 */
e1d6a774 700
701 for (i = maxout - 1; *src && i > 0;)
ef416fc2 702 {
e1d6a774 703 ch = *src++;
ef416fc2 704
705 /*
706 * Byte swap input UTF-32, if necessary...
e1d6a774 707 * (only byte-swapping 24 of 32 bits)
ef416fc2 708 */
e1d6a774 709
ef416fc2 710 if (swap)
711 ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
712
713 /*
e1d6a774 714 * Check for beyond Plane 16 (invalid UTF-32)...
ef416fc2 715 */
ef416fc2 716
ef416fc2 717 if (ch > 0x10ffff)
c9fc04c6 718 {
e07d4801 719 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
c9fc04c6 720
ef416fc2 721 return (-1);
c9fc04c6 722 }
ef416fc2 723
ef416fc2 724 /*
725 * Convert UTF-32 character to UTF-8 character(s)...
726 */
e1d6a774 727
728 if (ch < 0x80)
ef416fc2 729 {
730 /*
731 * One-octet UTF-8 <= 127 (US-ASCII)...
732 */
e1d6a774 733
734 *dest++ = (cups_utf8_t)ch;
735 i --;
c9fc04c6 736
e07d4801 737 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
ef416fc2 738 }
e1d6a774 739 else if (ch < 0x800)
ef416fc2 740 {
741 /*
742 * Two-octet UTF-8 <= 2047 (Latin-x)...
743 */
e1d6a774 744
745 if (i < 2)
c9fc04c6 746 {
e07d4801 747 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
c9fc04c6 748
e1d6a774 749 return (-1);
c9fc04c6 750 }
e1d6a774 751
752 *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
753 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
754 i -= 2;
c9fc04c6 755
e07d4801 756 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
c9fc04c6 757 dest[-2], dest[-1]));
ef416fc2 758 }
e1d6a774 759 else if (ch < 0x10000)
ef416fc2 760 {
761 /*
762 * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
763 */
e1d6a774 764
765 if (i < 3)
c9fc04c6 766 {
e07d4801 767 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
c9fc04c6 768
e1d6a774 769 return (-1);
c9fc04c6 770 }
e1d6a774 771
772 *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
773 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
774 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
775 i -= 3;
c9fc04c6 776
e07d4801 777 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
c9fc04c6 778 dest[-3], dest[-2], dest[-1]));
e1d6a774 779 }
780 else
781 {
782 /*
783 * Four-octet UTF-8...
784 */
785
786 if (i < 4)
e07d4801
MS
787 {
788 DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
789
e1d6a774 790 return (-1);
e07d4801 791 }
e1d6a774 792
793 *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
794 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
795 *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
796 *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
797 i -= 4;
c9fc04c6 798
e07d4801 799 DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
c9fc04c6 800 (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
ef416fc2 801 }
802 }
e1d6a774 803
ef416fc2 804 *dest = '\0';
e1d6a774 805
e07d4801 806 DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
c9fc04c6 807
e1d6a774 808 return ((int)(dest - start));
ef416fc2 809}
810
e1d6a774 811
ef416fc2 812/*
e1d6a774 813 * 'compare_wide()' - Compare key for wide (VBCS) match.
814 */
815
816static int
817compare_wide(const void *k1, /* I - Key char */
818 const void *k2) /* I - Map char */
819{
820 cups_vbcs_t key; /* Legacy key character */
821 cups_vbcs_t map; /* Legacy map character */
822
823
824 key = *((cups_vbcs_t *)k1);
825 map = ((_cups_wide2uni_t *)k2)->widechar;
826
827 return ((int)(key - map));
828}
829
830
831/*
832 * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
ef416fc2 833 */
e1d6a774 834
835static int /* O - Count or -1 on error */
836conv_sbcs_to_utf8(
837 cups_utf8_t *dest, /* O - Target string */
838 const cups_sbcs_t *src, /* I - Source string */
839 int maxout, /* I - Max output */
840 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 841{
e1d6a774 842 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
843 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
844 cups_sbcs_t legchar; /* Legacy character value */
845 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
846 *workptr; /* Pointer into string */
847
ef416fc2 848
849 /*
e1d6a774 850 * Find legacy charset map in cache...
ef416fc2 851 */
e1d6a774 852
d6ae789d 853 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
ef416fc2 854 return (-1);
ef416fc2 855
856 /*
e1d6a774 857 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
ef416fc2 858 */
ef416fc2 859
e1d6a774 860 work[0] = 0xfeff;
861 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
ef416fc2 862 {
e1d6a774 863 legchar = *src++;
ef416fc2 864
865 /*
e1d6a774 866 * Convert ASCII verbatim (optimization)...
ef416fc2 867 */
ef416fc2 868
e1d6a774 869 if (legchar < 0x80)
870 *workptr++ = (cups_utf32_t)legchar;
871 else
ef416fc2 872 {
e1d6a774 873 /*
874 * Convert unknown character to Replacement Character...
875 */
ef416fc2 876
e1d6a774 877 crow = cmap->char2uni + legchar;
878
879 if (!*crow)
880 *workptr++ = 0xfffd;
881 else
882 *workptr++ = (cups_utf32_t)*crow;
ef416fc2 883 }
ef416fc2 884 }
e1d6a774 885
886 *workptr = 0;
887
888 /*
889 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
890 */
891
d6ae789d 892 cmap->used --;
e1d6a774 893
894 return (cupsUTF32ToUTF8(dest, work, maxout));
ef416fc2 895}
896
e1d6a774 897
ef416fc2 898/*
e1d6a774 899 * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
ef416fc2 900 */
e1d6a774 901
902static int /* O - Count or -1 on error */
903conv_utf8_to_sbcs(
904 cups_sbcs_t *dest, /* O - Target string */
905 const cups_utf8_t *src, /* I - Source string */
906 int maxout, /* I - Max output */
907 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 908{
e1d6a774 909 cups_sbcs_t *start; /* Start of destination string */
910 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
911 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
912 cups_utf32_t unichar; /* Character value */
913 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
914 *workptr; /* Pointer into string */
915
ef416fc2 916
917 /*
e1d6a774 918 * Find legacy charset map in cache...
ef416fc2 919 */
e1d6a774 920
d6ae789d 921 if ((cmap = (_cups_cmap_t *)get_charmap(encoding)) == NULL)
ef416fc2 922 return (-1);
ef416fc2 923
924 /*
e1d6a774 925 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
ef416fc2 926 */
e1d6a774 927
928 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
929 return (-1);
ef416fc2 930
931 /*
e1d6a774 932 * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
ef416fc2 933 */
e1d6a774 934
58dc1933 935 for (workptr = work, start = dest; *workptr && maxout > 0; maxout --)
ef416fc2 936 {
e1d6a774 937 unichar = *workptr++;
938 if (!unichar)
ef416fc2 939 break;
ef416fc2 940
941 /*
e1d6a774 942 * Convert ASCII verbatim (optimization)...
ef416fc2 943 */
ef416fc2 944
e1d6a774 945 if (unichar < 0x80)
946 {
947 *dest++ = (cups_sbcs_t)unichar;
948 continue;
949 }
ef416fc2 950
951 /*
e1d6a774 952 * Convert unknown character to visible replacement...
ef416fc2 953 */
ef416fc2 954
e1d6a774 955 srow = cmap->uni2char[(int)((unichar >> 8) & 0xff)];
ef416fc2 956
e1d6a774 957 if (srow)
958 srow += (int)(unichar & 0xff);
ef416fc2 959
e1d6a774 960 if (!srow || !*srow)
961 *dest++ = '?';
962 else
963 *dest++ = *srow;
ef416fc2 964 }
ef416fc2 965
e1d6a774 966 *dest = '\0';
967
d6ae789d 968 cmap->used --;
e1d6a774 969
970 return ((int)(dest - start));
ef416fc2 971}
972
e1d6a774 973
ef416fc2 974/*
e1d6a774 975 * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
ef416fc2 976 */
e1d6a774 977
978static int /* O - Count or -1 on error */
979conv_utf8_to_vbcs(
980 cups_sbcs_t *dest, /* O - Target string */
981 const cups_utf8_t *src, /* I - Source string */
982 int maxout, /* I - Max output */
983 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 984{
e1d6a774 985 cups_sbcs_t *start; /* Start of destination string */
986 _cups_vmap_t *vmap; /* Legacy DBCS / Unicode Charset Map */
987 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
988 cups_utf32_t unichar; /* Character value */
989 cups_vbcs_t legchar; /* Legacy character value */
990 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
991 *workptr; /* Pointer into string */
ef416fc2 992
ef416fc2 993
e07d4801
MS
994 DEBUG_printf(("7conv_utf8_to_vbcs(dest=%p, src=\"%s\", maxout=%d, "
995 "encoding=%d)", dest, src, maxout, encoding));
c9fc04c6 996
ef416fc2 997 /*
e1d6a774 998 * Find legacy charset map in cache...
ef416fc2 999 */
ef416fc2 1000
d6ae789d 1001 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
c9fc04c6 1002 {
e07d4801 1003 DEBUG_puts("8conv_utf8_to_vbcs: Returning -1 (no charmap)");
c9fc04c6 1004
e1d6a774 1005 return (-1);
c9fc04c6 1006 }
ef416fc2 1007
1008 /*
e1d6a774 1009 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
ef416fc2 1010 */
e1d6a774 1011
1012 if (cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING) < 0)
c9fc04c6 1013 {
e07d4801 1014 DEBUG_puts("8conv_utf8_to_vbcs: Returning -1 (Unable to convert to UTF-32)");
c9fc04c6 1015
e1d6a774 1016 return (-1);
c9fc04c6 1017 }
ef416fc2 1018
1019 /*
e1d6a774 1020 * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
ef416fc2 1021 */
e1d6a774 1022
58dc1933 1023 for (start = dest, workptr = work; *workptr && maxout > 0; maxout --)
ef416fc2 1024 {
e1d6a774 1025 unichar = *workptr++;
ef416fc2 1026
1027 /*
e1d6a774 1028 * Convert ASCII verbatim (optimization)...
ef416fc2 1029 */
e1d6a774 1030
1031 if (unichar < 0x80)
1032 {
b86bc4cf 1033 *dest++ = (cups_sbcs_t)unichar;
c9fc04c6 1034
e07d4801 1035 DEBUG_printf(("9conv_utf8_to_vbcs: %08x => %02X", (unsigned)unichar,
c9fc04c6
MS
1036 dest[-1]));
1037
e1d6a774 1038 continue;
1039 }
ef416fc2 1040
1041 /*
e1d6a774 1042 * Convert unknown character to visible replacement...
ef416fc2 1043 */
e1d6a774 1044
1045 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
1046
1047 if (vrow)
1048 vrow += (int)(unichar & 0xff);
1049
1050 if (!vrow || !*vrow)
1051 legchar = (cups_vbcs_t)'?';
1052 else
1053 legchar = (cups_vbcs_t)*vrow;
ef416fc2 1054
1055 /*
e1d6a774 1056 * Save n-byte legacy character...
ef416fc2 1057 */
e1d6a774 1058
1059 if (legchar > 0xffffff)
ef416fc2 1060 {
e1d6a774 1061 if (maxout < 5)
c9fc04c6 1062 {
e07d4801 1063 DEBUG_puts("8conv_utf8_to_vbcs: Returning -1 (out of space)");
c9fc04c6 1064
e1d6a774 1065 return (-1);
c9fc04c6 1066 }
e1d6a774 1067
1068 *dest++ = (cups_sbcs_t)(legchar >> 24);
1069 *dest++ = (cups_sbcs_t)(legchar >> 16);
1070 *dest++ = (cups_sbcs_t)(legchar >> 8);
1071 *dest++ = (cups_sbcs_t)legchar;
1072
1073 maxout -= 3;
c9fc04c6 1074
e07d4801 1075 DEBUG_printf(("9conv_utf8_to_vbcs: %08x => %02X %02X %02X %02X",
c9fc04c6 1076 (unsigned)unichar, dest[-4], dest[-3], dest[-2], dest[-1]));
ef416fc2 1077 }
e1d6a774 1078 else if (legchar > 0xffff)
1079 {
1080 if (maxout < 4)
c9fc04c6 1081 {
e07d4801 1082 DEBUG_puts("8conv_utf8_to_vbcs: Returning -1 (out of space)");
c9fc04c6 1083
e1d6a774 1084 return (-1);
c9fc04c6 1085 }
ef416fc2 1086
e1d6a774 1087 *dest++ = (cups_sbcs_t)(legchar >> 16);
1088 *dest++ = (cups_sbcs_t)(legchar >> 8);
1089 *dest++ = (cups_sbcs_t)legchar;
ef416fc2 1090
e1d6a774 1091 maxout -= 2;
c9fc04c6 1092
e07d4801 1093 DEBUG_printf(("9conv_utf8_to_vbcs: %08x => %02X %02X %02X",
c9fc04c6 1094 (unsigned)unichar, dest[-3], dest[-2], dest[-1]));
e1d6a774 1095 }
1096 else if (legchar > 0xff)
1097 {
1098 *dest++ = (cups_sbcs_t)(legchar >> 8);
1099 *dest++ = (cups_sbcs_t)legchar;
1100
1101 maxout --;
c9fc04c6 1102
e07d4801 1103 DEBUG_printf(("9conv_utf8_to_vbcs: %08x => %02X %02X",
c9fc04c6
MS
1104 (unsigned)unichar, dest[-2], dest[-1]));
1105 }
1106 else
1107 {
536bc2c6 1108 *dest++ = (cups_sbcs_t)legchar;
c9fc04c6 1109
e07d4801 1110 DEBUG_printf(("9conv_utf8_to_vbcs: %08x => %02X",
c9fc04c6 1111 (unsigned)unichar, dest[-1]));
e1d6a774 1112 }
ef416fc2 1113 }
e1d6a774 1114
1115 *dest = '\0';
1116
d6ae789d 1117 vmap->used --;
e1d6a774 1118
e07d4801 1119 DEBUG_printf(("8conv_utf8_to_vbcs: Returning %d characters",
c9fc04c6
MS
1120 (int)(dest - start)));
1121
e1d6a774 1122 return ((int)(dest - start));
ef416fc2 1123}
1124
e1d6a774 1125
ef416fc2 1126/*
e1d6a774 1127 * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
ef416fc2 1128 */
e1d6a774 1129
1130static int /* O - Count or -1 on error */
1131conv_vbcs_to_utf8(
1132 cups_utf8_t *dest, /* O - Target string */
1133 const cups_sbcs_t *src, /* I - Source string */
1134 int maxout, /* I - Max output */
1135 const cups_encoding_t encoding) /* I - Encoding */
ef416fc2 1136{
e1d6a774 1137 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
1138 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1139 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
1140 cups_sbcs_t leadchar; /* Lead char of n-byte legacy char */
1141 cups_vbcs_t legchar; /* Legacy character value */
1142 cups_utf32_t work[CUPS_MAX_USTRING], /* Internal UCS-4 string */
1143 *workptr; /* Pointer into string */
ef416fc2 1144
ef416fc2 1145
1146 /*
e1d6a774 1147 * Find legacy charset map in cache...
ef416fc2 1148 */
ef416fc2 1149
e07d4801 1150 DEBUG_printf(("7conv_vbcs_to_utf8(dest=%p, src=%p, maxout=%d, encoding=%d)",
c9fc04c6
MS
1151 dest, src, maxout, encoding));
1152
d6ae789d 1153 if ((vmap = (_cups_vmap_t *)get_charmap(encoding)) == NULL)
c9fc04c6 1154 {
e07d4801 1155 DEBUG_puts("8conv_vbcs_to_utf8: Returning -1 (NULL vmap)");
c9fc04c6 1156
e1d6a774 1157 return (-1);
c9fc04c6 1158 }
ef416fc2 1159
1160 /*
e1d6a774 1161 * Convert input legacy charset to internal UCS-4 (and insert BOM)...
ef416fc2 1162 */
ef416fc2 1163
e1d6a774 1164 work[0] = 0xfeff;
1165 for (workptr = work + 1; *src && workptr < (work + CUPS_MAX_USTRING - 1);)
ef416fc2 1166 {
e1d6a774 1167 legchar = *src++;
1168 leadchar = (cups_sbcs_t)legchar;
ef416fc2 1169
1170 /*
e1d6a774 1171 * Convert ASCII verbatim (optimization)...
ef416fc2 1172 */
ef416fc2 1173
e1d6a774 1174 if (legchar < 0x80)
ef416fc2 1175 {
e1d6a774 1176 *workptr++ = (cups_utf32_t)legchar;
c9fc04c6 1177
e07d4801 1178 DEBUG_printf(("9conv_vbcs_to_utf8: %02X => %08X", src[-1],
c9fc04c6 1179 (unsigned)legchar));
e1d6a774 1180 continue;
ef416fc2 1181 }
1182
1183 /*
e1d6a774 1184 * Convert 2-byte legacy character...
ef416fc2 1185 */
e1d6a774 1186
1187 if (vmap->lead2char[(int)leadchar] == leadchar)
ef416fc2 1188 {
e1d6a774 1189 if (!*src)
c9fc04c6 1190 {
e07d4801 1191 DEBUG_puts("8conv_vbcs_to_utf8: Returning -1 (short string)");
c9fc04c6 1192
e1d6a774 1193 return (-1);
c9fc04c6 1194 }
e1d6a774 1195
1196 legchar = (legchar << 8) | *src++;
1197
ef416fc2 1198 /*
e1d6a774 1199 * Convert unknown character to Replacement Character...
ef416fc2 1200 */
e1d6a774 1201
1202 crow = vmap->char2uni[(int)((legchar >> 8) & 0xff)];
1203 if (crow)
1204 crow += (int) (legchar & 0xff);
1205
1206 if (!crow || !*crow)
1207 *workptr++ = 0xfffd;
1208 else
1209 *workptr++ = (cups_utf32_t)*crow;
c9fc04c6 1210
e07d4801 1211 DEBUG_printf(("9conv_vbcs_to_utf8: %02X %02X => %08X",
c9fc04c6 1212 src[-2], src[-1], (unsigned)workptr[-1]));
e1d6a774 1213 continue;
ef416fc2 1214 }
1215
1216 /*
e1d6a774 1217 * Fetch 3-byte or 4-byte legacy character...
ef416fc2 1218 */
e1d6a774 1219
1220 if (vmap->lead3char[(int)leadchar] == leadchar)
ef416fc2 1221 {
e1d6a774 1222 if (!*src || !src[1])
c9fc04c6 1223 {
e07d4801 1224 DEBUG_puts("8conv_vbcs_to_utf8: Returning -1 (short string 2)");
c9fc04c6 1225
e1d6a774 1226 return (-1);
c9fc04c6 1227 }
e1d6a774 1228
1229 legchar = (legchar << 8) | *src++;
1230 legchar = (legchar << 8) | *src++;
ef416fc2 1231 }
e1d6a774 1232 else if (vmap->lead4char[(int)leadchar] == leadchar)
1233 {
1234 if (!*src || !src[1] || !src[2])
c9fc04c6 1235 {
e07d4801 1236 DEBUG_puts("8conv_vbcs_to_utf8: Returning -1 (short string 3)");
c9fc04c6 1237
e1d6a774 1238 return (-1);
c9fc04c6 1239 }
e1d6a774 1240
1241 legchar = (legchar << 8) | *src++;
1242 legchar = (legchar << 8) | *src++;
1243 legchar = (legchar << 8) | *src++;
1244 }
1245 else
c9fc04c6 1246 {
e07d4801 1247 DEBUG_puts("8conv_vbcs_to_utf8: Returning -1 (bad character)");
c9fc04c6 1248
e1d6a774 1249 return (-1);
c9fc04c6 1250 }
ef416fc2 1251
1252 /*
e1d6a774 1253 * Find 3-byte or 4-byte legacy character...
ef416fc2 1254 */
e1d6a774 1255
1256 wide2uni = (_cups_wide2uni_t *)bsearch(&legchar,
1257 vmap->wide2uni,
1258 vmap->widecount,
1259 sizeof(_cups_wide2uni_t),
1260 compare_wide);
ef416fc2 1261
1262 /*
e1d6a774 1263 * Convert unknown character to Replacement Character...
ef416fc2 1264 */
e1d6a774 1265
1266 if (!wide2uni || !wide2uni->unichar)
1267 *workptr++ = 0xfffd;
1268 else
1269 *workptr++ = wide2uni->unichar;
c9fc04c6
MS
1270
1271 if (vmap->lead3char[(int)leadchar] == leadchar)
e07d4801 1272 DEBUG_printf(("9conv_vbcs_to_utf8: %02X %02X %02X => %08X",
c9fc04c6
MS
1273 src[-3], src[-2], src[-1], (unsigned)workptr[-1]));
1274 else
e07d4801 1275 DEBUG_printf(("9conv_vbcs_to_utf8: %02X %02X %02X %02X => %08X",
c9fc04c6 1276 src[-4], src[-3], src[-2], src[-1], (unsigned)workptr[-1]));
ef416fc2 1277 }
e1d6a774 1278
1279 *workptr = 0;
1280
d6ae789d 1281 vmap->used --;
e1d6a774 1282
e07d4801 1283 DEBUG_printf(("9conv_vbcs_to_utf8: Converting %d UTF-32 characters to UTF-8",
c9fc04c6
MS
1284 (int)(workptr - work)));
1285
e1d6a774 1286 /*
1287 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
1288 */
1289
1290 return (cupsUTF32ToUTF8(dest, work, maxout));
ef416fc2 1291}
1292
e1d6a774 1293
ef416fc2 1294/*
e1d6a774 1295 * 'free_sbcs_charmap()' - Free memory used by a single byte character set.
ef416fc2 1296 */
e1d6a774 1297
1298static void
1299free_sbcs_charmap(_cups_cmap_t *cmap) /* I - Character set */
ef416fc2 1300{
e1d6a774 1301 int i; /* Looping variable */
ef416fc2 1302
ef416fc2 1303
e1d6a774 1304 for (i = 0; i < 256; i ++)
1305 if (cmap->uni2char[i])
1306 free(cmap->uni2char[i]);
1307
1308 free(cmap);
1309}
1310
1311
1312/*
1313 * 'free_vbcs_charmap()' - Free memory used by a variable byte character set.
1314 */
1315
1316static void
1317free_vbcs_charmap(_cups_vmap_t *vmap) /* I - Character set */
1318{
1319 int i; /* Looping variable */
1320
1321
1322 for (i = 0; i < 256; i ++)
1323 if (vmap->char2uni[i])
1324 free(vmap->char2uni[i]);
1325
1326 for (i = 0; i < 256; i ++)
1327 if (vmap->uni2char[i])
1328 free(vmap->uni2char[i]);
1329
1330 if (vmap->wide2uni)
1331 free(vmap->wide2uni);
1332
1333 free(vmap);
1334}
1335
1336
d6ae789d 1337/*
1338 * 'get_charmap()' - Lookup or get a character set map (private).
1339 *
1340 * This code handles single-byte (SBCS), double-byte (DBCS), and
1341 * variable-byte (VBCS) character sets _without_ charset escapes...
1342 * This code does not handle multiple-byte character sets (MBCS)
1343 * (such as ISO-2022-JP) with charset switching via escapes...
1344 */
1345
1346
d09495fa 1347static void * /* O - Charset map pointer */
d6ae789d 1348get_charmap(
1349 const cups_encoding_t encoding) /* I - Encoding */
1350{
1351 char filename[1024]; /* Filename for charset map file */
1352 _cups_globals_t *cg = _cupsGlobals(); /* Global data */
1353
1354
e07d4801 1355 DEBUG_printf(("7get_charmap(encoding=%d)", encoding));
c9fc04c6 1356
d6ae789d 1357 /*
1358 * Get the data directory and charset map name...
1359 */
1360
1361 snprintf(filename, sizeof(filename), "%s/charmaps/%s.txt",
1362 cg->cups_datadir, _cupsEncodingName(encoding));
1363
e07d4801 1364 DEBUG_printf(("9get_charmap: filename=\"%s\"", filename));
d6ae789d 1365
1366 /*
1367 * Read charset map input file into cache...
1368 */
1369
1370 if (encoding < CUPS_ENCODING_SBCS_END)
1371 return (get_sbcs_charmap(encoding, filename));
1372 else if (encoding < CUPS_ENCODING_VBCS_END)
1373 return (get_vbcs_charmap(encoding, filename));
1374 else
1375 return (NULL);
1376}
1377
1378
e1d6a774 1379/*
1380 * 'get_charmap_count()' - Count lines in a charmap file.
1381 */
1382
1383static int /* O - Count or -1 on error */
1384get_charmap_count(cups_file_t *fp) /* I - File to read from */
1385{
1386 int count; /* Number of lines */
1387 char line[256]; /* Line from input map file */
ef416fc2 1388
ef416fc2 1389
1390 /*
e1d6a774 1391 * Count lines in map input file...
ef416fc2 1392 */
ef416fc2 1393
e1d6a774 1394 count = 0;
ef416fc2 1395
e1d6a774 1396 while (cupsFileGets(fp, line, sizeof(line)))
1397 if (line[0] == '0')
1398 count ++;
ef416fc2 1399
e1d6a774 1400 /*
1401 * Return the number of lines...
1402 */
1403
1404 if (count > 0)
1405 return (count);
1406 else
1407 return (-1);
ef416fc2 1408}
1409
e1d6a774 1410
ef416fc2 1411/*
e1d6a774 1412 * 'get_sbcs_charmap()' - Get SBCS Charmap.
ef416fc2 1413 */
e1d6a774 1414
1415static _cups_cmap_t * /* O - Charmap or 0 on error */
1416get_sbcs_charmap(
1417 const cups_encoding_t encoding, /* I - Charmap Encoding */
1418 const char *filename) /* I - Charmap Filename */
ef416fc2 1419{
e1d6a774 1420 unsigned long legchar; /* Legacy character value */
1421 cups_utf32_t unichar; /* Unicode character value */
1422 _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
1423 cups_file_t *fp; /* Charset map file pointer */
1424 char *s; /* Line parsing pointer */
1425 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1426 cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
1427 char line[256]; /* Line from charset map file */
e1d6a774 1428
ef416fc2 1429
1430 /*
e1d6a774 1431 * See if we already have this SBCS charset map loaded...
ef416fc2 1432 */
e1d6a774 1433
e07d4801 1434 DEBUG_printf(("7get_sbcs_charmap(encoding=%d, filename=\"%s\")", encoding,
c9fc04c6
MS
1435 filename));
1436
d6ae789d 1437 for (cmap = cmap_cache; cmap; cmap = cmap->next)
e1d6a774 1438 {
1439 if (cmap->encoding == encoding)
1440 {
1441 cmap->used ++;
e07d4801 1442 DEBUG_printf(("8get_sbcs_charmap: Returning existing cmap=%p", cmap));
d6ae789d 1443
e1d6a774 1444 return ((void *)cmap);
1445 }
1446 }
ef416fc2 1447
1448 /*
e1d6a774 1449 * Open SBCS charset map input file...
ef416fc2 1450 */
e1d6a774 1451
1452 if ((fp = cupsFileOpen(filename, "r")) == NULL)
c9fc04c6 1453 {
e07d4801 1454 DEBUG_printf(("8get_sbcs_charmap: Returning NULL (%s)", strerror(errno)));
c9fc04c6 1455
e1d6a774 1456 return (NULL);
c9fc04c6 1457 }
ef416fc2 1458
1459 /*
e1d6a774 1460 * Allocate memory for SBCS charset map...
ef416fc2 1461 */
e1d6a774 1462
1463 if ((cmap = (_cups_cmap_t *)calloc(1, sizeof(_cups_cmap_t))) == NULL)
1464 {
1465 cupsFileClose(fp);
e07d4801 1466 DEBUG_puts("8get_sbcs_charmap: Returning NULL (Unable to allocate memory)");
d6ae789d 1467
e1d6a774 1468 return (NULL);
1469 }
1470
1471 cmap->used ++;
1472 cmap->encoding = encoding;
ef416fc2 1473
1474 /*
e1d6a774 1475 * Save SBCS charset map into memory for transcoding...
ef416fc2 1476 */
e1d6a774 1477
1478 while (cupsFileGets(fp, line, sizeof(line)))
ef416fc2 1479 {
e1d6a774 1480 if (line[0] != '0')
1481 continue;
1482
1483 legchar = strtol(line, &s, 16);
1484 if (legchar < 0 || legchar > 0xff)
1485 goto sbcs_error;
1486
1487 unichar = strtol(s, NULL, 16);
bf3816c7 1488 if (unichar < 0 || unichar > 0x10ffff)
e1d6a774 1489 goto sbcs_error;
ef416fc2 1490
1491 /*
e1d6a774 1492 * Save legacy to Unicode mapping in direct lookup table...
ef416fc2 1493 */
e1d6a774 1494
1495 crow = cmap->char2uni + legchar;
1496 *crow = (cups_ucs2_t)(unichar & 0xffff);
ef416fc2 1497
1498 /*
e1d6a774 1499 * Save Unicode to legacy mapping in indirect lookup table...
ef416fc2 1500 */
e1d6a774 1501
1502 srow = cmap->uni2char[(unichar >> 8) & 0xff];
1503 if (!srow)
ef416fc2 1504 {
e1d6a774 1505 srow = (cups_sbcs_t *)calloc(256, sizeof(cups_sbcs_t));
1506 if (!srow)
1507 goto sbcs_error;
1508
1509 cmap->uni2char[(unichar >> 8) & 0xff] = srow;
ef416fc2 1510 }
1511
e1d6a774 1512 srow += unichar & 0xff;
1513
ef416fc2 1514 /*
e1d6a774 1515 * Convert Replacement Character to visible replacement...
ef416fc2 1516 */
e1d6a774 1517
1518 if (unichar == 0xfffd)
1519 legchar = (unsigned long)'?';
ef416fc2 1520
1521 /*
e1d6a774 1522 * First (oldest) legacy character uses Unicode mapping cell...
ef416fc2 1523 */
ef416fc2 1524
e1d6a774 1525 if (!*srow)
1526 *srow = (cups_sbcs_t)legchar;
1527 }
ef416fc2 1528
e1d6a774 1529 cupsFileClose(fp);
1530
ef416fc2 1531 /*
e1d6a774 1532 * Add it to the cache and return...
ef416fc2 1533 */
e1d6a774 1534
d6ae789d 1535 cmap->next = cmap_cache;
1536 cmap_cache = cmap;
e1d6a774 1537
e07d4801 1538 DEBUG_printf(("8get_sbcs_charmap: Returning new cmap=%p", cmap));
e1d6a774 1539
1540 return (cmap);
ef416fc2 1541
1542 /*
e1d6a774 1543 * If we get here, there was an error in the cmap file...
ef416fc2 1544 */
e1d6a774 1545
1546 sbcs_error:
1547
1548 free_sbcs_charmap(cmap);
1549
1550 cupsFileClose(fp);
1551
e07d4801 1552 DEBUG_puts("8get_sbcs_charmap: Returning NULL (Read/format error)");
e1d6a774 1553
1554 return (NULL);
1555}
1556
1557
1558/*
1559 * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
1560 */
1561
1562static _cups_vmap_t * /* O - Charmap or 0 on error */
1563get_vbcs_charmap(
1564 const cups_encoding_t encoding, /* I - Charmap Encoding */
1565 const char *filename) /* I - Charmap Filename */
1566{
1567 _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
1568 cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
1569 cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
1570 _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
1571 cups_sbcs_t leadchar; /* Lead char of 2-byte legacy char */
1572 unsigned long legchar; /* Legacy character value */
1573 cups_utf32_t unichar; /* Unicode character value */
1574 int mapcount; /* Count of lines in charmap file */
1575 cups_file_t *fp; /* Charset map file pointer */
1576 char *s; /* Line parsing pointer */
1577 char line[256]; /* Line from charset map file */
1578 int i; /* Loop variable */
09a101d6 1579 int legacy; /* 32-bit legacy char */
e1d6a774 1580
1581
e07d4801 1582 DEBUG_printf(("7get_vbcs_charmap(encoding=%d, filename=\"%s\")\n",
e1d6a774 1583 encoding, filename));
ef416fc2 1584
1585 /*
e1d6a774 1586 * See if we already have this DBCS/VBCS charset map loaded...
ef416fc2 1587 */
ef416fc2 1588
d6ae789d 1589 for (vmap = vmap_cache; vmap; vmap = vmap->next)
e1d6a774 1590 {
1591 if (vmap->encoding == encoding)
ef416fc2 1592 {
e1d6a774 1593 vmap->used ++;
e07d4801 1594 DEBUG_printf(("8get_vbcs_charmap: Returning existing vmap=%p", vmap));
d6ae789d 1595
e1d6a774 1596 return ((void *)vmap);
ef416fc2 1597 }
ef416fc2 1598 }
ef416fc2 1599
1600 /*
e1d6a774 1601 * Open VBCS charset map input file...
ef416fc2 1602 */
ef416fc2 1603
e1d6a774 1604 if ((fp = cupsFileOpen(filename, "r")) == NULL)
1605 {
e07d4801 1606 DEBUG_printf(("8get_vbcs_charmap: Returning NULL (%s)", strerror(errno)));
d6ae789d 1607
e1d6a774 1608 return (NULL);
1609 }
ef416fc2 1610
1611 /*
e1d6a774 1612 * Count lines in charmap file...
ef416fc2 1613 */
e1d6a774 1614
1615 if ((mapcount = get_charmap_count(fp)) <= 0)
1616 {
e07d4801 1617 DEBUG_puts("8get_vbcs_charmap: Unable to get charmap count!");
d6ae789d 1618
91c84a35
MS
1619 cupsFileClose(fp);
1620
e1d6a774 1621 return (NULL);
1622 }
1623
e07d4801 1624 DEBUG_printf(("8get_vbcs_charmap: mapcount=%d", mapcount));
ef416fc2 1625
1626 /*
e1d6a774 1627 * Allocate memory for DBCS/VBCS charset map...
ef416fc2 1628 */
e1d6a774 1629
1630 if ((vmap = (_cups_vmap_t *)calloc(1, sizeof(_cups_vmap_t))) == NULL)
1631 {
e07d4801 1632 DEBUG_puts("8get_vbcs_charmap: Unable to allocate memory!");
d6ae789d 1633
91c84a35
MS
1634 cupsFileClose(fp);
1635
e1d6a774 1636 return (NULL);
1637 }
1638
1639 vmap->used ++;
1640 vmap->encoding = encoding;
ef416fc2 1641
1642 /*
e1d6a774 1643 * Save DBCS/VBCS charset map into memory for transcoding...
ef416fc2 1644 */
e1d6a774 1645
e1d6a774 1646 wide2uni = NULL;
1647
1648 cupsFileRewind(fp);
1649
09a101d6 1650 i = 0;
1651 legacy = 0;
e1d6a774 1652
1653 while (cupsFileGets(fp, line, sizeof(line)))
ef416fc2 1654 {
e1d6a774 1655 if (line[0] != '0')
1656 continue;
1657
1658 legchar = strtoul(line, &s, 16);
1659 if (legchar == ULONG_MAX)
1660 goto vbcs_error;
1661
1662 unichar = strtol(s, NULL, 16);
bf3816c7 1663 if (unichar < 0 || unichar > 0x10ffff)
e1d6a774 1664 goto vbcs_error;
1665
1666 i ++;
1667
e07d4801
MS
1668 DEBUG_printf(("9get_vbcs_charmap: i=%d, legchar=0x%08lx, unichar=0x%04x", i,
1669 legchar, (unsigned)unichar));
ef416fc2 1670
1671 /*
e1d6a774 1672 * Save lead char of 2/3/4-byte legacy char...
ef416fc2 1673 */
e1d6a774 1674
c9fc04c6 1675 if (legchar > 0xffffff)
ef416fc2 1676 {
c9fc04c6
MS
1677 leadchar = (cups_sbcs_t)(legchar >> 24);
1678 vmap->lead4char[leadchar] = leadchar;
e1d6a774 1679 }
c9fc04c6 1680 else if (legchar > 0xffff)
e1d6a774 1681 {
1682 leadchar = (cups_sbcs_t)(legchar >> 16);
1683 vmap->lead3char[leadchar] = leadchar;
1684 }
bf3816c7 1685 else
e1d6a774 1686 {
c9fc04c6
MS
1687 leadchar = (cups_sbcs_t)(legchar >> 8);
1688 vmap->lead2char[leadchar] = leadchar;
ef416fc2 1689 }
1690
1691 /*
e1d6a774 1692 * Save Legacy to Unicode mapping...
ef416fc2 1693 */
e1d6a774 1694
1695 if (legchar <= 0xffff)
ef416fc2 1696 {
ef416fc2 1697 /*
e1d6a774 1698 * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
ef416fc2 1699 */
e1d6a774 1700
1701 crow = vmap->char2uni[(int)leadchar];
1702 if (!crow)
1703 {
1704 crow = (cups_ucs2_t *)calloc(256, sizeof(cups_ucs2_t));
1705 if (!crow)
1706 goto vbcs_error;
1707
1708 vmap->char2uni[(int)leadchar] = crow;
1709 }
1710
1711 crow[(int)(legchar & 0xff)] = (cups_ucs2_t)unichar;
1712 }
1713 else
1714 {
1715 /*
1716 * Save VBCS 32-bit to Unicode mapping in sorted list table...
1717 */
1718
09a101d6 1719 if (!legacy)
e1d6a774 1720 {
09a101d6 1721 legacy = 1;
e1d6a774 1722 vmap->widecount = (mapcount - i + 1);
1723 wide2uni = (_cups_wide2uni_t *)calloc(vmap->widecount,
1724 sizeof(_cups_wide2uni_t));
1725 if (!wide2uni)
1726 goto vbcs_error;
1727
1728 vmap->wide2uni = wide2uni;
1729 }
1730
1731 wide2uni->widechar = (cups_vbcs_t)legchar;
1732 wide2uni->unichar = (cups_ucs2_t)unichar;
1733 wide2uni ++;
ef416fc2 1734 }
1735
1736 /*
e1d6a774 1737 * Save Unicode to legacy mapping in indirect lookup table...
ef416fc2 1738 */
e1d6a774 1739
1740 vrow = vmap->uni2char[(int)((unichar >> 8) & 0xff)];
1741 if (!vrow)
ef416fc2 1742 {
e1d6a774 1743 vrow = (cups_vbcs_t *)calloc(256, sizeof(cups_vbcs_t));
1744 if (!vrow)
1745 goto vbcs_error;
1746
1747 vmap->uni2char[(int) ((unichar >> 8) & 0xff)] = vrow;
ef416fc2 1748 }
e1d6a774 1749
1750 vrow += (int)(unichar & 0xff);
ef416fc2 1751
1752 /*
e1d6a774 1753 * Convert Replacement Character to visible replacement...
ef416fc2 1754 */
e1d6a774 1755
1756 if (unichar == 0xfffd)
1757 legchar = (unsigned long)'?';
ef416fc2 1758
1759 /*
e1d6a774 1760 * First (oldest) legacy character uses Unicode mapping cell...
ef416fc2 1761 */
e1d6a774 1762
1763 if (!*vrow)
1764 *vrow = (cups_vbcs_t)legchar;
ef416fc2 1765 }
e1d6a774 1766
1767 vmap->charcount = (i - vmap->widecount);
1768
1769 cupsFileClose(fp);
ef416fc2 1770
1771 /*
e1d6a774 1772 * Add it to the cache and return...
ef416fc2 1773 */
ef416fc2 1774
c9fc04c6 1775 vmap->next = vmap_cache;
d6ae789d 1776 vmap_cache = vmap;
e1d6a774 1777
e07d4801 1778 DEBUG_printf(("8get_vbcs_charmap: Returning new vmap=%p", vmap));
e1d6a774 1779
1780 return (vmap);
1781
1782 /*
1783 * If we get here, the file contains errors...
1784 */
1785
1786 vbcs_error:
1787
1788 free_vbcs_charmap(vmap);
1789
1790 cupsFileClose(fp);
1791
e07d4801 1792 DEBUG_puts("8get_vbcs_charmap: Returning NULL (Read/format error)");
e1d6a774 1793
1794 return (NULL);
ef416fc2 1795}
1796
1797
1798/*
75bd9771 1799 * End of "$Id: transcode.c 7560 2008-05-13 06:34:04Z mike $"
ef416fc2 1800 */