cups/transcode.c

   1 /*
   2  * "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
   3  *
   4  *   Transcoding support for CUPS.
   5  *
   6  *   Copyright 2007-2010 by Apple Inc.
   7  *   Copyright 1997-2007 by Easy Software Products.
   8  *
   9  *   These coded instructions, statements, and computer programs are the
  10  *   property of Apple Inc. and are protected by Federal copyright
  11  *   law.  Distribution and use rights are outlined in the file "LICENSE.txt"
  12  *   which should have been included with this file.  If this file is
  13  *   missing or damaged, see the license at "http://www.cups.org/".
  14  *
  15  *   This file is subject to the Apple OS-Developed Software exception.
  16  *
  17  * Contents:
  18  *
  19  *   _cupsCharmapFlush() - Flush all character set maps out of cache.
  20  *   cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
  21  *   cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
  22  *   cupsUTF8ToUTF32()   - Convert UTF-8 to UTF-32.
  23  *   cupsUTF32ToUTF8()   - Convert UTF-32 to UTF-8.
  24  */
  25
  26 /*
  27  * Include necessary headers...
  28  */
  29
  30 #include "cups-private.h"
  31 #include <limits.h>
  32 #include <time.h>
  33 #ifdef HAVE_ICONV_H
  34 #  include <iconv.h>
  35 #endif /* HAVE_ICONV_H */
  36
  37
  38 /*
  39  * Local globals...
  40  */
  41
  42 #ifdef HAVE_ICONV_H
  43 static _cups_mutex_t    map_mutex = _CUPS_MUTEX_INITIALIZER;
  44                                         /* Mutex to control access to maps */
  45 static iconv_t          map_from_utf8 = (iconv_t)-1;
  46                                         /* Convert from UTF-8 to charset */
  47 static iconv_t          map_to_utf8 = (iconv_t)-1;
  48                                         /* Convert from charset to UTF-8 */
  49 static cups_encoding_t  map_encoding = CUPS_AUTO_ENCODING;
  50                                         /* Which charset is cached */
  51 #endif /* HAVE_ICONV_H */
  52
  53
  54 /*
  55  * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
  56  */
  57
  58 void
  59 _cupsCharmapFlush(void)
  60 {
  61 #ifdef HAVE_ICONV_H
  62   if (map_from_utf8 != (iconv_t)-1)
  63   {
  64     iconv_close(map_from_utf8);
  65     map_from_utf8 = (iconv_t)-1;
  66   }
  67
  68   if (map_to_utf8 != (iconv_t)-1)
  69   {
  70     iconv_close(map_to_utf8);
  71     map_to_utf8 = (iconv_t)-1;
  72   }
  73
  74   map_encoding = CUPS_AUTO_ENCODING;
  75 #endif /* HAVE_ICONV_H */
  76 }
  77
  78
  79 /*
  80  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
  81  */
  82
  83 int                                     /* O - Count or -1 on error */
  84 cupsCharsetToUTF8(
  85     cups_utf8_t           *dest,        /* O - Target string */
  86     const char            *src,         /* I - Source string */
  87     const int             maxout,       /* I - Max output */
  88     const cups_encoding_t encoding)     /* I - Encoding */
  89 {
  90   cups_utf8_t   *destptr;               /* Pointer into UTF-8 buffer */
  91 #ifdef HAVE_ICONV_H
  92   size_t        srclen,                 /* Length of source string */
  93                 outBytesLeft;           /* Bytes remaining in output buffer */
  94 #endif /* HAVE_ICONV_H */
  95
  96
  97  /*
  98   * Check for valid arguments...
  99   */
 100
 101   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
 102                 dest, src, maxout, encoding));
 103
 104   if (!dest || !src || maxout < 1)
 105   {
 106     if (dest)
 107       *dest = '\0';
 108
 109     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
 110     return (-1);
 111   }
 112
 113  /*
 114   * Handle identity conversions...
 115   */
 116
 117   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
 118       encoding >= CUPS_ENCODING_VBCS_END)
 119   {
 120     strlcpy((char *)dest, src, maxout);
 121     return ((int)strlen((char *)dest));
 122   }
 123
 124  /*
 125   * Handle ISO-8859-1 to UTF-8 directly...
 126   */
 127
 128   destptr = dest;
 129
 130   if (encoding == CUPS_ISO8859_1)
 131   {
 132     int         ch;                     /* Character from string */
 133     cups_utf8_t *destend;               /* End of UTF-8 buffer */
 134
 135
 136     destend = dest + maxout - 2;
 137
 138     while (*src && destptr < destend)
 139     {
 140       ch = *src++ & 255;
 141
 142       if (ch & 128)
 143       {
 144         *destptr++ = 0xc0 | (ch >> 6);
 145         *destptr++ = 0x80 | (ch & 0x3f);
 146       }
 147       else
 148         *destptr++ = ch;
 149     }
 150
 151     *destptr = '\0';
 152
 153     return ((int)(destptr - dest));
 154   }
 155
 156  /*
 157   * Convert input legacy charset to UTF-8...
 158   */
 159
 160 #ifdef HAVE_ICONV_H
 161   _cupsMutexLock(&map_mutex);
 162
 163   if (map_encoding != encoding)
 164   {
 165     _cupsCharmapFlush();
 166
 167     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 168     map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
 169     map_encoding     = encoding;
 170   }
 171
 172   if (map_to_utf8 != (iconv_t)-1)
 173   {
 174     srclen       = strlen(src);
 175     outBytesLeft = maxout - 1;
 176
 177     iconv(map_to_utf8, (char **)&src, &srclen, (char **)&destptr,
 178           &outBytesLeft);
 179     *destptr = '\0';
 180
 181     _cupsMutexUnlock(&map_mutex);
 182
 183     return ((int)(destptr - dest));
 184   }
 185
 186   _cupsMutexUnlock(&map_mutex);
 187 #endif /* HAVE_ICONV_H */
 188
 189  /*
 190   * No iconv() support, so error out...
 191   */
 192
 193   *destptr = '\0';
 194
 195   return (-1);
 196 }
 197
 198
 199 /*
 200  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
 201  */
 202
 203 int                                     /* O - Count or -1 on error */
 204 cupsUTF8ToCharset(
 205     char                  *dest,        /* O - Target string */
 206     const cups_utf8_t     *src,         /* I - Source string */
 207     const int             maxout,       /* I - Max output */
 208     const cups_encoding_t encoding)     /* I - Encoding */
 209 {
 210   char          *destptr;               /* Pointer into destination */
 211 #ifdef HAVE_ICONV_H
 212   size_t        srclen,                 /* Length of source string */
 213                 outBytesLeft;           /* Bytes remaining in output buffer */
 214 #endif /* HAVE_ICONV_H */
 215
 216
 217  /*
 218   * Check for valid arguments...
 219   */
 220
 221   if (!dest || !src || maxout < 1)
 222   {
 223     if (dest)
 224       *dest = '\0';
 225
 226     return (-1);
 227   }
 228
 229  /*
 230   * Handle identity conversions...
 231   */
 232
 233   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
 234       encoding >= CUPS_ENCODING_VBCS_END)
 235   {
 236     strlcpy(dest, (char *)src, maxout);
 237     return ((int)strlen(dest));
 238   }
 239
 240  /*
 241   * Handle UTF-8 to ISO-8859-1 directly...
 242   */
 243
 244   destptr = dest;
 245
 246   if (encoding == CUPS_ISO8859_1)
 247   {
 248     int         ch;                     /* Character from string */
 249     char        *destend;               /* End of ISO-8859-1 buffer */
 250
 251
 252     destend = dest + maxout - 1;
 253
 254     while (*src && destptr < destend)
 255     {
 256       ch = *src++;
 257
 258       if ((ch & 0xe0) == 0xc0)
 259       {
 260         ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
 261
 262         if (ch < 256)
 263           *destptr++ = ch;
 264         else
 265           *destptr++ = '?';
 266       }
 267       else if ((ch & 0xf0) == 0xe0 ||
 268                (ch & 0xf8) == 0xf0)
 269         *destptr++ = '?';
 270       else if (!(ch & 0x80))
 271         *destptr++ = ch;
 272     }
 273
 274     *destptr = '\0';
 275
 276     return ((int)(destptr - dest));
 277   }
 278
 279 #ifdef HAVE_ICONV_H
 280  /*
 281   * Convert input UTF-8 to legacy charset...
 282   */
 283
 284   _cupsMutexLock(&map_mutex);
 285
 286   if (map_encoding != encoding)
 287   {
 288     _cupsCharmapFlush();
 289
 290     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 291     map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
 292     map_encoding  = encoding;
 293   }
 294
 295   if (map_from_utf8 != (iconv_t)-1)
 296   {
 297     srclen       = strlen((char *)src);
 298     outBytesLeft = maxout - 1;
 299
 300     iconv(map_from_utf8, (char **)&src, &srclen, &destptr, &outBytesLeft);
 301     *destptr = '\0';
 302
 303     _cupsMutexUnlock(&map_mutex);
 304
 305     return ((int)(destptr - dest));
 306   }
 307
 308   _cupsMutexUnlock(&map_mutex);
 309 #endif /* HAVE_ICONV_H */
 310
 311  /*
 312   * No iconv() support, so error out...
 313   */
 314
 315   *destptr = '\0';
 316
 317   return (-1);
 318 }
 319
 320
 321 /*
 322  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
 323  *
 324  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 325  *
 326  *   UTF-32 char     UTF-8 char(s)
 327  *   --------------------------------------------------
 328  *        0 to 127 = 0xxxxxxx (US-ASCII)
 329  *     128 to 2047 = 110xxxxx 10yyyyyy
 330  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 331  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 332  *
 333  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 334  * which would convert to five- or six-octet UTF-8 sequences...
 335  */
 336
 337 int                                     /* O - Count or -1 on error */
 338 cupsUTF8ToUTF32(
 339     cups_utf32_t      *dest,            /* O - Target string */
 340     const cups_utf8_t *src,             /* I - Source string */
 341     const int         maxout)           /* I - Max output */
 342 {
 343   int           i;                      /* Looping variable */
 344   cups_utf8_t   ch;                     /* Character value */
 345   cups_utf8_t   next;                   /* Next character value */
 346   cups_utf32_t  ch32;                   /* UTF-32 character value */
 347
 348
 349  /*
 350   * Check for valid arguments and clear output...
 351   */
 352
 353   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
 354                 src, maxout));
 355
 356   if (dest)
 357     *dest = 0;
 358
 359   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
 360   {
 361     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
 362
 363     return (-1);
 364   }
 365
 366  /*
 367   * Convert input UTF-8 to output UTF-32...
 368   */
 369
 370   for (i = maxout - 1; *src && i > 0; i --)
 371   {
 372     ch = *src++;
 373
 374    /*
 375     * Convert UTF-8 character(s) to UTF-32 character...
 376     */
 377
 378     if (!(ch & 0x80))
 379     {
 380      /*
 381       * One-octet UTF-8 <= 127 (US-ASCII)...
 382       */
 383
 384       *dest++ = ch;
 385
 386       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
 387       continue;
 388     }
 389     else if ((ch & 0xe0) == 0xc0)
 390     {
 391      /*
 392       * Two-octet UTF-8 <= 2047 (Latin-x)...
 393       */
 394
 395       next = *src++;
 396       if ((next & 0xc0) != 0x80)
 397       {
 398         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 399
 400         return (-1);
 401       }
 402
 403       ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
 404
 405      /*
 406       * Check for non-shortest form (invalid UTF-8)...
 407       */
 408
 409       if (ch32 < 0x80)
 410       {
 411         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 412
 413         return (-1);
 414       }
 415
 416       *dest++ = ch32;
 417
 418       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
 419                     src[-2], src[-1], (unsigned)ch32));
 420     }
 421     else if ((ch & 0xf0) == 0xe0)
 422     {
 423      /*
 424       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 425       */
 426
 427       next = *src++;
 428       if ((next & 0xc0) != 0x80)
 429       {
 430         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 431
 432         return (-1);
 433       }
 434
 435       ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
 436
 437       next = *src++;
 438       if ((next & 0xc0) != 0x80)
 439       {
 440         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 441
 442         return (-1);
 443       }
 444
 445       ch32 = (ch32 << 6) | (next & 0x3f);
 446
 447      /*
 448       * Check for non-shortest form (invalid UTF-8)...
 449       */
 450
 451       if (ch32 < 0x800)
 452       {
 453         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 454
 455         return (-1);
 456       }
 457
 458       *dest++ = ch32;
 459
 460       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
 461                     src[-3], src[-2], src[-1], (unsigned)ch32));
 462     }
 463     else if ((ch & 0xf8) == 0xf0)
 464     {
 465      /*
 466       * Four-octet UTF-8...
 467       */
 468
 469       next = *src++;
 470       if ((next & 0xc0) != 0x80)
 471       {
 472         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 473
 474         return (-1);
 475       }
 476
 477       ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
 478
 479       next = *src++;
 480       if ((next & 0xc0) != 0x80)
 481       {
 482         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 483
 484         return (-1);
 485       }
 486
 487       ch32 = (ch32 << 6) | (next & 0x3f);
 488
 489       next = *src++;
 490       if ((next & 0xc0) != 0x80)
 491       {
 492         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 493
 494         return (-1);
 495       }
 496
 497       ch32 = (ch32 << 6) | (next & 0x3f);
 498
 499      /*
 500       * Check for non-shortest form (invalid UTF-8)...
 501       */
 502
 503       if (ch32 < 0x10000)
 504       {
 505         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 506
 507         return (-1);
 508       }
 509
 510       *dest++ = ch32;
 511
 512       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
 513                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
 514     }
 515     else
 516     {
 517      /*
 518       * More than 4-octet (invalid UTF-8 sequence)...
 519       */
 520
 521       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 522
 523       return (-1);
 524     }
 525
 526    /*
 527     * Check for UTF-16 surrogate (illegal UTF-8)...
 528     */
 529
 530     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
 531       return (-1);
 532   }
 533
 534   *dest = 0;
 535
 536   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
 537
 538   return (maxout - 1 - i);
 539 }
 540
 541
 542 /*
 543  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
 544  *
 545  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 546  *
 547  *   UTF-32 char     UTF-8 char(s)
 548  *   --------------------------------------------------
 549  *        0 to 127 = 0xxxxxxx (US-ASCII)
 550  *     128 to 2047 = 110xxxxx 10yyyyyy
 551  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 552  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 553  *
 554  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 555  * which would convert to five- or six-octet UTF-8 sequences...
 556  */
 557
 558 int                                     /* O - Count or -1 on error */
 559 cupsUTF32ToUTF8(
 560     cups_utf8_t        *dest,           /* O - Target string */
 561     const cups_utf32_t *src,            /* I - Source string */
 562     const int          maxout)          /* I - Max output */
 563 {
 564   cups_utf8_t   *start;                 /* Start of destination string */
 565   int           i;                      /* Looping variable */
 566   int           swap;                   /* Byte-swap input to output */
 567   cups_utf32_t  ch;                     /* Character value */
 568
 569
 570  /*
 571   * Check for valid arguments and clear output...
 572   */
 573
 574   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
 575                 maxout));
 576
 577   if (dest)
 578     *dest = '\0';
 579
 580   if (!dest || !src || maxout < 1)
 581   {
 582     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
 583
 584     return (-1);
 585   }
 586
 587  /*
 588   * Check for leading BOM in UTF-32 and inverted BOM...
 589   */
 590
 591   start = dest;
 592   swap  = *src == 0xfffe0000;
 593
 594   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
 595
 596   if (*src == 0xfffe0000 || *src == 0xfeff)
 597     src ++;
 598
 599  /*
 600   * Convert input UTF-32 to output UTF-8...
 601   */
 602
 603   for (i = maxout - 1; *src && i > 0;)
 604   {
 605     ch = *src++;
 606
 607    /*
 608     * Byte swap input UTF-32, if necessary...
 609     * (only byte-swapping 24 of 32 bits)
 610     */
 611
 612     if (swap)
 613       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
 614
 615    /*
 616     * Check for beyond Plane 16 (invalid UTF-32)...
 617     */
 618
 619     if (ch > 0x10ffff)
 620     {
 621       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
 622
 623       return (-1);
 624     }
 625
 626    /*
 627     * Convert UTF-32 character to UTF-8 character(s)...
 628     */
 629
 630     if (ch < 0x80)
 631     {
 632      /*
 633       * One-octet UTF-8 <= 127 (US-ASCII)...
 634       */
 635
 636       *dest++ = (cups_utf8_t)ch;
 637       i --;
 638
 639       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
 640     }
 641     else if (ch < 0x800)
 642     {
 643      /*
 644       * Two-octet UTF-8 <= 2047 (Latin-x)...
 645       */
 646
 647       if (i < 2)
 648       {
 649         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
 650
 651         return (-1);
 652       }
 653
 654       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
 655       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 656       i -= 2;
 657
 658       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
 659                     dest[-2], dest[-1]));
 660     }
 661     else if (ch < 0x10000)
 662     {
 663      /*
 664       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 665       */
 666
 667       if (i < 3)
 668       {
 669         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
 670
 671         return (-1);
 672       }
 673
 674       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
 675       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 676       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 677       i -= 3;
 678
 679       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
 680                     dest[-3], dest[-2], dest[-1]));
 681     }
 682     else
 683     {
 684      /*
 685       * Four-octet UTF-8...
 686       */
 687
 688       if (i < 4)
 689       {
 690         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
 691
 692         return (-1);
 693       }
 694
 695       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
 696       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
 697       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 698       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 699       i -= 4;
 700
 701       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
 702                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
 703     }
 704   }
 705
 706   *dest = '\0';
 707
 708   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
 709
 710   return ((int)(dest - start));
 711 }
 712
 713
 714 /*
 715  * End of "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
 716  */