cups/transcode.c

   1 /*
   2  * "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
   3  *
   4  *   Transcoding support for CUPS.
   5  *
   6  *   Copyright 2007-2010 by Apple Inc.
   7  *   Copyright 1997-2007 by Easy Software Products.
   8  *
   9  *   These coded instructions, statements, and computer programs are the
  10  *   property of Apple Inc. and are protected by Federal copyright
  11  *   law.  Distribution and use rights are outlined in the file "LICENSE.txt"
  12  *   which should have been included with this file.  If this file is
  13  *   missing or damaged, see the license at "http://www.cups.org/".
  14  *
  15  *   This file is subject to the Apple OS-Developed Software exception.
  16  *
  17  * Contents:
  18  *
  19  *   _cupsCharmapFlush() - Flush all character set maps out of cache.
  20  *   cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
  21  *   cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
  22  *   cupsUTF8ToUTF32()   - Convert UTF-8 to UTF-32.
  23  *   cupsUTF32ToUTF8()   - Convert UTF-32 to UTF-8.
  24  */
  25
  26 /*
  27  * Include necessary headers...
  28  */
  29
  30 #include "cups-private.h"
  31 #include <limits.h>
  32 #include <time.h>
  33 #ifdef HAVE_ICONV_H
  34 #  include <iconv.h>
  35 #endif /* HAVE_ICONV_H */
  36
  37
  38 /*
  39  * Local globals...
  40  */
  41
  42 #ifdef HAVE_ICONV_H
  43 static _cups_mutex_t    map_mutex = _CUPS_MUTEX_INITIALIZER;
  44                                         /* Mutex to control access to maps */
  45 static iconv_t          map_from_utf8 = (iconv_t)-1;
  46                                         /* Convert from UTF-8 to charset */
  47 static iconv_t          map_to_utf8 = (iconv_t)-1;
  48                                         /* Convert from charset to UTF-8 */
  49 static cups_encoding_t  map_encoding = CUPS_AUTO_ENCODING;
  50                                         /* Which charset is cached */
  51 #endif /* HAVE_ICONV_H */
  52
  53
  54 /*
  55  * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
  56  */
  57
  58 void
  59 _cupsCharmapFlush(void)
  60 {
  61 #ifdef HAVE_ICONV_H
  62   if (map_from_utf8 != (iconv_t)-1)
  63   {
  64     iconv_close(map_from_utf8);
  65     map_from_utf8 = (iconv_t)-1;
  66   }
  67
  68   if (map_to_utf8 != (iconv_t)-1)
  69   {
  70     iconv_close(map_to_utf8);
  71     map_to_utf8 = (iconv_t)-1;
  72   }
  73
  74   map_encoding = CUPS_AUTO_ENCODING;
  75 #endif /* HAVE_ICONV_H */
  76 }
  77
  78
  79 /*
  80  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
  81  */
  82
  83 int                                     /* O - Count or -1 on error */
  84 cupsCharsetToUTF8(
  85     cups_utf8_t           *dest,        /* O - Target string */
  86     const char            *src,         /* I - Source string */
  87     const int             maxout,       /* I - Max output */
  88     const cups_encoding_t encoding)     /* I - Encoding */
  89 {
  90   cups_utf8_t   *destptr;               /* Pointer into UTF-8 buffer */
  91 #ifdef HAVE_ICONV_H
  92   size_t        srclen,                 /* Length of source string */
  93                 outBytesLeft;           /* Bytes remaining in output buffer */
  94 #endif /* HAVE_ICONV_H */
  95
  96
  97  /*
  98   * Check for valid arguments...
  99   */
 100
 101   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
 102                 dest, src, maxout, encoding));
 103
 104   if (!dest || !src || maxout < 1)
 105   {
 106     if (dest)
 107       *dest = '\0';
 108
 109     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
 110     return (-1);
 111   }
 112
 113  /*
 114   * Handle identity conversions...
 115   */
 116
 117   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
 118       encoding >= CUPS_ENCODING_VBCS_END)
 119   {
 120     strlcpy((char *)dest, src, maxout);
 121     return ((int)strlen((char *)dest));
 122   }
 123
 124  /*
 125   * Handle ISO-8859-1 to UTF-8 directly...
 126   */
 127
 128   destptr = dest;
 129
 130   if (encoding == CUPS_ISO8859_1)
 131   {
 132     int         ch;                     /* Character from string */
 133     cups_utf8_t *destend;               /* End of UTF-8 buffer */
 134
 135
 136     destend = dest + maxout - 2;
 137
 138     while (*src && destptr < destend)
 139     {
 140       ch = *src++ & 255;
 141
 142       if (ch & 128)
 143       {
 144         *destptr++ = 0xc0 | (ch >> 6);
 145         *destptr++ = 0x80 | (ch & 0x3f);
 146       }
 147       else
 148         *destptr++ = ch;
 149     }
 150
 151     *destptr = '\0';
 152
 153     return ((int)(destptr - dest));
 154   }
 155
 156  /*
 157   * Convert input legacy charset to UTF-8...
 158   */
 159
 160 #ifdef HAVE_ICONV_H
 161   _cupsMutexLock(&map_mutex);
 162
 163   if (map_encoding != encoding)
 164   {
 165     _cupsCharmapFlush();
 166
 167     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 168     map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
 169     map_encoding     = encoding;
 170   }
 171
 172   if (map_to_utf8 != (iconv_t)-1)
 173   {
 174     srclen       = strlen(src);
 175     outBytesLeft = maxout - 1;
 176
 177     iconv(map_to_utf8, (char **)&src, &srclen, (char **)&destptr,
 178           &outBytesLeft);
 179     *destptr = '\0';
 180
 181     _cupsMutexUnlock(&map_mutex);
 182
 183     return ((int)(destptr - dest));
 184   }
 185
 186   _cupsMutexUnlock(&map_mutex);
 187 #endif /* HAVE_ICONV_H */
 188
 189  /*
 190   * No iconv() support, so error out...
 191   */
 192
 193   *destptr = '\0';
 194
 195   return (-1);
 196 }
 197
 198
 199 /*
 200  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
 201  */
 202
 203 int                                     /* O - Count or -1 on error */
 204 cupsUTF8ToCharset(
 205     char                  *dest,        /* O - Target string */
 206     const cups_utf8_t     *src,         /* I - Source string */
 207     const int             maxout,       /* I - Max output */
 208     const cups_encoding_t encoding)     /* I - Encoding */
 209 {
 210   char          *destptr;               /* Pointer into destination */
 211 #ifdef HAVE_ICONV_H
 212   size_t        srclen,                 /* Length of source string */
 213                 outBytesLeft;           /* Bytes remaining in output buffer */
 214 #endif /* HAVE_ICONV_H */
 215
 216
 217  /*
 218   * Check for valid arguments...
 219   */
 220
 221   if (!dest || !src || maxout < 1)
 222   {
 223     if (dest)
 224       *dest = '\0';
 225
 226     return (-1);
 227   }
 228
 229  /*
 230   * Handle identity conversions...
 231   */
 232
 233   if (encoding == CUPS_UTF8 ||
 234       encoding >= CUPS_ENCODING_VBCS_END)
 235   {
 236     strlcpy(dest, (char *)src, maxout);
 237     return ((int)strlen(dest));
 238   }
 239
 240  /*
 241   * Handle UTF-8 to ISO-8859-1 directly...
 242   */
 243
 244   destptr = dest;
 245
 246   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
 247   {
 248     int         ch,                     /* Character from string */
 249                 maxch;                  /* Maximum character for charset */
 250     char        *destend;               /* End of ISO-8859-1 buffer */
 251
 252     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
 253     destend = dest + maxout - 1;
 254
 255     while (*src && destptr < destend)
 256     {
 257       ch = *src++;
 258
 259       if ((ch & 0xe0) == 0xc0)
 260       {
 261         ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
 262
 263         if (ch < maxch)
 264           *destptr++ = ch;
 265         else
 266           *destptr++ = '?';
 267       }
 268       else if ((ch & 0xf0) == 0xe0 ||
 269                (ch & 0xf8) == 0xf0)
 270         *destptr++ = '?';
 271       else if (!(ch & 0x80))
 272         *destptr++ = ch;
 273     }
 274
 275     *destptr = '\0';
 276
 277     return ((int)(destptr - dest));
 278   }
 279
 280 #ifdef HAVE_ICONV_H
 281  /*
 282   * Convert input UTF-8 to legacy charset...
 283   */
 284
 285   _cupsMutexLock(&map_mutex);
 286
 287   if (map_encoding != encoding)
 288   {
 289     _cupsCharmapFlush();
 290
 291     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 292     map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
 293     map_encoding  = encoding;
 294   }
 295
 296   if (map_from_utf8 != (iconv_t)-1)
 297   {
 298     srclen       = strlen((char *)src);
 299     outBytesLeft = maxout - 1;
 300
 301     iconv(map_from_utf8, (char **)&src, &srclen, &destptr, &outBytesLeft);
 302     *destptr = '\0';
 303
 304     _cupsMutexUnlock(&map_mutex);
 305
 306     return ((int)(destptr - dest));
 307   }
 308
 309   _cupsMutexUnlock(&map_mutex);
 310 #endif /* HAVE_ICONV_H */
 311
 312  /*
 313   * No iconv() support, so error out...
 314   */
 315
 316   *destptr = '\0';
 317
 318   return (-1);
 319 }
 320
 321
 322 /*
 323  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
 324  *
 325  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 326  *
 327  *   UTF-32 char     UTF-8 char(s)
 328  *   --------------------------------------------------
 329  *        0 to 127 = 0xxxxxxx (US-ASCII)
 330  *     128 to 2047 = 110xxxxx 10yyyyyy
 331  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 332  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 333  *
 334  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 335  * which would convert to five- or six-octet UTF-8 sequences...
 336  */
 337
 338 int                                     /* O - Count or -1 on error */
 339 cupsUTF8ToUTF32(
 340     cups_utf32_t      *dest,            /* O - Target string */
 341     const cups_utf8_t *src,             /* I - Source string */
 342     const int         maxout)           /* I - Max output */
 343 {
 344   int           i;                      /* Looping variable */
 345   cups_utf8_t   ch;                     /* Character value */
 346   cups_utf8_t   next;                   /* Next character value */
 347   cups_utf32_t  ch32;                   /* UTF-32 character value */
 348
 349
 350  /*
 351   * Check for valid arguments and clear output...
 352   */
 353
 354   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
 355                 src, maxout));
 356
 357   if (dest)
 358     *dest = 0;
 359
 360   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
 361   {
 362     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
 363
 364     return (-1);
 365   }
 366
 367  /*
 368   * Convert input UTF-8 to output UTF-32...
 369   */
 370
 371   for (i = maxout - 1; *src && i > 0; i --)
 372   {
 373     ch = *src++;
 374
 375    /*
 376     * Convert UTF-8 character(s) to UTF-32 character...
 377     */
 378
 379     if (!(ch & 0x80))
 380     {
 381      /*
 382       * One-octet UTF-8 <= 127 (US-ASCII)...
 383       */
 384
 385       *dest++ = ch;
 386
 387       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
 388       continue;
 389     }
 390     else if ((ch & 0xe0) == 0xc0)
 391     {
 392      /*
 393       * Two-octet UTF-8 <= 2047 (Latin-x)...
 394       */
 395
 396       next = *src++;
 397       if ((next & 0xc0) != 0x80)
 398       {
 399         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 400
 401         return (-1);
 402       }
 403
 404       ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
 405
 406      /*
 407       * Check for non-shortest form (invalid UTF-8)...
 408       */
 409
 410       if (ch32 < 0x80)
 411       {
 412         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 413
 414         return (-1);
 415       }
 416
 417       *dest++ = ch32;
 418
 419       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
 420                     src[-2], src[-1], (unsigned)ch32));
 421     }
 422     else if ((ch & 0xf0) == 0xe0)
 423     {
 424      /*
 425       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 426       */
 427
 428       next = *src++;
 429       if ((next & 0xc0) != 0x80)
 430       {
 431         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 432
 433         return (-1);
 434       }
 435
 436       ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
 437
 438       next = *src++;
 439       if ((next & 0xc0) != 0x80)
 440       {
 441         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 442
 443         return (-1);
 444       }
 445
 446       ch32 = (ch32 << 6) | (next & 0x3f);
 447
 448      /*
 449       * Check for non-shortest form (invalid UTF-8)...
 450       */
 451
 452       if (ch32 < 0x800)
 453       {
 454         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 455
 456         return (-1);
 457       }
 458
 459       *dest++ = ch32;
 460
 461       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
 462                     src[-3], src[-2], src[-1], (unsigned)ch32));
 463     }
 464     else if ((ch & 0xf8) == 0xf0)
 465     {
 466      /*
 467       * Four-octet UTF-8...
 468       */
 469
 470       next = *src++;
 471       if ((next & 0xc0) != 0x80)
 472       {
 473         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 474
 475         return (-1);
 476       }
 477
 478       ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
 479
 480       next = *src++;
 481       if ((next & 0xc0) != 0x80)
 482       {
 483         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 484
 485         return (-1);
 486       }
 487
 488       ch32 = (ch32 << 6) | (next & 0x3f);
 489
 490       next = *src++;
 491       if ((next & 0xc0) != 0x80)
 492       {
 493         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 494
 495         return (-1);
 496       }
 497
 498       ch32 = (ch32 << 6) | (next & 0x3f);
 499
 500      /*
 501       * Check for non-shortest form (invalid UTF-8)...
 502       */
 503
 504       if (ch32 < 0x10000)
 505       {
 506         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 507
 508         return (-1);
 509       }
 510
 511       *dest++ = ch32;
 512
 513       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
 514                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
 515     }
 516     else
 517     {
 518      /*
 519       * More than 4-octet (invalid UTF-8 sequence)...
 520       */
 521
 522       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 523
 524       return (-1);
 525     }
 526
 527    /*
 528     * Check for UTF-16 surrogate (illegal UTF-8)...
 529     */
 530
 531     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
 532       return (-1);
 533   }
 534
 535   *dest = 0;
 536
 537   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
 538
 539   return (maxout - 1 - i);
 540 }
 541
 542
 543 /*
 544  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
 545  *
 546  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 547  *
 548  *   UTF-32 char     UTF-8 char(s)
 549  *   --------------------------------------------------
 550  *        0 to 127 = 0xxxxxxx (US-ASCII)
 551  *     128 to 2047 = 110xxxxx 10yyyyyy
 552  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 553  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 554  *
 555  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 556  * which would convert to five- or six-octet UTF-8 sequences...
 557  */
 558
 559 int                                     /* O - Count or -1 on error */
 560 cupsUTF32ToUTF8(
 561     cups_utf8_t        *dest,           /* O - Target string */
 562     const cups_utf32_t *src,            /* I - Source string */
 563     const int          maxout)          /* I - Max output */
 564 {
 565   cups_utf8_t   *start;                 /* Start of destination string */
 566   int           i;                      /* Looping variable */
 567   int           swap;                   /* Byte-swap input to output */
 568   cups_utf32_t  ch;                     /* Character value */
 569
 570
 571  /*
 572   * Check for valid arguments and clear output...
 573   */
 574
 575   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
 576                 maxout));
 577
 578   if (dest)
 579     *dest = '\0';
 580
 581   if (!dest || !src || maxout < 1)
 582   {
 583     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
 584
 585     return (-1);
 586   }
 587
 588  /*
 589   * Check for leading BOM in UTF-32 and inverted BOM...
 590   */
 591
 592   start = dest;
 593   swap  = *src == 0xfffe0000;
 594
 595   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
 596
 597   if (*src == 0xfffe0000 || *src == 0xfeff)
 598     src ++;
 599
 600  /*
 601   * Convert input UTF-32 to output UTF-8...
 602   */
 603
 604   for (i = maxout - 1; *src && i > 0;)
 605   {
 606     ch = *src++;
 607
 608    /*
 609     * Byte swap input UTF-32, if necessary...
 610     * (only byte-swapping 24 of 32 bits)
 611     */
 612
 613     if (swap)
 614       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
 615
 616    /*
 617     * Check for beyond Plane 16 (invalid UTF-32)...
 618     */
 619
 620     if (ch > 0x10ffff)
 621     {
 622       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
 623
 624       return (-1);
 625     }
 626
 627    /*
 628     * Convert UTF-32 character to UTF-8 character(s)...
 629     */
 630
 631     if (ch < 0x80)
 632     {
 633      /*
 634       * One-octet UTF-8 <= 127 (US-ASCII)...
 635       */
 636
 637       *dest++ = (cups_utf8_t)ch;
 638       i --;
 639
 640       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
 641     }
 642     else if (ch < 0x800)
 643     {
 644      /*
 645       * Two-octet UTF-8 <= 2047 (Latin-x)...
 646       */
 647
 648       if (i < 2)
 649       {
 650         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
 651
 652         return (-1);
 653       }
 654
 655       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
 656       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 657       i -= 2;
 658
 659       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
 660                     dest[-2], dest[-1]));
 661     }
 662     else if (ch < 0x10000)
 663     {
 664      /*
 665       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 666       */
 667
 668       if (i < 3)
 669       {
 670         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
 671
 672         return (-1);
 673       }
 674
 675       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
 676       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 677       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 678       i -= 3;
 679
 680       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
 681                     dest[-3], dest[-2], dest[-1]));
 682     }
 683     else
 684     {
 685      /*
 686       * Four-octet UTF-8...
 687       */
 688
 689       if (i < 4)
 690       {
 691         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
 692
 693         return (-1);
 694       }
 695
 696       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
 697       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
 698       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 699       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 700       i -= 4;
 701
 702       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
 703                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
 704     }
 705   }
 706
 707   *dest = '\0';
 708
 709   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
 710
 711   return ((int)(dest - start));
 712 }
 713
 714
 715 /*
 716  * End of "$Id: transcode.c 9306 2010-09-16 21:43:57Z mike $"
 717  */