cups/transcode.c

   1 /*
   2  * Transcoding support for CUPS.
   3  *
   4  * Copyright 2007-2014 by Apple Inc.
   5  * Copyright 1997-2007 by Easy Software Products.
   6  *
   7  * Licensed under Apache License v2.0.  See the file "LICENSE" for more information.
   8  */
   9
  10 /*
  11  * Include necessary headers...
  12  */
  13
  14 #include "cups-private.h"
  15 #include <limits.h>
  16 #include <time.h>
  17 #ifdef HAVE_ICONV_H
  18 #  include <iconv.h>
  19 #endif /* HAVE_ICONV_H */
  20
  21
  22 /*
  23  * Local globals...
  24  */
  25
  26 #ifdef HAVE_ICONV_H
  27 static _cups_mutex_t    map_mutex = _CUPS_MUTEX_INITIALIZER;
  28                                         /* Mutex to control access to maps */
  29 static iconv_t          map_from_utf8 = (iconv_t)-1;
  30                                         /* Convert from UTF-8 to charset */
  31 static iconv_t          map_to_utf8 = (iconv_t)-1;
  32                                         /* Convert from charset to UTF-8 */
  33 static cups_encoding_t  map_encoding = CUPS_AUTO_ENCODING;
  34                                         /* Which charset is cached */
  35 #endif /* HAVE_ICONV_H */
  36
  37
  38 /*
  39  * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
  40  */
  41
  42 void
  43 _cupsCharmapFlush(void)
  44 {
  45 #ifdef HAVE_ICONV_H
  46   if (map_from_utf8 != (iconv_t)-1)
  47   {
  48     iconv_close(map_from_utf8);
  49     map_from_utf8 = (iconv_t)-1;
  50   }
  51
  52   if (map_to_utf8 != (iconv_t)-1)
  53   {
  54     iconv_close(map_to_utf8);
  55     map_to_utf8 = (iconv_t)-1;
  56   }
  57
  58   map_encoding = CUPS_AUTO_ENCODING;
  59 #endif /* HAVE_ICONV_H */
  60 }
  61
  62
  63 /*
  64  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
  65  */
  66
  67 int                                     /* O - Count or -1 on error */
  68 cupsCharsetToUTF8(
  69     cups_utf8_t           *dest,        /* O - Target string */
  70     const char            *src,         /* I - Source string */
  71     const int             maxout,       /* I - Max output */
  72     const cups_encoding_t encoding)     /* I - Encoding */
  73 {
  74   cups_utf8_t   *destptr;               /* Pointer into UTF-8 buffer */
  75 #ifdef HAVE_ICONV_H
  76   size_t        srclen,                 /* Length of source string */
  77                 outBytesLeft;           /* Bytes remaining in output buffer */
  78 #endif /* HAVE_ICONV_H */
  79
  80
  81  /*
  82   * Check for valid arguments...
  83   */
  84
  85   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding));
  86
  87   if (!dest || !src || maxout < 1)
  88   {
  89     if (dest)
  90       *dest = '\0';
  91
  92     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
  93     return (-1);
  94   }
  95
  96  /*
  97   * Handle identity conversions...
  98   */
  99
 100   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
 101       encoding >= CUPS_ENCODING_VBCS_END)
 102   {
 103     strlcpy((char *)dest, src, (size_t)maxout);
 104     return ((int)strlen((char *)dest));
 105   }
 106
 107  /*
 108   * Handle ISO-8859-1 to UTF-8 directly...
 109   */
 110
 111   destptr = dest;
 112
 113   if (encoding == CUPS_ISO8859_1)
 114   {
 115     int         ch;                     /* Character from string */
 116     cups_utf8_t *destend;               /* End of UTF-8 buffer */
 117
 118
 119     destend = dest + maxout - 2;
 120
 121     while (*src && destptr < destend)
 122     {
 123       ch = *src++ & 255;
 124
 125       if (ch & 128)
 126       {
 127         *destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6));
 128         *destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 129       }
 130       else
 131         *destptr++ = (cups_utf8_t)ch;
 132     }
 133
 134     *destptr = '\0';
 135
 136     return ((int)(destptr - dest));
 137   }
 138
 139  /*
 140   * Convert input legacy charset to UTF-8...
 141   */
 142
 143 #ifdef HAVE_ICONV_H
 144   _cupsMutexLock(&map_mutex);
 145
 146   if (map_encoding != encoding)
 147   {
 148     char        toset[1024];            /* Destination character set */
 149
 150     _cupsCharmapFlush();
 151
 152     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
 153
 154     map_encoding  = encoding;
 155     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 156     map_to_utf8   = iconv_open("UTF-8", toset);
 157   }
 158
 159   if (map_to_utf8 != (iconv_t)-1)
 160   {
 161     char *altdestptr = (char *)dest;    /* Silence bogus GCC type-punned */
 162
 163     srclen       = strlen(src);
 164     outBytesLeft = (size_t)maxout - 1;
 165
 166     iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
 167     *altdestptr = '\0';
 168
 169     _cupsMutexUnlock(&map_mutex);
 170
 171     return ((int)(altdestptr - (char *)dest));
 172   }
 173
 174   _cupsMutexUnlock(&map_mutex);
 175 #endif /* HAVE_ICONV_H */
 176
 177  /*
 178   * No iconv() support, so error out...
 179   */
 180
 181   *destptr = '\0';
 182
 183   return (-1);
 184 }
 185
 186
 187 /*
 188  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
 189  */
 190
 191 int                                     /* O - Count or -1 on error */
 192 cupsUTF8ToCharset(
 193     char                  *dest,        /* O - Target string */
 194     const cups_utf8_t     *src,         /* I - Source string */
 195     const int             maxout,       /* I - Max output */
 196     const cups_encoding_t encoding)     /* I - Encoding */
 197 {
 198   char          *destptr;               /* Pointer into destination */
 199 #ifdef HAVE_ICONV_H
 200   size_t        srclen,                 /* Length of source string */
 201                 outBytesLeft;           /* Bytes remaining in output buffer */
 202 #endif /* HAVE_ICONV_H */
 203
 204
 205  /*
 206   * Check for valid arguments...
 207   */
 208
 209   if (!dest || !src || maxout < 1)
 210   {
 211     if (dest)
 212       *dest = '\0';
 213
 214     return (-1);
 215   }
 216
 217  /*
 218   * Handle identity conversions...
 219   */
 220
 221   if (encoding == CUPS_UTF8 ||
 222       encoding >= CUPS_ENCODING_VBCS_END)
 223   {
 224     strlcpy(dest, (char *)src, (size_t)maxout);
 225     return ((int)strlen(dest));
 226   }
 227
 228  /*
 229   * Handle UTF-8 to ISO-8859-1 directly...
 230   */
 231
 232   destptr = dest;
 233
 234   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
 235   {
 236     int         ch,                     /* Character from string */
 237                 maxch;                  /* Maximum character for charset */
 238     char        *destend;               /* End of ISO-8859-1 buffer */
 239
 240     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
 241     destend = dest + maxout - 1;
 242
 243     while (*src && destptr < destend)
 244     {
 245       ch = *src++;
 246
 247       if ((ch & 0xe0) == 0xc0)
 248       {
 249         ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
 250
 251         if (ch < maxch)
 252           *destptr++ = (char)ch;
 253         else
 254           *destptr++ = '?';
 255       }
 256       else if ((ch & 0xf0) == 0xe0 ||
 257                (ch & 0xf8) == 0xf0)
 258         *destptr++ = '?';
 259       else if (!(ch & 0x80))
 260         *destptr++ = (char)ch;
 261     }
 262
 263     *destptr = '\0';
 264
 265     return ((int)(destptr - dest));
 266   }
 267
 268 #ifdef HAVE_ICONV_H
 269  /*
 270   * Convert input UTF-8 to legacy charset...
 271   */
 272
 273   _cupsMutexLock(&map_mutex);
 274
 275   if (map_encoding != encoding)
 276   {
 277     char        toset[1024];            /* Destination character set */
 278
 279     _cupsCharmapFlush();
 280
 281     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
 282
 283     map_encoding  = encoding;
 284     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 285     map_to_utf8   = iconv_open("UTF-8", toset);
 286   }
 287
 288   if (map_from_utf8 != (iconv_t)-1)
 289   {
 290     char *altsrc = (char *)src;         /* Silence bogus GCC type-punned */
 291
 292     srclen       = strlen((char *)src);
 293     outBytesLeft = (size_t)maxout - 1;
 294
 295     iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
 296     *destptr = '\0';
 297
 298     _cupsMutexUnlock(&map_mutex);
 299
 300     return ((int)(destptr - dest));
 301   }
 302
 303   _cupsMutexUnlock(&map_mutex);
 304 #endif /* HAVE_ICONV_H */
 305
 306  /*
 307   * No iconv() support, so error out...
 308   */
 309
 310   *destptr = '\0';
 311
 312   return (-1);
 313 }
 314
 315
 316 /*
 317  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
 318  *
 319  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 320  *
 321  *   UTF-32 char     UTF-8 char(s)
 322  *   --------------------------------------------------
 323  *        0 to 127 = 0xxxxxxx (US-ASCII)
 324  *     128 to 2047 = 110xxxxx 10yyyyyy
 325  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 326  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 327  *
 328  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 329  * which would convert to five- or six-octet UTF-8 sequences...
 330  */
 331
 332 int                                     /* O - Count or -1 on error */
 333 cupsUTF8ToUTF32(
 334     cups_utf32_t      *dest,            /* O - Target string */
 335     const cups_utf8_t *src,             /* I - Source string */
 336     const int         maxout)           /* I - Max output */
 337 {
 338   int           i;                      /* Looping variable */
 339   cups_utf8_t   ch;                     /* Character value */
 340   cups_utf8_t   next;                   /* Next character value */
 341   cups_utf32_t  ch32;                   /* UTF-32 character value */
 342
 343
 344  /*
 345   * Check for valid arguments and clear output...
 346   */
 347
 348   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout));
 349
 350   if (dest)
 351     *dest = 0;
 352
 353   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
 354   {
 355     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
 356
 357     return (-1);
 358   }
 359
 360  /*
 361   * Convert input UTF-8 to output UTF-32...
 362   */
 363
 364   for (i = maxout - 1; *src && i > 0; i --)
 365   {
 366     ch = *src++;
 367
 368    /*
 369     * Convert UTF-8 character(s) to UTF-32 character...
 370     */
 371
 372     if (!(ch & 0x80))
 373     {
 374      /*
 375       * One-octet UTF-8 <= 127 (US-ASCII)...
 376       */
 377
 378       *dest++ = ch;
 379
 380       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
 381       continue;
 382     }
 383     else if ((ch & 0xe0) == 0xc0)
 384     {
 385      /*
 386       * Two-octet UTF-8 <= 2047 (Latin-x)...
 387       */
 388
 389       next = *src++;
 390       if ((next & 0xc0) != 0x80)
 391       {
 392         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 393
 394         return (-1);
 395       }
 396
 397       ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
 398
 399      /*
 400       * Check for non-shortest form (invalid UTF-8)...
 401       */
 402
 403       if (ch32 < 0x80)
 404       {
 405         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 406
 407         return (-1);
 408       }
 409
 410       *dest++ = ch32;
 411
 412       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
 413                     src[-2], src[-1], (unsigned)ch32));
 414     }
 415     else if ((ch & 0xf0) == 0xe0)
 416     {
 417      /*
 418       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 419       */
 420
 421       next = *src++;
 422       if ((next & 0xc0) != 0x80)
 423       {
 424         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 425
 426         return (-1);
 427       }
 428
 429       ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
 430
 431       next = *src++;
 432       if ((next & 0xc0) != 0x80)
 433       {
 434         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 435
 436         return (-1);
 437       }
 438
 439       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 440
 441      /*
 442       * Check for non-shortest form (invalid UTF-8)...
 443       */
 444
 445       if (ch32 < 0x800)
 446       {
 447         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 448
 449         return (-1);
 450       }
 451
 452       *dest++ = ch32;
 453
 454       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
 455                     src[-3], src[-2], src[-1], (unsigned)ch32));
 456     }
 457     else if ((ch & 0xf8) == 0xf0)
 458     {
 459      /*
 460       * Four-octet UTF-8...
 461       */
 462
 463       next = *src++;
 464       if ((next & 0xc0) != 0x80)
 465       {
 466         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 467
 468         return (-1);
 469       }
 470
 471       ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
 472
 473       next = *src++;
 474       if ((next & 0xc0) != 0x80)
 475       {
 476         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 477
 478         return (-1);
 479       }
 480
 481       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 482
 483       next = *src++;
 484       if ((next & 0xc0) != 0x80)
 485       {
 486         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 487
 488         return (-1);
 489       }
 490
 491       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 492
 493      /*
 494       * Check for non-shortest form (invalid UTF-8)...
 495       */
 496
 497       if (ch32 < 0x10000)
 498       {
 499         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 500
 501         return (-1);
 502       }
 503
 504       *dest++ = ch32;
 505
 506       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
 507                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
 508     }
 509     else
 510     {
 511      /*
 512       * More than 4-octet (invalid UTF-8 sequence)...
 513       */
 514
 515       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 516
 517       return (-1);
 518     }
 519
 520    /*
 521     * Check for UTF-16 surrogate (illegal UTF-8)...
 522     */
 523
 524     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
 525       return (-1);
 526   }
 527
 528   *dest = 0;
 529
 530   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
 531
 532   return (maxout - 1 - i);
 533 }
 534
 535
 536 /*
 537  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
 538  *
 539  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 540  *
 541  *   UTF-32 char     UTF-8 char(s)
 542  *   --------------------------------------------------
 543  *        0 to 127 = 0xxxxxxx (US-ASCII)
 544  *     128 to 2047 = 110xxxxx 10yyyyyy
 545  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 546  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 547  *
 548  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 549  * which would convert to five- or six-octet UTF-8 sequences...
 550  */
 551
 552 int                                     /* O - Count or -1 on error */
 553 cupsUTF32ToUTF8(
 554     cups_utf8_t        *dest,           /* O - Target string */
 555     const cups_utf32_t *src,            /* I - Source string */
 556     const int          maxout)          /* I - Max output */
 557 {
 558   cups_utf8_t   *start;                 /* Start of destination string */
 559   int           i;                      /* Looping variable */
 560   int           swap;                   /* Byte-swap input to output */
 561   cups_utf32_t  ch;                     /* Character value */
 562
 563
 564  /*
 565   * Check for valid arguments and clear output...
 566   */
 567
 568   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout));
 569
 570   if (dest)
 571     *dest = '\0';
 572
 573   if (!dest || !src || maxout < 1)
 574   {
 575     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
 576
 577     return (-1);
 578   }
 579
 580  /*
 581   * Check for leading BOM in UTF-32 and inverted BOM...
 582   */
 583
 584   start = dest;
 585   swap  = *src == 0xfffe0000;
 586
 587   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
 588
 589   if (*src == 0xfffe0000 || *src == 0xfeff)
 590     src ++;
 591
 592  /*
 593   * Convert input UTF-32 to output UTF-8...
 594   */
 595
 596   for (i = maxout - 1; *src && i > 0;)
 597   {
 598     ch = *src++;
 599
 600    /*
 601     * Byte swap input UTF-32, if necessary...
 602     * (only byte-swapping 24 of 32 bits)
 603     */
 604
 605     if (swap)
 606       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
 607
 608    /*
 609     * Check for beyond Plane 16 (invalid UTF-32)...
 610     */
 611
 612     if (ch > 0x10ffff)
 613     {
 614       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
 615
 616       return (-1);
 617     }
 618
 619    /*
 620     * Convert UTF-32 character to UTF-8 character(s)...
 621     */
 622
 623     if (ch < 0x80)
 624     {
 625      /*
 626       * One-octet UTF-8 <= 127 (US-ASCII)...
 627       */
 628
 629       *dest++ = (cups_utf8_t)ch;
 630       i --;
 631
 632       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
 633     }
 634     else if (ch < 0x800)
 635     {
 636      /*
 637       * Two-octet UTF-8 <= 2047 (Latin-x)...
 638       */
 639
 640       if (i < 2)
 641       {
 642         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
 643
 644         return (-1);
 645       }
 646
 647       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
 648       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 649       i -= 2;
 650
 651       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
 652                     dest[-2], dest[-1]));
 653     }
 654     else if (ch < 0x10000)
 655     {
 656      /*
 657       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 658       */
 659
 660       if (i < 3)
 661       {
 662         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
 663
 664         return (-1);
 665       }
 666
 667       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
 668       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 669       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 670       i -= 3;
 671
 672       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
 673                     dest[-3], dest[-2], dest[-1]));
 674     }
 675     else
 676     {
 677      /*
 678       * Four-octet UTF-8...
 679       */
 680
 681       if (i < 4)
 682       {
 683         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
 684
 685         return (-1);
 686       }
 687
 688       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
 689       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
 690       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 691       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 692       i -= 4;
 693
 694       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
 695                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
 696     }
 697   }
 698
 699   *dest = '\0';
 700
 701   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
 702
 703   return ((int)(dest - start));
 704 }