cups/transcode.c

   1 /*
   2  * Transcoding support for CUPS.
   3  *
   4  * Copyright 2007-2014 by Apple Inc.
   5  * Copyright 1997-2007 by Easy Software Products.
   6  *
   7  * Licensed under Apache License v2.0.  See the file "LICENSE" for more information.
   8  */
   9
  10 /*
  11  * Include necessary headers...
  12  */
  13
  14 #include "cups-private.h"
  15 #include "debug-internal.h"
  16 #include <limits.h>
  17 #include <time.h>
  18 #ifdef HAVE_ICONV_H
  19 #  include <iconv.h>
  20 #endif /* HAVE_ICONV_H */
  21
  22
  23 /*
  24  * Local globals...
  25  */
  26
  27 #ifdef HAVE_ICONV_H
  28 static _cups_mutex_t    map_mutex = _CUPS_MUTEX_INITIALIZER;
  29                                         /* Mutex to control access to maps */
  30 static iconv_t          map_from_utf8 = (iconv_t)-1;
  31                                         /* Convert from UTF-8 to charset */
  32 static iconv_t          map_to_utf8 = (iconv_t)-1;
  33                                         /* Convert from charset to UTF-8 */
  34 static cups_encoding_t  map_encoding = CUPS_AUTO_ENCODING;
  35                                         /* Which charset is cached */
  36 #endif /* HAVE_ICONV_H */
  37
  38
  39 /*
  40  * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
  41  */
  42
  43 void
  44 _cupsCharmapFlush(void)
  45 {
  46 #ifdef HAVE_ICONV_H
  47   if (map_from_utf8 != (iconv_t)-1)
  48   {
  49     iconv_close(map_from_utf8);
  50     map_from_utf8 = (iconv_t)-1;
  51   }
  52
  53   if (map_to_utf8 != (iconv_t)-1)
  54   {
  55     iconv_close(map_to_utf8);
  56     map_to_utf8 = (iconv_t)-1;
  57   }
  58
  59   map_encoding = CUPS_AUTO_ENCODING;
  60 #endif /* HAVE_ICONV_H */
  61 }
  62
  63
  64 /*
  65  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
  66  */
  67
  68 int                                     /* O - Count or -1 on error */
  69 cupsCharsetToUTF8(
  70     cups_utf8_t           *dest,        /* O - Target string */
  71     const char            *src,         /* I - Source string */
  72     const int             maxout,       /* I - Max output */
  73     const cups_encoding_t encoding)     /* I - Encoding */
  74 {
  75   cups_utf8_t   *destptr;               /* Pointer into UTF-8 buffer */
  76 #ifdef HAVE_ICONV_H
  77   size_t        srclen,                 /* Length of source string */
  78                 outBytesLeft;           /* Bytes remaining in output buffer */
  79 #endif /* HAVE_ICONV_H */
  80
  81
  82  /*
  83   * Check for valid arguments...
  84   */
  85
  86   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding));
  87
  88   if (!dest || !src || maxout < 1)
  89   {
  90     if (dest)
  91       *dest = '\0';
  92
  93     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
  94     return (-1);
  95   }
  96
  97  /*
  98   * Handle identity conversions...
  99   */
 100
 101   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
 102       encoding >= CUPS_ENCODING_VBCS_END)
 103   {
 104     strlcpy((char *)dest, src, (size_t)maxout);
 105     return ((int)strlen((char *)dest));
 106   }
 107
 108  /*
 109   * Handle ISO-8859-1 to UTF-8 directly...
 110   */
 111
 112   destptr = dest;
 113
 114   if (encoding == CUPS_ISO8859_1)
 115   {
 116     int         ch;                     /* Character from string */
 117     cups_utf8_t *destend;               /* End of UTF-8 buffer */
 118
 119
 120     destend = dest + maxout - 2;
 121
 122     while (*src && destptr < destend)
 123     {
 124       ch = *src++ & 255;
 125
 126       if (ch & 128)
 127       {
 128         *destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6));
 129         *destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 130       }
 131       else
 132         *destptr++ = (cups_utf8_t)ch;
 133     }
 134
 135     *destptr = '\0';
 136
 137     return ((int)(destptr - dest));
 138   }
 139
 140  /*
 141   * Convert input legacy charset to UTF-8...
 142   */
 143
 144 #ifdef HAVE_ICONV_H
 145   _cupsMutexLock(&map_mutex);
 146
 147   if (map_encoding != encoding)
 148   {
 149     char        toset[1024];            /* Destination character set */
 150
 151     _cupsCharmapFlush();
 152
 153     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
 154
 155     map_encoding  = encoding;
 156     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 157     map_to_utf8   = iconv_open("UTF-8", toset);
 158   }
 159
 160   if (map_to_utf8 != (iconv_t)-1)
 161   {
 162     char *altdestptr = (char *)dest;    /* Silence bogus GCC type-punned */
 163
 164     srclen       = strlen(src);
 165     outBytesLeft = (size_t)maxout - 1;
 166
 167     iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
 168     *altdestptr = '\0';
 169
 170     _cupsMutexUnlock(&map_mutex);
 171
 172     return ((int)(altdestptr - (char *)dest));
 173   }
 174
 175   _cupsMutexUnlock(&map_mutex);
 176 #endif /* HAVE_ICONV_H */
 177
 178  /*
 179   * No iconv() support, so error out...
 180   */
 181
 182   *destptr = '\0';
 183
 184   return (-1);
 185 }
 186
 187
 188 /*
 189  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
 190  */
 191
 192 int                                     /* O - Count or -1 on error */
 193 cupsUTF8ToCharset(
 194     char                  *dest,        /* O - Target string */
 195     const cups_utf8_t     *src,         /* I - Source string */
 196     const int             maxout,       /* I - Max output */
 197     const cups_encoding_t encoding)     /* I - Encoding */
 198 {
 199   char          *destptr;               /* Pointer into destination */
 200 #ifdef HAVE_ICONV_H
 201   size_t        srclen,                 /* Length of source string */
 202                 outBytesLeft;           /* Bytes remaining in output buffer */
 203 #endif /* HAVE_ICONV_H */
 204
 205
 206  /*
 207   * Check for valid arguments...
 208   */
 209
 210   if (!dest || !src || maxout < 1)
 211   {
 212     if (dest)
 213       *dest = '\0';
 214
 215     return (-1);
 216   }
 217
 218  /*
 219   * Handle identity conversions...
 220   */
 221
 222   if (encoding == CUPS_UTF8 ||
 223       encoding >= CUPS_ENCODING_VBCS_END)
 224   {
 225     strlcpy(dest, (char *)src, (size_t)maxout);
 226     return ((int)strlen(dest));
 227   }
 228
 229  /*
 230   * Handle UTF-8 to ISO-8859-1 directly...
 231   */
 232
 233   destptr = dest;
 234
 235   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
 236   {
 237     int         ch,                     /* Character from string */
 238                 maxch;                  /* Maximum character for charset */
 239     char        *destend;               /* End of ISO-8859-1 buffer */
 240
 241     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
 242     destend = dest + maxout - 1;
 243
 244     while (*src && destptr < destend)
 245     {
 246       ch = *src++;
 247
 248       if ((ch & 0xe0) == 0xc0)
 249       {
 250         ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
 251
 252         if (ch < maxch)
 253           *destptr++ = (char)ch;
 254         else
 255           *destptr++ = '?';
 256       }
 257       else if ((ch & 0xf0) == 0xe0 ||
 258                (ch & 0xf8) == 0xf0)
 259         *destptr++ = '?';
 260       else if (!(ch & 0x80))
 261         *destptr++ = (char)ch;
 262     }
 263
 264     *destptr = '\0';
 265
 266     return ((int)(destptr - dest));
 267   }
 268
 269 #ifdef HAVE_ICONV_H
 270  /*
 271   * Convert input UTF-8 to legacy charset...
 272   */
 273
 274   _cupsMutexLock(&map_mutex);
 275
 276   if (map_encoding != encoding)
 277   {
 278     char        toset[1024];            /* Destination character set */
 279
 280     _cupsCharmapFlush();
 281
 282     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
 283
 284     map_encoding  = encoding;
 285     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
 286     map_to_utf8   = iconv_open("UTF-8", toset);
 287   }
 288
 289   if (map_from_utf8 != (iconv_t)-1)
 290   {
 291     char *altsrc = (char *)src;         /* Silence bogus GCC type-punned */
 292
 293     srclen       = strlen((char *)src);
 294     outBytesLeft = (size_t)maxout - 1;
 295
 296     iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
 297     *destptr = '\0';
 298
 299     _cupsMutexUnlock(&map_mutex);
 300
 301     return ((int)(destptr - dest));
 302   }
 303
 304   _cupsMutexUnlock(&map_mutex);
 305 #endif /* HAVE_ICONV_H */
 306
 307  /*
 308   * No iconv() support, so error out...
 309   */
 310
 311   *destptr = '\0';
 312
 313   return (-1);
 314 }
 315
 316
 317 /*
 318  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
 319  *
 320  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 321  *
 322  *   UTF-32 char     UTF-8 char(s)
 323  *   --------------------------------------------------
 324  *        0 to 127 = 0xxxxxxx (US-ASCII)
 325  *     128 to 2047 = 110xxxxx 10yyyyyy
 326  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 327  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 328  *
 329  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 330  * which would convert to five- or six-octet UTF-8 sequences...
 331  */
 332
 333 int                                     /* O - Count or -1 on error */
 334 cupsUTF8ToUTF32(
 335     cups_utf32_t      *dest,            /* O - Target string */
 336     const cups_utf8_t *src,             /* I - Source string */
 337     const int         maxout)           /* I - Max output */
 338 {
 339   int           i;                      /* Looping variable */
 340   cups_utf8_t   ch;                     /* Character value */
 341   cups_utf8_t   next;                   /* Next character value */
 342   cups_utf32_t  ch32;                   /* UTF-32 character value */
 343
 344
 345  /*
 346   * Check for valid arguments and clear output...
 347   */
 348
 349   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout));
 350
 351   if (dest)
 352     *dest = 0;
 353
 354   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
 355   {
 356     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
 357
 358     return (-1);
 359   }
 360
 361  /*
 362   * Convert input UTF-8 to output UTF-32...
 363   */
 364
 365   for (i = maxout - 1; *src && i > 0; i --)
 366   {
 367     ch = *src++;
 368
 369    /*
 370     * Convert UTF-8 character(s) to UTF-32 character...
 371     */
 372
 373     if (!(ch & 0x80))
 374     {
 375      /*
 376       * One-octet UTF-8 <= 127 (US-ASCII)...
 377       */
 378
 379       *dest++ = ch;
 380
 381       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
 382       continue;
 383     }
 384     else if ((ch & 0xe0) == 0xc0)
 385     {
 386      /*
 387       * Two-octet UTF-8 <= 2047 (Latin-x)...
 388       */
 389
 390       next = *src++;
 391       if ((next & 0xc0) != 0x80)
 392       {
 393         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 394
 395         return (-1);
 396       }
 397
 398       ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
 399
 400      /*
 401       * Check for non-shortest form (invalid UTF-8)...
 402       */
 403
 404       if (ch32 < 0x80)
 405       {
 406         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 407
 408         return (-1);
 409       }
 410
 411       *dest++ = ch32;
 412
 413       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
 414                     src[-2], src[-1], (unsigned)ch32));
 415     }
 416     else if ((ch & 0xf0) == 0xe0)
 417     {
 418      /*
 419       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 420       */
 421
 422       next = *src++;
 423       if ((next & 0xc0) != 0x80)
 424       {
 425         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 426
 427         return (-1);
 428       }
 429
 430       ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
 431
 432       next = *src++;
 433       if ((next & 0xc0) != 0x80)
 434       {
 435         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 436
 437         return (-1);
 438       }
 439
 440       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 441
 442      /*
 443       * Check for non-shortest form (invalid UTF-8)...
 444       */
 445
 446       if (ch32 < 0x800)
 447       {
 448         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 449
 450         return (-1);
 451       }
 452
 453       *dest++ = ch32;
 454
 455       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
 456                     src[-3], src[-2], src[-1], (unsigned)ch32));
 457     }
 458     else if ((ch & 0xf8) == 0xf0)
 459     {
 460      /*
 461       * Four-octet UTF-8...
 462       */
 463
 464       next = *src++;
 465       if ((next & 0xc0) != 0x80)
 466       {
 467         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 468
 469         return (-1);
 470       }
 471
 472       ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
 473
 474       next = *src++;
 475       if ((next & 0xc0) != 0x80)
 476       {
 477         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 478
 479         return (-1);
 480       }
 481
 482       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 483
 484       next = *src++;
 485       if ((next & 0xc0) != 0x80)
 486       {
 487         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 488
 489         return (-1);
 490       }
 491
 492       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
 493
 494      /*
 495       * Check for non-shortest form (invalid UTF-8)...
 496       */
 497
 498       if (ch32 < 0x10000)
 499       {
 500         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 501
 502         return (-1);
 503       }
 504
 505       *dest++ = ch32;
 506
 507       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
 508                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
 509     }
 510     else
 511     {
 512      /*
 513       * More than 4-octet (invalid UTF-8 sequence)...
 514       */
 515
 516       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
 517
 518       return (-1);
 519     }
 520
 521    /*
 522     * Check for UTF-16 surrogate (illegal UTF-8)...
 523     */
 524
 525     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
 526       return (-1);
 527   }
 528
 529   *dest = 0;
 530
 531   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
 532
 533   return (maxout - 1 - i);
 534 }
 535
 536
 537 /*
 538  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
 539  *
 540  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
 541  *
 542  *   UTF-32 char     UTF-8 char(s)
 543  *   --------------------------------------------------
 544  *        0 to 127 = 0xxxxxxx (US-ASCII)
 545  *     128 to 2047 = 110xxxxx 10yyyyyy
 546  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
 547  *         > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
 548  *
 549  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
 550  * which would convert to five- or six-octet UTF-8 sequences...
 551  */
 552
 553 int                                     /* O - Count or -1 on error */
 554 cupsUTF32ToUTF8(
 555     cups_utf8_t        *dest,           /* O - Target string */
 556     const cups_utf32_t *src,            /* I - Source string */
 557     const int          maxout)          /* I - Max output */
 558 {
 559   cups_utf8_t   *start;                 /* Start of destination string */
 560   int           i;                      /* Looping variable */
 561   int           swap;                   /* Byte-swap input to output */
 562   cups_utf32_t  ch;                     /* Character value */
 563
 564
 565  /*
 566   * Check for valid arguments and clear output...
 567   */
 568
 569   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout));
 570
 571   if (dest)
 572     *dest = '\0';
 573
 574   if (!dest || !src || maxout < 1)
 575   {
 576     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
 577
 578     return (-1);
 579   }
 580
 581  /*
 582   * Check for leading BOM in UTF-32 and inverted BOM...
 583   */
 584
 585   start = dest;
 586   swap  = *src == 0xfffe0000;
 587
 588   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
 589
 590   if (*src == 0xfffe0000 || *src == 0xfeff)
 591     src ++;
 592
 593  /*
 594   * Convert input UTF-32 to output UTF-8...
 595   */
 596
 597   for (i = maxout - 1; *src && i > 0;)
 598   {
 599     ch = *src++;
 600
 601    /*
 602     * Byte swap input UTF-32, if necessary...
 603     * (only byte-swapping 24 of 32 bits)
 604     */
 605
 606     if (swap)
 607       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
 608
 609    /*
 610     * Check for beyond Plane 16 (invalid UTF-32)...
 611     */
 612
 613     if (ch > 0x10ffff)
 614     {
 615       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
 616
 617       return (-1);
 618     }
 619
 620    /*
 621     * Convert UTF-32 character to UTF-8 character(s)...
 622     */
 623
 624     if (ch < 0x80)
 625     {
 626      /*
 627       * One-octet UTF-8 <= 127 (US-ASCII)...
 628       */
 629
 630       *dest++ = (cups_utf8_t)ch;
 631       i --;
 632
 633       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
 634     }
 635     else if (ch < 0x800)
 636     {
 637      /*
 638       * Two-octet UTF-8 <= 2047 (Latin-x)...
 639       */
 640
 641       if (i < 2)
 642       {
 643         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
 644
 645         return (-1);
 646       }
 647
 648       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
 649       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 650       i -= 2;
 651
 652       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
 653                     dest[-2], dest[-1]));
 654     }
 655     else if (ch < 0x10000)
 656     {
 657      /*
 658       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
 659       */
 660
 661       if (i < 3)
 662       {
 663         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
 664
 665         return (-1);
 666       }
 667
 668       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
 669       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 670       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 671       i -= 3;
 672
 673       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
 674                     dest[-3], dest[-2], dest[-1]));
 675     }
 676     else
 677     {
 678      /*
 679       * Four-octet UTF-8...
 680       */
 681
 682       if (i < 4)
 683       {
 684         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
 685
 686         return (-1);
 687       }
 688
 689       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
 690       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
 691       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
 692       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
 693       i -= 4;
 694
 695       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
 696                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
 697     }
 698   }
 699
 700   *dest = '\0';
 701
 702   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
 703
 704   return ((int)(dest - start));
 705 }