/*
- * "$Id: transcode.c 4903 2006-01-10 20:02:46Z mike $"
+ * "$Id$"
*
- * Transcoding support for the Common UNIX Printing System (CUPS).
+ * Transcoding support for CUPS.
*
- * Copyright 1997-2006 by Easy Software Products.
+ * Copyright 2007-2010 by Apple Inc.
+ * Copyright 1997-2007 by Easy Software Products.
*
- * These coded instructions, statements, and computer programs are
- * the property of Easy Software Products and are protected by Federal
- * copyright law. Distribution and use rights are outlined in the
- * file "LICENSE.txt" which should have been included with this file.
- * If this file is missing or damaged please contact Easy Software
- * Products at:
+ * These coded instructions, statements, and computer programs are the
+ * property of Apple Inc. and are protected by Federal copyright
+ * law. Distribution and use rights are outlined in the file "LICENSE.txt"
+ * which should have been included with this file. If this
+ * file is missing or damaged, see the license at "http://www.cups.org/".
*
- * Attn: CUPS Licensing Information
- * Easy Software Products
- * 44141 Airport View Drive, Suite 204
- * Hollywood, Maryland 20636 USA
- *
- * Voice: (301) 373-9600
- * EMail: cups-info@cups.org
- * WWW: http://www.cups.org
+ * This file is subject to the Apple OS-Developed Software exception.
*
* Contents:
*
- * cupsCharmapGet() - Get a character set map.
- * cupsCharmapFree() - Free a character set map.
- * cupsCharmapFlush() - Flush all character set maps out of cache.
- * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
+ * _cupsCharmapFlush() - Flush all character set maps out of cache.
* cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
- * cupsUTF8ToUTF16() - Convert UTF-8 to UTF-16.
- * cupsUTF16ToUTF8() - Convert UTF-16 to UTF-8.
+ * cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
* cupsUTF8ToUTF32() - Convert UTF-8 to UTF-32.
* cupsUTF32ToUTF8() - Convert UTF-32 to UTF-8.
- * cupsUTF16ToUTF32() - Convert UTF-16 to UTF-32.
- * cupsUTF32ToUTF16() - Convert UTF-32 to UTF-16.
- * get_charmap_count() - Count lines in a charmap file.
- * get_sbcs_charmap() - Get SBCS Charmap.
- * get_vbcs_charmap() - Get DBCS/VBCS Charmap.
- * conv_utf8_to_sbcs() - Convert UTF-8 to legacy SBCS.
- * conv_utf8_to_vbcs() - Convert UTF-8 to legacy DBCS/VBCS.
- * conv_sbcs_to_utf8() - Convert legacy SBCS to UTF-8.
- * conv_vbcs_to_utf8() - Convert legacy DBCS/VBCS to UTF-8.
- * compare_wide() - Compare key for wide (VBCS) match.
*/
/*
* Include necessary headers...
*/
-#include "globals.h"
-#include <stdlib.h>
-#include <errno.h>
+#include "cups-private.h"
+#include <limits.h>
#include <time.h>
+#ifdef HAVE_ICONV_H
+# include <iconv.h>
+#endif /* HAVE_ICONV_H */
/*
- * Prototypes...
+ * Local globals...
*/
-static int get_charmap_count(const char *filename);
-static _cups_cmap_t *get_sbcs_charmap(const cups_encoding_t encoding,
- const char *filename);
-static _cups_vmap_t *get_vbcs_charmap(const cups_encoding_t encoding,
- const char *filename);
-
-static int conv_utf8_to_sbcs(char *dest,
- const cups_utf8_t *src,
- const int maxout,
- const cups_encoding_t encoding);
-static int conv_utf8_to_vbcs(char *dest,
- const cups_utf8_t *src,
- const int maxout,
- const cups_encoding_t encoding);
-
-static int conv_sbcs_to_utf8(cups_utf8_t *dest,
- const char *src,
- const int maxout,
- const cups_encoding_t encoding);
-static int conv_vbcs_to_utf8(cups_utf8_t *dest,
- const char *src,
- const int maxout,
- const cups_encoding_t encoding);
-
-static int compare_wide(const void *k1, const void *k2);
+#ifdef HAVE_ICONV_H
+static _cups_mutex_t map_mutex = _CUPS_MUTEX_INITIALIZER;
+ /* Mutex held by the converters in this file while they use the cached maps below */
+static iconv_t map_from_utf8 = (iconv_t)-1;
+ /* Cached iconv descriptor: UTF-8 to charset; (iconv_t)-1 when not open */
+static iconv_t map_to_utf8 = (iconv_t)-1;
+ /* Cached iconv descriptor: charset to UTF-8; (iconv_t)-1 when not open */
+static cups_encoding_t map_encoding = CUPS_AUTO_ENCODING;
+ /* Charset the cached descriptors were opened for; reset to CUPS_AUTO_ENCODING by _cupsCharmapFlush() */
+#endif /* HAVE_ICONV_H */
+
/*
- * 'cupsCharmapGet()' - Get a character set map.
- *
- * This code handles single-byte (SBCS), double-byte (DBCS), and
- * variable-byte (VBCS) character sets _without_ charset escapes...
- * This code does not handle multiple-byte character sets (MBCS)
- * (such as ISO-2022-JP) with charset switching via escapes...
+ * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
+ *
+ * Closes each cached iconv descriptor that is open, marks it as unopened
+ * with (iconv_t)-1, and resets the cached encoding to CUPS_AUTO_ENCODING.
+ * The body is empty when HAVE_ICONV_H is not defined.
+ *
+ * This function does not lock map_mutex itself; cupsCharsetToUTF8() and
+ * cupsUTF8ToCharset() call it while they hold that mutex.
+ */
+
+void
+_cupsCharmapFlush(void)
+{
+#ifdef HAVE_ICONV_H
+ if (map_from_utf8 != (iconv_t)-1)
+ {
+ iconv_close(map_from_utf8);
+ map_from_utf8 = (iconv_t)-1; /* Mark as not open so it is not closed twice */
+ }
+
+ if (map_to_utf8 != (iconv_t)-1)
+ {
+ iconv_close(map_to_utf8);
+ map_to_utf8 = (iconv_t)-1; /* Mark as not open so it is not closed twice */
+ }
+
+ map_encoding = CUPS_AUTO_ENCODING; /* No charset is cached any more */
+#endif /* HAVE_ICONV_H */
+}
+
+
+/*
+ * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
+ *
+ * At most "maxout" bytes, including the terminating nul, are stored in
+ * "dest".  CUPS_UTF8, encodings <= CUPS_US_ASCII, and encodings >=
+ * CUPS_ENCODING_VBCS_END are copied unchanged; CUPS_ISO8859_1 is
+ * converted inline; every other encoding is converted with iconv() when
+ * HAVE_ICONV_H is defined.
+ *
+ * Returns the number of bytes stored in "dest" (not counting the nul), or
+ * -1 for bad arguments or when no converter is available.  The value
+ * returned by iconv() is not checked, so a failed or partial conversion
+ * still returns the number of bytes produced up to that point.
*/
-void * /* O - Charset map pointer */
-cupsCharmapGet(
+int /* O - Count or -1 on error */
+cupsCharsetToUTF8(
+ cups_utf8_t *dest, /* O - Target string */
+ const char *src, /* I - Source string */
+ const int maxout, /* I - Max output */
const cups_encoding_t encoding) /* I - Encoding */
{
- char mapname[80]; /* Name of charset map */
- char filename[1024]; /* Filename for charset map file */
- _cups_globals_t *cg = _cupsGlobals(); /* Global data */
+ cups_utf8_t *destptr; /* Pointer into UTF-8 buffer */
+#ifdef HAVE_ICONV_H
+ size_t srclen, /* Length of source string */
+ outBytesLeft; /* Bytes remaining in output buffer */
+#endif /* HAVE_ICONV_H */
/*
* Check for valid arguments...
*/
- if ((encoding < 0) || (encoding >= CUPS_ENCODING_VBCS_END))
- return (NULL);
+ DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
+ dest, src, maxout, encoding));
- /*
- * Get the data directory and charset map name...
- */
+ if (!dest || !src || maxout < 1)
+ {
+ if (dest)
+ *dest = '\0';
- snprintf(mapname, sizeof(mapname), "%s.txt", _cupsEncodingName(encoding));
- snprintf(filename, sizeof(filename), "%s/charmaps/%s",
- cg->cups_datadir, mapname);
+ DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
+ return (-1);
+ }
/*
- * Read charset map input file into cache...
+ * Handle identity conversions...
+ * (the source bytes are copied to "dest" unchanged, truncated to fit)
*/
- if (encoding < CUPS_ENCODING_SBCS_END)
- return (get_sbcs_charmap(encoding, filename));
- else if (encoding < CUPS_ENCODING_VBCS_END)
- return (get_vbcs_charmap(encoding, filename));
- else
- return (NULL);
-}
-
-/*
- * 'cupsCharmapFree()' - Free a character set map.
- *
- * This does not actually free; use 'cupsCharmapFlush()' for that.
- */
-void
-cupsCharmapFree(const cups_encoding_t encoding)
- /* I - Encoding */
-{
- _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
- _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
- _cups_globals_t *cg = _cupsGlobals();
- /* Pointer to library globals */
+ if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
+ encoding >= CUPS_ENCODING_VBCS_END)
+ {
+ strlcpy((char *)dest, src, maxout);
+ return ((int)strlen((char *)dest));
+ }
/*
- * See if we already have this SBCS charset map loaded...
+ * Handle ISO-8859-1 to UTF-8 directly...
*/
- for (cmap = cg->cmap_cache; cmap != NULL; cmap = cmap->next)
+
+ destptr = dest;
+
+ if (encoding == CUPS_ISO8859_1)
{
- if (cmap->encoding == encoding)
+ int ch; /* Character from string */
+ cups_utf8_t *destend; /* End of UTF-8 buffer */
+
+
+ destend = dest + maxout - 2; /* Stop early enough for a 2-byte sequence plus the nul */
+
+ while (*src && destptr < destend)
{
- if (cmap->used > 0)
- cmap->used --;
- return;
+ ch = *src++ & 255;
+
+ if (ch & 128)
+ {
+ *destptr++ = 0xc0 | (ch >> 6);
+ *destptr++ = 0x80 | (ch & 0x3f);
+ }
+ else
+ *destptr++ = ch;
}
+
+ *destptr = '\0';
+
+ return ((int)(destptr - dest));
}
/*
- * See if we already have this DBCS/VBCS charset map loaded...
+ * Convert input legacy charset to UTF-8...
+ * (the cached iconv descriptors are reopened when the encoding changes)
*/
- for (vmap = cg->vmap_cache; vmap != NULL; vmap = vmap->next)
+
+#ifdef HAVE_ICONV_H
+ _cupsMutexLock(&map_mutex);
+
+ if (map_encoding != encoding)
{
- if (vmap->encoding == encoding)
- {
- if (vmap->used > 0)
- vmap->used --;
- return;
- }
- }
- return;
-}
+ _cupsCharmapFlush();
-/*
- * 'cupsCharmapFlush()' - Flush all character set maps out of cache.
- */
-void
-cupsCharmapFlush(void)
-{
- int i; /* Looping variable */
- _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
- _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
- _cups_cmap_t *cnext; /* Next Legacy SBCS Charset Map */
- _cups_vmap_t *vnext; /* Next Legacy VBCS Charset Map */
- cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
- cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
- cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
- _cups_globals_t *cg = _cupsGlobals();
- /* Pointer to library globals */
+ map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
+ map_to_utf8 = iconv_open("UTF-8", _cupsEncodingName(encoding));
+ map_encoding = encoding;
+ }
- /*
- * Loop through SBCS charset map cache, free all memory...
- */
- for (cmap = cg->cmap_cache; cmap != NULL; cmap = cnext)
+ if (map_to_utf8 != (iconv_t)-1)
{
- for (i = 0; i < 256; i ++)
- {
- if ((srow = cmap->uni2char[i]) != NULL)
- free(srow);
- }
- cnext = cmap->next;
- free(cmap);
+ char *altdestptr = (char *)dest; /* char * alias of dest; silences bogus GCC type-punned pointer warning */
+
+ srclen = strlen(src);
+ outBytesLeft = maxout - 1; /* Keep one byte back for the nul */
+
+ iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
+ *altdestptr = '\0';
+
+ _cupsMutexUnlock(&map_mutex);
+
+ return ((int)(altdestptr - (char *)dest));
}
- cg->cmap_cache = NULL;
+
+ _cupsMutexUnlock(&map_mutex);
+#endif /* HAVE_ICONV_H */
/*
- * Loop through DBCS/VBCS charset map cache, free all memory...
+ * No iconv() support, so error out...
+ * (also reached when iconv_open() did not yield a usable descriptor)
*/
- for (vmap = cg->vmap_cache; vmap != NULL; vmap = vnext)
- {
- for (i = 0; i < 256; i ++)
- {
- if ((crow = vmap->char2uni[i]) != NULL)
- free(crow);
- }
- for (i = 0; i < 256; i ++)
- {
- if ((vrow = vmap->uni2char[i]) != NULL)
- free(vrow);
- }
- if (vmap->wide2uni)
- free(vmap->wide2uni);
- vnext = vmap->next;
- free(vmap);
- }
- cg->vmap_cache = NULL;
- return;
+
+ *destptr = '\0';
+
+ return (-1);
}
+
/*
* 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
- *
- * This code handles single-byte (SBCS), double-byte (DBCS), and
- * variable-byte (VBCS) character sets _without_ charset escapes...
- * This code does not handle multiple-byte character sets (MBCS)
- * (such as ISO-2022-JP) with charset switching via escapes...
+ *
+ * At most "maxout" bytes, including the terminating nul, are stored in
+ * "dest".  CUPS_UTF8 and encodings >= CUPS_ENCODING_VBCS_END are copied
+ * unchanged.  CUPS_ISO8859_1 and encodings <= CUPS_US_ASCII are converted
+ * inline: characters the charset cannot hold, three- and four-octet
+ * sequences, and malformed or truncated two-octet sequences become '?',
+ * and any other byte with the high bit set is skipped.  Every other
+ * encoding is converted with iconv() when HAVE_ICONV_H is defined.
+ *
+ * Returns the number of bytes stored in "dest" (not counting the nul), or
+ * -1 for bad arguments or when no converter is available.  The value
+ * returned by iconv() is not checked.
*/
-int /* O - Count or -1 on error */
-cupsUTF8ToCharset(char *dest, /* O - Target string */
- const cups_utf8_t *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
+
+int /* O - Count or -1 on error */
+cupsUTF8ToCharset(
+ char *dest, /* O - Target string */
+ const cups_utf8_t *src, /* I - Source string */
+ const int maxout, /* I - Max output */
+ const cups_encoding_t encoding) /* I - Encoding */
{
+ char *destptr; /* Pointer into destination */
+#ifdef HAVE_ICONV_H
+ size_t srclen, /* Length of source string */
+ outBytesLeft; /* Bytes remaining in output buffer */
+#endif /* HAVE_ICONV_H */
+
+
/*
* Check for valid arguments...
*/
- if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
+ if (!dest || !src || maxout < 1)
+ {
+ if (dest)
+ *dest = '\0';
+
return (-1);
+ }
/*
* Handle identity conversions...
*/
if (encoding == CUPS_UTF8 ||
- encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
+ encoding >= CUPS_ENCODING_VBCS_END)
{
strlcpy(dest, (char *)src, maxout);
- return (strlen(dest));
+ return ((int)strlen(dest));
}
/*
- * Convert input UTF-8 to legacy charset...
+ * Handle UTF-8 to ISO-8859-1 directly...
*/
- if (encoding < CUPS_ENCODING_SBCS_END)
- return (conv_utf8_to_sbcs(dest, src, maxout, encoding));
- else if (encoding < CUPS_ENCODING_VBCS_END)
- return (conv_utf8_to_vbcs(dest, src, maxout, encoding));
- else
- return (-1);
-}
-/*
- * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
- *
- * This code handles single-byte (SBCS), double-byte (DBCS), and
- * variable-byte (VBCS) character sets _without_ charset escapes...
- * This code does not handle multiple-byte character sets (MBCS)
- * (such as ISO-2022-JP) with charset switching via escapes...
- */
-int /* O - Count or -1 on error */
-cupsCharsetToUTF8(cups_utf8_t *dest, /* O - Target string */
- const char *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
-{
- /*
- * Check for valid arguments...
- */
+ destptr = dest;
- if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
- return (-1);
+ if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
+ {
+ int ch, /* Character from string */
+ maxch; /* Maximum character for charset */
+ char *destend; /* End of ISO-8859-1 buffer */
+
+ maxch = encoding == CUPS_ISO8859_1 ? 256 : 128;
+ destend = dest + maxout - 1; /* Keep one byte back for the nul */
+
+ while (*src && destptr < destend)
+ {
+ ch = *src++;
+
+ if ((ch & 0xe0) == 0xc0)
+ {
+ /*
+ * Two-octet sequence: only consume the second octet when it really
+ * is a continuation byte (10xxxxxx).  Consuming it unconditionally
+ * would step "src" past the terminating nul of a string that ends
+ * right after the lead byte, and the loop would then read beyond
+ * the end of the source string...
+ */
+
+ if ((*src & 0xc0) != 0x80)
+ {
+ *destptr++ = '?';
+ continue;
+ }
+
+ ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
+
+ if (ch < maxch)
+ *destptr++ = ch;
+ else
+ *destptr++ = '?';
+ }
+ else if ((ch & 0xf0) == 0xe0 ||
+ (ch & 0xf8) == 0xf0)
+ *destptr++ = '?'; /* Lead byte of a 3- or 4-octet sequence; its continuation bytes match no branch and are skipped */
+ else if (!(ch & 0x80))
+ *destptr++ = ch;
+ }
+ *destptr = '\0';
+
+ return ((int)(destptr - dest));
+ }
+
+#ifdef HAVE_ICONV_H
/*
- * Handle identity conversions...
+ * Convert input UTF-8 to legacy charset...
*/
- if (encoding == CUPS_UTF8 ||
- encoding < 0 || encoding >= CUPS_ENCODING_VBCS_END)
+ _cupsMutexLock(&map_mutex);
+
+ if (map_encoding != encoding)
{
- strlcpy((char *)dest, src, maxout);
- return (strlen((char *)dest));
+ _cupsCharmapFlush();
+
+ map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
+ map_to_utf8 = iconv_open("UTF-8", _cupsEncodingName(encoding));
+ map_encoding = encoding;
}
- /*
- * Convert input legacy charset to UTF-8...
- */
- if (encoding < CUPS_ENCODING_SBCS_END)
- return (conv_sbcs_to_utf8(dest, src, maxout, encoding));
- else if (encoding < CUPS_ENCODING_VBCS_END)
- return (conv_vbcs_to_utf8(dest, src, maxout, encoding));
- else
- return (-1);
-}
+ if (map_from_utf8 != (iconv_t)-1)
+ {
+ char *altsrc = (char *)src; /* char * alias of src; silences bogus GCC type-punned pointer warning */
-/*
- * 'cupsUTF8ToUTF16()' - Convert UTF-8 to UTF-16.
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
- */
-int /* O - Count or -1 on error */
-cupsUTF8ToUTF16(cups_utf16_t *dest, /* O - Target string */
- const cups_utf8_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
-{
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
+ srclen = strlen((char *)src);
+ outBytesLeft = maxout - 1; /* Keep one byte back for the nul */
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING))
- return (-1);
- *dest = 0;
+ iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
+ *destptr = '\0';
- /*
- * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
- */
- worklen = cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING);
- if (worklen < 0)
- return (-1);
+ _cupsMutexUnlock(&map_mutex);
- /*
- * Convert internal UCS-4 to output UTF-16...
- */
- worklen = cupsUTF32ToUTF16(dest, work, maxout);
- return (worklen);
-}
+ return ((int)(destptr - dest));
+ }
-/*
- * 'cupsUTF16ToUTF8()' - Convert UTF-16 to UTF-8.
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
- */
-int /* O - Count or -1 on error */
-cupsUTF16ToUTF8(cups_utf8_t *dest, /* O - Target string */
- const cups_utf16_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
-{
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
+ _cupsMutexUnlock(&map_mutex);
+#endif /* HAVE_ICONV_H */
/*
- * Check for valid arguments and clear output...
+ * No iconv() support, so error out...
*/
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING))
- return (-1);
- *dest = 0;
- /*
- * Convert input UTF-16 to internal UCS-4 (and byte-swap)...
- */
- worklen = cupsUTF16ToUTF32(work, src, CUPS_MAX_USTRING);
- if (worklen < 0)
- return (-1);
+ *destptr = '\0';
- /*
- * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
- */
- worklen = cupsUTF32ToUTF8(dest, work, maxout);
- return (worklen);
+ return (-1);
}
+
/*
* 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
*
*
* UTF-32 char UTF-8 char(s)
* --------------------------------------------------
- * 0 to 127 = 0xxxxxxx (US-ASCII)
+ * 0 to 127 = 0xxxxxxx (US-ASCII)
* 128 to 2047 = 110xxxxx 10yyyyyy
* 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
- * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
+ * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
*
* UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
* which would convert to five- or six-octet UTF-8 sequences...
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
+ *
+ * At most "maxout" - 1 characters are converted, followed by a 0
+ * terminator.  Returns the number of UTF-32 characters stored, or -1 for
+ * bad arguments ("maxout" < 1 or > CUPS_MAX_USTRING, NULL pointers) or
+ * invalid UTF-8: a missing continuation byte, a non-shortest form, a
+ * UTF-16 surrogate, a value beyond 0x10ffff, or a lead byte for more than
+ * four octets.  When -1 is returned for invalid UTF-8 the characters
+ * already stored in "dest" are left without a terminator.
*/
-int /* O - Count or -1 on error */
-cupsUTF8ToUTF32(cups_utf32_t *dest, /* O - Target string */
- const cups_utf8_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
+
+int /* O - Count or -1 on error */
+cupsUTF8ToUTF32(
+ cups_utf32_t *dest, /* O - Target string */
+ const cups_utf8_t *src, /* I - Source string */
+ const int maxout) /* I - Max output */
{
- cups_utf8_t *first = (cups_utf8_t *) src;
- size_t srclen; /* Source string length */
- int i; /* Looping variable */
- cups_utf32_t ch; /* Character value */
- cups_utf32_t next; /* Next character value */
- cups_utf32_t ch32; /* UTF-32 character value */
+ int i; /* Looping variable */
+ cups_utf8_t ch; /* Character value */
+ cups_utf8_t next; /* Next character value */
+ cups_utf32_t ch32; /* UTF-32 character value */
+
/*
* Check for valid arguments and clear output...
*/
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING))
+
+ DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
+ src, maxout));
+
+ if (dest)
+ *dest = 0;
+
+ if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
+
return (-1);
- *dest = 0;
+ }
/*
- * Convert input UTF-8 to output UTF-32 (and insert BOM)...
+ * Convert input UTF-8 to output UTF-32...
+ * ("i" counts down the output characters that still fit)
*/
- *dest = 0xfeff;
- dest ++;
- srclen = strlen((char *) src);
- for (i = 1; i < (maxout - 1); src ++, dest ++)
+
+ for (i = maxout - 1; *src && i > 0; i --)
{
- ch = (cups_utf32_t) *src;
- ch &= 0xff;
- if (ch == 0)
- break;
- i ++;
+ ch = *src++;
/*
* Convert UTF-8 character(s) to UTF-32 character...
*/
- if ((ch & 0x7f) == ch)
+
+ if (!(ch & 0x80))
{
/*
* One-octet UTF-8 <= 127 (US-ASCII)...
*/
- *dest = ch;
+
+ *dest++ = ch;
+
+ DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
+ continue;
}
else if ((ch & 0xe0) == 0xc0)
{
/*
* Two-octet UTF-8 <= 2047 (Latin-x)...
*/
- src ++;
- next = (cups_utf32_t) *src;
- next &= 0xff;
- if (next == 0)
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
+ }
+
ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
/*
* Check for non-shortest form (invalid UTF-8)...
*/
- if (ch32 <= 127)
+
+ if (ch32 < 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
- *dest = ch32;
+ }
+
+ *dest++ = ch32;
+
+ DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
+ src[-2], src[-1], (unsigned)ch32));
}
else if ((ch & 0xf0) == 0xe0)
{
/*
* Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
*/
- src ++;
- next = (cups_utf32_t) *src;
- next &= 0xff;
- if (next == 0)
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
- ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
- src ++;
- next = (cups_utf32_t) *src;
- next &= 0xff;
- if (next == 0)
+ }
+
+ ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
- ch32 = ((ch32 << 6) | (next & 0x3f));
+ }
+
+ ch32 = (ch32 << 6) | (next & 0x3f);
/*
* Check for non-shortest form (invalid UTF-8)...
*/
- if (ch32 <= 2047)
+
+ if (ch32 < 0x800)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
- *dest = ch32;
+ }
+
+ *dest++ = ch32;
+
+ DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
+ src[-3], src[-2], src[-1], (unsigned)ch32));
}
else if ((ch & 0xf8) == 0xf0)
{
/*
- * Four-octet UTF-8 to Replacement Character...
+ * Four-octet UTF-8...
*/
- if (((src - first) + 3) >= srclen)
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
- src += 3;
- *dest = 0xfffd;
- }
- else if ((ch & 0xfc) == 0xf8)
- {
- /*
- * Five-octet UTF-8 (invalid strict UTF-32)...
- */
- return (-1);
- }
- else if ((ch & 0xfe) == 0xfc)
- {
+ }
+
+ ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
+ return (-1);
+ }
+
+ ch32 = (ch32 << 6) | (next & 0x3f);
+
+ next = *src++;
+ if ((next & 0xc0) != 0x80)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
+ return (-1);
+ }
+
+ ch32 = (ch32 << 6) | (next & 0x3f);
+
/*
- * Six-octet UTF-8 (invalid strict UTF-32)...
+ * Check for non-shortest form (invalid UTF-8)...
*/
- return (-1);
+
+ if (ch32 < 0x10000)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
+ return (-1);
+ }
+
+ /*
+ * Check for beyond Plane 16 (invalid UTF-8)...  A four-octet
+ * sequence can encode values up to 0x1fffff, but nothing above
+ * 0x10ffff is a valid character and cupsUTF32ToUTF8() rejects it.
+ */
+
+ if (ch32 > 0x10ffff)
+ {
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (character out of range)");
+
+ return (-1);
+ }
+
+ *dest++ = ch32;
+
+ DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
+ src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
}
else
{
/*
- * More than six-octet (invalid UTF-8 sequence)...
+ * More than 4-octet (invalid UTF-8 sequence)...
*/
+
+ DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
+
return (-1);
}
/*
* Check for UTF-16 surrogate (illegal UTF-8)...
+ * (the one-octet case never reaches this point because it continues)
*/
- if ((*dest >= 0xd800) && (*dest <= 0xdfff))
- return (-1);
- /*
- * Check for beyond Plane 16 (invalid UTF-8)...
- */
- if (*dest > 0x10ffff)
+ if (ch32 >= 0xd800 && ch32 <= 0xdfff)
return (-1);
}
+
*dest = 0;
- return (i);
+
+ DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
+
+ return (maxout - 1 - i);
}
+
/*
* 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
*
*
* UTF-32 char UTF-8 char(s)
* --------------------------------------------------
- * 0 to 127 = 0xxxxxxx (US-ASCII)
+ * 0 to 127 = 0xxxxxxx (US-ASCII)
* 128 to 2047 = 110xxxxx 10yyyyyy
* 2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
- * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
+ * > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
*
* UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
* which would convert to five- or six-octet UTF-8 sequences...
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
*/
-int /* O - Count or -1 on error */
-cupsUTF32ToUTF8(cups_utf8_t *dest, /* O - Target string */
- const cups_utf32_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
+
+int /* O - Count or -1 on error */
+cupsUTF32ToUTF8(
+ cups_utf8_t *dest, /* O - Target string */
+ const cups_utf32_t *src, /* I - Source string */
+ const int maxout) /* I - Max output */
{
- cups_utf32_t *first = (cups_utf32_t *) src;
- /* First source char */
- cups_utf8_t *start = dest; /* Start of destination string */
- int i; /* Looping variable */
- int swap = 0; /* Byte-swap input to output */
- cups_utf32_t ch; /* Character value */
+ cups_utf8_t *start; /* Start of destination string */
+ int i; /* Looping variable */
+ int swap; /* Byte-swap input to output */
+ cups_utf32_t ch; /* Character value */
+
/*
* Check for valid arguments and clear output...
*/
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1))
+
+ DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
+ maxout));
+
+ if (dest)
+ *dest = '\0';
+
+ if (!dest || !src || maxout < 1)
+ {
+ DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
+
return (-1);
- *dest = '\0';
+ }
/*
* Check for leading BOM in UTF-32 and inverted BOM...
*/
- if (*src == 0xfffe0000)
- swap = 1;
+
+ start = dest;
+ swap = *src == 0xfffe0000;
+
+ DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
+
+ if (*src == 0xfffe0000 || *src == 0xfeff)
+ src ++;
/*
* Convert input UTF-32 to output UTF-8...
*/
- for (i = 0; i < (maxout - 1); src ++)
+
+ for (i = maxout - 1; *src && i > 0;)
{
- ch = *src;
- if (ch == 0)
- break;
+ ch = *src++;
/*
* Byte swap input UTF-32, if necessary...
+ * (only byte-swapping 24 of 32 bits)
*/
+
if (swap)
ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
/*
- * Check for leading BOM (and delete from output)...
+ * Check for beyond Plane 16 (invalid UTF-32)...
*/
- if ((src == first) && (ch == 0xfeff))
- continue;
- /*
- * Check for beyond Plane 16 (invalid UTF-32)...
- */
if (ch > 0x10ffff)
- return (-1);
+ {
+ DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
- /*
- * Convert beyond Plane 0 (BMP) to Replacement Character...
- */
- if (ch > 0xffff)
- ch = 0xfffd;
+ return (-1);
+ }
/*
* Convert UTF-32 character to UTF-8 character(s)...
*/
- if (ch <= 0x7f)
+
+ if (ch < 0x80)
{
/*
* One-octet UTF-8 <= 127 (US-ASCII)...
*/
- *dest = (cups_utf8_t) ch;
- dest ++;
- i ++;
+
+ *dest++ = (cups_utf8_t)ch;
+ i --;
+
+ DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
}
- else if (ch <= 0x7ff)
+ else if (ch < 0x800)
{
/*
* Two-octet UTF-8 <= 2047 (Latin-x)...
*/
- if (i > (maxout - 2))
- break;
- *dest = (cups_utf8_t) (0xc0 | ((ch >> 6) & 0x1f));
- dest ++;
- i ++;
- *dest = (cups_utf8_t) (0x80 | (ch & 0x3f));
- dest ++;
- i ++;
- }
- else
- {
- /*
- * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
- */
- if (i > (maxout - 3))
- break;
- *dest = (cups_utf8_t) (0xe0 | ((ch >> 12) & 0x0f));
- dest ++;
- i ++;
- *dest = (cups_utf8_t) (0x80 | ((ch >> 6) & 0x3f));
- dest ++;
- i ++;
- *dest = (cups_utf8_t) (0x80 | (ch & 0x3f));
- dest ++;
- i ++;
- }
- }
- *dest = '\0';
- i = (int) (dest - start);
- return (i);
-}
-
-/*
- * 'cupsUTF16ToUTF32()' - Convert UTF-16 to UTF-32.
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
- */
-int /* O - Count or -1 on error */
-cupsUTF16ToUTF32(cups_utf32_t *dest, /* O - Target string */
- const cups_utf16_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
-{
- int i; /* Looping variable */
- int swap = 0; /* Byte-swap input to output */
- int surrogate = 0; /* Expecting low-half surrogate */
- cups_utf32_t ch; /* Character value */
-
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING))
- return (-1);
- *dest = 0;
-
- /*
- * Check for leading BOM in UTF-16 and inverted BOM...
- */
- if (*src == 0xfffe)
- swap = 1;
-
- /*
- * Convert input UTF-16 to output UTF-32...
- */
- for (i = 0; i < (maxout - 1); src ++)
- {
- ch = (cups_utf32_t) (*src & 0xffff);
- if (ch == 0)
- break;
- i ++;
-
- /*
- * Byte swap input UTF-16, if necessary...
- */
- if (swap)
- ch = (cups_utf32_t) ((ch << 8) | (ch >> 8));
-
- /*
- * Discard expected UTF-16 low-half surrogate...
- */
- if ((ch >= 0xdc00) && (ch <= 0xdfff))
- {
- if (surrogate == 0)
- return (-1);
- surrogate = 0;
- continue;
- }
-
- /*
- * Convert UTF-16 high-half surrogate to Replacement Character...
- */
- if ((ch >= 0xd800) && (ch <= 0xdbff))
- {
- if (surrogate == 1)
- return (-1);
- surrogate = 1;
- ch = 0xfffd;
- }
- *dest = ch;
- dest ++;
- }
- *dest = 0;
- return (i);
-}
-
-/*
- * 'cupsUTF32ToUTF16()' - Convert UTF-32 to UTF-16.
- *
- * This code does not support Unicode beyond 16-bits (Plane 0)...
- */
-int /* O - Count or -1 on error */
-cupsUTF32ToUTF16(cups_utf16_t *dest, /* O - Target string */
- const cups_utf32_t *src, /* I - Source string */
- const int maxout) /* I - Max output */
-{
- int i; /* Looping variable */
- int swap = 0; /* Byte-swap input to output */
- cups_utf32_t ch; /* Character value */
-
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING))
- return (-1);
- *dest = 0;
-
- /*
- * Check for leading BOM in UTF-32 and inverted BOM...
- */
- if (*src == 0xfffe0000)
- swap = 1;
-
- /*
- * Convert input UTF-32 to output UTF-16 (w/out surrogate pairs)...
- */
- for (i = 0; i < (maxout - 1); src ++, dest ++)
- {
- ch = *src;
- if (ch == 0)
- break;
- i ++;
-
- /*
- * Byte swap input UTF-32, if necessary...
- */
- if (swap)
- ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
-
- /*
- * Check for UTF-16 surrogate (illegal UTF-32)...
- */
- if ((ch >= 0xd800) && (ch <= 0xdfff))
- return (-1);
-
- /*
- * Check for beyond Plane 16 (invalid UTF-32)...
- */
- if (ch > 0x10ffff)
- return (-1);
-
- /*
- * Convert beyond Plane 0 (BMP) to Replacement Character...
- */
- if (ch > 0xffff)
- ch = 0xfffd;
- *dest = (cups_utf16_t) ch;
- }
- *dest = 0;
- return (i);
-}
-
-/*
- * 'get_charmap_count()' - Count lines in a charmap file.
- */
-static int /* O - Count or -1 on error */
-get_charmap_count(const char *filename) /* I - Charmap Filename */
-{
- int i; /* Looping variable */
- cups_file_t *fp; /* Map input file pointer */
- char *s; /* Line parsing pointer */
- char line[256]; /* Line from input map file */
- cups_utf32_t unichar; /* Unicode character value */
- /*
- * Open map input file...
- */
- if ((filename == NULL) || (*filename == '\0'))
- return (-1);
- fp = cupsFileOpen(filename, "r");
- if (fp == NULL)
- return (-1);
-
- /*
- * Count lines in map input file...
- */
- for (i = 0; i < CUPS_MAX_CHARMAP_LINES;)
- {
- s = cupsFileGets(fp, line, sizeof(line));
- if (s == NULL)
- break;
- if ((*s == '#') || (*s == '\n') || (*s == '\0'))
- continue;
- while ((*s != 0) && (*s != ' ') && (*s != '\t'))
- s ++;
- while ((*s == ' ') || (*s == '\t'))
- s ++;
- if (strncmp (s, "0x", 2) == 0)
- s += 2;
- if ((sscanf(s, "%lx", &unichar) != 1)
- || (unichar > 0xffff))
- {
- cupsFileClose(fp);
- return (-1);
- }
- i ++;
- }
- if (i == 0)
- i = -1;
-
- /*
- * Close file and return charmap count (non-comment line count)...
- */
- cupsFileClose(fp);
- return (i);
-}
-
-/*
- * 'get_sbcs_charmap()' - Get SBCS Charmap.
- */
-static _cups_cmap_t * /* O - Charmap or 0 on error */
-get_sbcs_charmap(const cups_encoding_t encoding,
- /* I - Charmap Encoding */
- const char *filename) /* I - Charmap Filename */
-{
- int i; /* Loop variable */
- unsigned long legchar; /* Legacy character value */
- cups_utf32_t unichar; /* Unicode character value */
- _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
- cups_file_t *fp; /* Charset map file pointer */
- char *s; /* Line parsing pointer */
- cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
- cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
- char line[256]; /* Line from charset map file */
- _cups_globals_t *cg = _cupsGlobals();
- /* Pointer to library globals */
-
- /*
- * Check for valid arguments...
- */
- if ((encoding < 0) || (filename == NULL))
- return (NULL);
-
- /*
- * See if we already have this SBCS charset map loaded...
- */
- for (cmap = cg->cmap_cache; cmap != NULL; cmap = cmap->next)
- {
- if (cmap->encoding == encoding)
- {
- cmap->used ++;
- return ((void *) cmap);
- }
- }
-
- /*
- * Open SBCS charset map input file...
- */
- fp = cupsFileOpen(filename, "r");
- if (fp == NULL)
- return (NULL);
-
- /*
- * Allocate memory for SBCS charset map and add to cache...
- */
- cmap = (_cups_cmap_t *) calloc(1, sizeof(_cups_cmap_t));
- if (cmap == NULL)
- {
- cupsFileClose(fp);
- return (NULL);
- }
- cmap->next = cg->cmap_cache;
- cg->cmap_cache = cmap;
- cmap->used ++;
- cmap->encoding = encoding;
-
- /*
- * Save SBCS charset map into memory for transcoding...
- */
- for (i = 0; i < CUPS_MAX_CHARMAP_LINES;)
- {
- s = cupsFileGets(fp, line, sizeof(line));
- if (s == NULL)
- break;
- if ((*s == '#') || (*s == '\n') || (*s == '\0'))
- continue;
- if (strncmp (s, "0x", 2) == 0)
- s += 2;
- if ((sscanf(s, "%lx", &legchar) != 1)
- || (legchar > 0xff))
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- while ((*s != 0) && (*s != ' ') && (*s != '\t'))
- s ++;
- while ((*s == ' ') || (*s == '\t'))
- s ++;
- if (strncmp (s, "0x", 2) == 0)
- s += 2;
- if (sscanf(s, "%lx", &unichar) != 1)
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- i ++;
-
- /*
- * Convert beyond Plane 0 (BMP) to Replacement Character...
- */
- if (unichar > 0xffff)
- unichar = 0xfffd;
-
- /*
- * Save legacy to Unicode mapping in direct lookup table...
- */
- crow = &cmap->char2uni[(int) legchar];
- *crow = (cups_ucs2_t) (unichar & 0xffff);
-
- /*
- * Save Unicode to legacy mapping in indirect lookup table...
- */
- srow = cmap->uni2char[(int) ((unichar >> 8) & 0xff)];
- if (srow == NULL)
- {
- srow = (cups_sbcs_t *) calloc(256, sizeof(cups_sbcs_t));
- if (srow == NULL)
+ if (i < 2)
{
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- cmap->uni2char[(int) ((unichar >> 8) & 0xff)] = srow;
- }
- srow += (int) (unichar & 0xff);
-
- /*
- * Convert Replacement Character to visible replacement...
- */
- if (unichar == 0xfffd)
- legchar = (unsigned long) '?';
-
- /*
- * First (oldest) legacy character uses Unicode mapping cell...
- */
- if (*srow == 0)
- *srow = (cups_sbcs_t) legchar;
- }
- cupsFileClose(fp);
- return (cmap);
-}
-
-/*
- * 'get_vbcs_charmap()' - Get DBCS/VBCS Charmap.
- */
-static _cups_vmap_t * /* O - Charmap or 0 on error */
-get_vbcs_charmap(const cups_encoding_t encoding,
- /* I - Charmap Encoding */
- const char *filename) /* I - Charmap Filename */
-{
- _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
- cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
- cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
- _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
- cups_sbcs_t leadchar; /* Lead char of 2-byte legacy char */
- unsigned long legchar; /* Legacy character value */
- cups_utf32_t unichar; /* Unicode character value */
- int mapcount; /* Count of lines in charmap file */
- cups_file_t *fp; /* Charset map file pointer */
- char *s; /* Line parsing pointer */
- char line[256]; /* Line from charset map file */
- int i; /* Loop variable */
- int wide; /* 32-bit legacy char */
- _cups_globals_t *cg = _cupsGlobals();
- /* Pointer to library globals */
-
- /*
- * Check for valid arguments...
- */
- if ((encoding < 0) || (filename == NULL))
- return (NULL);
-
- /*
- * See if we already have this DBCS/VBCS charset map loaded...
- */
- for (vmap = cg->vmap_cache; vmap != NULL; vmap = vmap->next)
- {
- if (vmap->encoding == encoding)
- {
- vmap->used ++;
- return ((void *) vmap);
- }
- }
-
- /*
- * Count lines in charmap file...
- */
- mapcount = get_charmap_count(filename);
- if (mapcount <= 0)
- return (NULL);
-
- /*
- * Open VBCS charset map input file...
- */
- fp = cupsFileOpen(filename, "r");
- if (fp == NULL)
- return (NULL);
-
- /*
- * Allocate memory for DBCS/VBCS charset map and add to cache...
- */
- vmap = (_cups_vmap_t *) calloc(1, sizeof(_cups_vmap_t));
- if (vmap == NULL)
- {
- cupsFileClose(fp);
- return (NULL);
- }
- vmap->next = cg->vmap_cache;
- cg->vmap_cache = vmap;
- vmap->used ++;
- vmap->encoding = encoding;
-
- /*
- * Save DBCS/VBCS charset map into memory for transcoding...
- */
- leadchar = 0;
- wide2uni = NULL;
-
- for (i = 0, wide = 0; i < mapcount; )
- {
- s = cupsFileGets(fp, line, sizeof(line));
- if (s == NULL)
- break;
- if ((*s == '#') || (*s == '\n') || (*s == '\0'))
- continue;
- if (strncmp (s, "0x", 2) == 0)
- s += 2;
- if ((sscanf(s, "%lx", &legchar) != 1)
- || ((legchar > 0xffff) && (encoding < CUPS_ENCODING_DBCS_END)))
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- while ((*s != 0) && (*s != ' ') && (*s != '\t'))
- s ++;
- while ((*s == ' ') || (*s == '\t'))
- s ++;
- if (strncmp (s, "0x", 2) == 0)
- s += 2;
- if (sscanf(s, "%lx", &unichar) != 1)
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- i ++;
+ DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
- /*
- * Convert beyond Plane 0 (BMP) to Replacement Character...
- */
- if (unichar > 0xffff)
- unichar = 0xfffd;
+ return (-1);
+ }
- /*
- * Save lead char of 2/3/4-byte legacy char...
- */
- if ((legchar > 0xff) && (legchar <= 0xffff))
- {
- leadchar = (cups_sbcs_t) (legchar >> 8);
- vmap->lead2char[leadchar] = leadchar;
- }
- if ((legchar > 0xffff) && (legchar <= 0xffffff))
- {
- leadchar = (cups_sbcs_t) (legchar >> 16);
- vmap->lead3char[leadchar] = leadchar;
- }
- if (legchar > 0xffffff)
- {
- leadchar = (cups_sbcs_t) (legchar >> 24);
- vmap->lead4char[leadchar] = leadchar;
- }
+ *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
+ *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
+ i -= 2;
- /*
- * Save Legacy to Unicode mapping...
- */
- if (legchar <= 0xffff)
- {
- /*
- * Save DBCS 16-bit to Unicode mapping in indirect lookup table...
- */
- crow = vmap->char2uni[(int) leadchar];
- if (crow == NULL)
- {
- crow = (cups_ucs2_t *) calloc(256, sizeof(cups_ucs2_t));
- if (crow == NULL)
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- vmap->char2uni[(int) leadchar] = crow;
- }
- crow += (int) (legchar & 0xff);
- *crow = (cups_ucs2_t) unichar;
+ DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
+ dest[-2], dest[-1]));
}
- else
+ else if (ch < 0x10000)
{
/*
- * Save VBCS 32-bit to Unicode mapping in sorted list table...
+ * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
*/
- if (wide == 0)
- {
- wide = 1;
- vmap->widecount = (mapcount - i + 1);
- wide2uni = (_cups_wide2uni_t *)
- calloc(vmap->widecount, sizeof(_cups_wide2uni_t));
- if (wide2uni == NULL)
- {
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- vmap->wide2uni = wide2uni;
- }
- wide2uni->widechar = (cups_vbcs_t) legchar;
- wide2uni->unichar = (cups_ucs2_t)unichar;
- wide2uni ++;
- }
- /*
- * Save Unicode to legacy mapping in indirect lookup table...
- */
- vrow = vmap->uni2char[(int) ((unichar >> 8) & 0xff)];
- if (vrow == NULL)
- {
- vrow = (cups_vbcs_t *) calloc(256, sizeof(cups_vbcs_t));
- if (vrow == NULL)
+ if (i < 3)
{
- cupsFileClose(fp);
- cupsCharmapFlush();
- return (NULL);
- }
- vmap->uni2char[(int) ((unichar >> 8) & 0xff)] = vrow;
- }
- vrow += (int) (unichar & 0xff);
-
- /*
- * Convert Replacement Character to visible replacement...
- */
- if (unichar == 0xfffd)
- legchar = (unsigned long) '?';
-
- /*
- * First (oldest) legacy character uses Unicode mapping cell...
- */
- if (*vrow == 0)
- *vrow = (cups_vbcs_t) legchar;
- }
- vmap->charcount = (i - vmap->widecount);
- cupsFileClose(fp);
- return (vmap);
-}
-
-/*
- * 'conv_utf8_to_sbcs()' - Convert UTF-8 to legacy SBCS.
- */
-static int /* O - Count or -1 on error */
-conv_utf8_to_sbcs(char *dest, /* O - Target string */
- const cups_utf8_t *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
-{
- char *start = dest; /* Start of destination string */
- _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
- cups_sbcs_t *srow; /* Pointer to SBCS row in 'uni2char' */
- cups_utf32_t unichar; /* Character value */
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
- int i; /* Looping variable */
+ DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING)
- || (encoding == CUPS_UTF8))
- return (-1);
- *dest = '\0';
-
- /*
- * Find legacy charset map in cache...
- */
- cmap = (_cups_cmap_t *) cupsCharmapGet(encoding);
- if (cmap == NULL)
- return (-1);
-
- /*
- * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
- */
- worklen = cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING);
- if (worklen < 0)
- return (-1);
-
- /*
- * Convert internal UCS-4 to SBCS legacy charset (and delete BOM)...
- */
- for (i = 0; i < worklen;)
- {
- unichar = work[i];
- if (unichar == 0)
- break;
- i ++;
-
- /*
- * Check for leading BOM (and delete from output)...
- */
- if ((i == 1) && (unichar == 0xfeff))
- continue;
-
- /*
- * Convert ASCII verbatim (optimization)...
- */
- if (unichar <= 0x7f)
- {
- *dest = (char) unichar;
- dest ++;
- continue;
- }
-
- /*
- * Convert unknown character to visible replacement...
- */
- srow = cmap->uni2char[(int) ((unichar >> 8) & 0xff)];
- if (srow)
- srow += (int) (unichar & 0xff);
- if ((srow == NULL) || (*srow == 0))
- *dest = '?';
- else
- *dest = (char) (*srow);
- dest ++;
- }
- *dest = '\0';
- worklen = (int) (dest - start);
- cupsCharmapFree(encoding);
- return (worklen);
-}
-
-/*
- * 'conv_utf8_to_vbcs()' - Convert UTF-8 to legacy DBCS/VBCS.
- */
-static int /* O - Count or -1 on error */
-conv_utf8_to_vbcs(char *dest, /* O - Target string */
- const cups_utf8_t *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
-{
- char *start = dest; /* Start of destination string */
- _cups_vmap_t *vmap; /* Legacy DBCS / Unicode Charset Map */
- cups_vbcs_t *vrow; /* Pointer to VBCS row in 'uni2char' */
- cups_utf32_t unichar; /* Character value */
- cups_vbcs_t legchar; /* Legacy character value */
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
- int i; /* Looping variable */
-
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING)
- || (encoding == CUPS_UTF8))
- return (-1);
- *dest = '\0';
-
- /*
- * Find legacy charset map in cache...
- */
- vmap = (_cups_vmap_t *) cupsCharmapGet(encoding);
- if (vmap == NULL)
- return (-1);
-
- /*
- * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
- */
- worklen = cupsUTF8ToUTF32(work, src, CUPS_MAX_USTRING);
- if (worklen < 0)
- return (-1);
+ return (-1);
+ }
- /*
- * Convert internal UCS-4 to VBCS legacy charset (and delete BOM)...
- */
- for (i = 0; i < worklen;)
- {
- unichar = work[i];
- if (unichar == 0)
- break;
- i ++;
+ *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
+ *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
+ *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
+ i -= 3;
- /*
- * Check for leading BOM (and delete from output)...
- */
- if ((i == 1) && (unichar == 0xfeff))
- continue;
-
- /*
- * Convert ASCII verbatim (optimization)...
- */
- if (unichar <= 0x7f)
- {
- *dest = (char) unichar;
- dest ++;
- continue;
+ DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
+ dest[-3], dest[-2], dest[-1]));
}
-
- /*
- * Convert unknown character to visible replacement...
- */
- vrow = vmap->uni2char[(int) ((unichar >> 8) & 0xff)];
- if (vrow)
- vrow += (int) (unichar & 0xff);
- if ((vrow == NULL) || (*vrow == 0))
- legchar = (cups_vbcs_t) '?';
else
- legchar = (cups_vbcs_t) *vrow;
-
- /*
- * Save n-byte legacy character...
- */
- if (legchar > 0xffffff)
- {
- *dest = (char) ((legchar >> 24) & 0xff);
- dest++;
- }
- if (legchar > 0xffff)
{
- *dest = (char) ((legchar >> 16) & 0xff);
- dest++;
- }
- if (legchar > 0xff)
- {
- *dest = (char) ((legchar >> 8) & 0xff);
- dest++;
- }
- *dest = (char) (legchar & 0xff);
- dest ++;
- }
- *dest = '\0';
- worklen = (int) (dest - start);
- cupsCharmapFree(encoding);
- return (worklen);
-}
-
-/*
- * 'conv_sbcs_to_utf8()' - Convert legacy SBCS to UTF-8.
- */
-static int /* O - Count or -1 on error */
-conv_sbcs_to_utf8(cups_utf8_t *dest, /* O - Target string */
- const char *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
-{
- _cups_cmap_t *cmap; /* Legacy SBCS / Unicode Charset Map */
- cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
- unsigned long legchar; /* Legacy character value */
- cups_utf32_t unichar; /* Unicode character value */
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
- int i; /* Looping variable */
+ /*
+ * Four-octet UTF-8...
+ */
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING)
- || (encoding == CUPS_UTF8))
- return (-1);
- *dest = '\0';
+ if (i < 4)
+ {
+ DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
- /*
- * Find legacy charset map in cache...
- */
- cmap = (_cups_cmap_t *) cupsCharmapGet(encoding);
- if (cmap == NULL)
- return (-1);
+ return (-1);
+ }
- /*
- * Convert input legacy charset to internal UCS-4 (and insert BOM)...
- */
- work[0] = 0xfeff;
- for (i = 1; i < (CUPS_MAX_USTRING - 1); src ++)
- {
- if (*src == '\0')
- break;
- legchar = (unsigned long) *src;
+ *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
+ *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
+ *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
+ *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
+ i -= 4;
- /*
- * Convert ASCII verbatim (optimization)...
- */
- if (legchar <= 0x7f)
- {
- work[i] = (cups_utf32_t) legchar;
- i ++;
- continue;
+ DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
+ (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
}
-
- /*
- * Convert unknown character to Replacement Character...
- */
- crow = &cmap->char2uni[0];
- crow += (int) legchar;
- if (*crow == 0)
- unichar = 0xfffd;
- else
- unichar = (cups_utf32_t) *crow;
- work[i] = unichar;
- i ++;
}
- work[i] = 0;
- /*
- * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
- */
- worklen = cupsUTF32ToUTF8(dest, work, maxout);
- cupsCharmapFree(encoding);
- return (worklen);
-}
-
-
-/*
- * 'conv_vbcs_to_utf8()' - Convert legacy DBCS/VBCS to UTF-8.
- */
-static int /* O - Count or -1 on error */
-conv_vbcs_to_utf8(cups_utf8_t *dest, /* O - Target string */
- const char *src, /* I - Source string */
- const int maxout, /* I - Max output */
- const cups_encoding_t encoding) /* I - Encoding */
-{
- _cups_vmap_t *vmap; /* Legacy VBCS / Unicode Charset Map */
- cups_ucs2_t *crow; /* Pointer to UCS-2 row in 'char2uni' */
- _cups_wide2uni_t *wide2uni; /* Pointer to row in 'wide2uni' */
- cups_sbcs_t leadchar; /* Lead char of n-byte legacy char */
- cups_vbcs_t legchar; /* Legacy character value */
- cups_utf32_t unichar; /* Unicode character value */
- int i; /* Looping variable */
- int worklen; /* Internal UCS-4 string length */
- cups_utf32_t work[CUPS_MAX_USTRING];
- /* Internal UCS-4 string */
-
- /*
- * Check for valid arguments and clear output...
- */
- if ((dest == NULL)
- || (src == NULL)
- || (maxout < 1)
- || (maxout > CUPS_MAX_USTRING)
- || (encoding == CUPS_UTF8))
- return (-1);
*dest = '\0';
- /*
- * Find legacy charset map in cache...
- */
- vmap = (_cups_vmap_t *) cupsCharmapGet(encoding);
- if (vmap == NULL)
- return (-1);
-
- /*
- * Convert input legacy charset to internal UCS-4 (and insert BOM)...
- */
- work[0] = 0xfeff;
- for (i = 1; i < (CUPS_MAX_USTRING - 1); src ++)
- {
- if (*src == '\0')
- break;
- legchar = (cups_vbcs_t) *src;
- leadchar = (cups_sbcs_t) *src;
-
- /*
- * Convert ASCII verbatim (optimization)...
- */
- if (legchar <= 0x7f)
- {
- work[i] = (cups_utf32_t) legchar;
- i ++;
- continue;
- }
-
- /*
- * Convert 2-byte legacy character...
- */
- if (vmap->lead2char[(int) leadchar] == leadchar)
- {
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
-
- /*
- * Convert unknown character to Replacement Character...
- */
- crow = vmap->char2uni[(int) ((legchar >> 8) & 0xff)];
- if (crow)
- crow += (int) (legchar & 0xff);
- if ((crow == NULL) || (*crow == 0))
- unichar = 0xfffd;
- else
- unichar = (cups_utf32_t) *crow;
- work[i] = unichar;
- i ++;
- continue;
- }
-
- /*
- * Fetch 3-byte or 4-byte legacy character...
- */
- if (vmap->lead3char[(int) leadchar] == leadchar)
- {
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
- }
- else if (vmap->lead4char[(int) leadchar] == leadchar)
- {
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
- src ++;
- if (*src == '\0')
- return (-1);
- legchar = (legchar << 8) | (cups_vbcs_t) *src;
- }
- else
- return (-1);
-
- /*
- * Find 3-byte or 4-byte legacy character...
- */
- wide2uni = vmap->wide2uni;
- wide2uni = (_cups_wide2uni_t *) bsearch(&legchar,
- vmap->wide2uni,
- vmap->widecount,
- sizeof(_cups_wide2uni_t),
- compare_wide);
+ DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
- /*
- * Convert unknown character to Replacement Character...
- */
- if ((wide2uni == NULL) || (wide2uni->unichar == 0))
- unichar = 0xfffd;
- else
- unichar = wide2uni->unichar;
- work[i] = unichar;
- i ++;
- }
- work[i] = 0;
-
- /*
- * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
- */
- worklen = cupsUTF32ToUTF8(dest, work, maxout);
- cupsCharmapFree(encoding);
- return (worklen);
-}
-
-/*
- * 'compare_wide()' - Compare key for wide (VBCS) match.
- */
-static int
-compare_wide(const void *k1, /* I - Key char */
- const void *k2) /* I - Map char */
-{
- cups_vbcs_t *kp = (cups_vbcs_t *) k1;
- /* Key char pointer */
- _cups_wide2uni_t *mp = (_cups_wide2uni_t *) k2;
- /* Map char pointer */
- cups_vbcs_t key; /* Legacy key character */
- cups_vbcs_t map; /* Legacy map character */
- int result; /* Result Value */
-
- key = *kp;
- map = mp->widechar;
- if (key >= map)
- result = (int) (key - map);
- else
- result = -1 * ((int) (map - key));
- return (result);
+ return ((int)(dest - start));
}
/*
- * End of "$Id: transcode.c 4903 2006-01-10 20:02:46Z mike $"
+ * End of "$Id$"
*/