From: VMware, Inc <> Date: Mon, 20 Dec 2010 22:02:20 +0000 (-0800) Subject: open-vm-tools: fix compilation without ICU. X-Git-Tag: 2010.12.19-339835~37 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7d6011d83d2cbb8ad4a97ace007adbdb5f38e4e6;p=thirdparty%2Fopen-vm-tools.git open-vm-tools: fix compilation without ICU. Move functions that don't need ICU from codeset.c into a new file. Signed-off-by: Marcelo Vanzin --- diff --git a/open-vm-tools/lib/misc/Makefile.am b/open-vm-tools/lib/misc/Makefile.am index c8daa5db6..3946d4a5f 100644 --- a/open-vm-tools/lib/misc/Makefile.am +++ b/open-vm-tools/lib/misc/Makefile.am @@ -20,6 +20,7 @@ noinst_LTLIBRARIES = libMisc.la libMisc_la_SOURCES = libMisc_la_SOURCES += atomic.c libMisc_la_SOURCES += base64.c +libMisc_la_SOURCES += codesetBase.c libMisc_la_SOURCES += codesetOld.c libMisc_la_SOURCES += dynarray.c libMisc_la_SOURCES += dynbuf.c diff --git a/open-vm-tools/lib/misc/codeset.c b/open-vm-tools/lib/misc/codeset.c index 48ef1915d..70eca89e6 100644 --- a/open-vm-tools/lib/misc/codeset.c +++ b/open-vm-tools/lib/misc/codeset.c @@ -1653,316 +1653,3 @@ CodeSet_Validate(const char *buf, // IN: the string return uerr == U_BUFFER_OVERFLOW_ERROR; } - -/* - *----------------------------------------------------------------------------- - * - * CodeSet_GetUtf8 -- - * - * Parse the next UTF-8 sequence. - * - * Results: - * 0 on failure. - * Length of sequence and Unicode character in *uchar on success. - * - * Side effects: - * None. - * - *----------------------------------------------------------------------------- - */ - -int -CodeSet_GetUtf8(const char *string, // IN: string - const char *end, // IN: end of string - uint32 *uchar) // OUT: the Unicode character -{ - uint8 *p = (uint8 *) string; - uint8 *e; - uint32 c; - int len; - ASSERT(string < end); - - c = *p; - - if (c < 0x80) { - // ASCII: U+0000 - U+007F: 1 byte of UTF-8. 
- len = 1; - goto out; - } - - if ((c < 0xc2) || (c > 0xf4)) { - // 0x81 to 0xbf are not valid first bytes - // 0xc0 and 0xc1 cannot appear in UTF-8, see below - // leading char can not be > 0xf4, illegal as well - return 0; - } - - if (c < 0xe0) { - // U+0080 - U+07FF: 2 bytes of UTF-8. - c -= 0xc0; - len = 2; - } else if (c < 0xf0) { - // U+0800 - U+FFFF: 3 bytes of UTF-8. - c -= 0xe0; - len = 3; - } else { - // U+10000 - U+10FFFF: 4 bytes of UTF-8. - c -= 0xf0; - len = 4; - } - - if ((e = p + len) > (uint8 *) end) { - // input too short - return 0; - } - - while (++p < e) { - if ((*p & 0xc0) != 0x80) { - // bad trailing byte - return 0; - } - c <<= 6; - c += *p - 0x80; - } - - /* - * Enforce shortest encoding. - * UTF-8 mandates that shortest possible encoding is used, - * as otherwise doing UTF-8 => anything => UTF-8 could bypass some - * important tests, like '/' for path separator or \0 for string - * termination. - * - * This test does not work for len == 2, but that case is handled - * by requiring the first byte to be 0xc2 or greater (see above). - */ - - if (c < 1U << (len * 5 - 4)) { - return 0; - } - -out: - if (uchar != NULL) { - *uchar = c; - } - - return len; -} - - -/* - *----------------------------------------------------------------------------- - * - * CodeSet_LengthInCodePoints -- - * - * Return the length of a UTF8 string in code points (the number of - * unicode characters present in the string, not the length of the - * string in bytes). - * - * Like strlen, the length returned does not include the terminating NUL. 
- * - * Results: - * -1 on error - * - * Side effects: - * None - * - *----------------------------------------------------------------------------- - */ - -int -CodeSet_LengthInCodePoints(const char *utf8) // IN: -{ - char *p; - char *end; - uint32 codePoints = 0; - - ASSERT(utf8); - - p = (char *) utf8; - end = p + strlen(utf8); - - while (p < end) { - uint32 utf32; - uint32 len = CodeSet_GetUtf8(p, end, &utf32); - - if (len == 0) { - return -1; - } - - p += len; - codePoints++; - } - - return codePoints; -} - - -/* - *----------------------------------------------------------------------------- - * - * CodeSet_UTF8ToUTF32 -- - * - * Convert a UTF8 string into a UTF32 string. The result is returned as a - * dynamically allocated string that the caller is responsible for. - * - * Results: - * TRUE Input string was valid, converted string in *utf32 - * FALSE Input string was invalid or internal error - * - * Side effects: - * Allocates memory - * - *----------------------------------------------------------------------------- - */ - -Bool -CodeSet_UTF8ToUTF32(const char *utf8, // IN: - char **utf32) // OUT: -{ - char *p; - char *end; - uint32 *ptr; - int codePoints; - - ASSERT(utf32); - - if (utf8 == NULL) { // NULL is not an error - *utf32 = NULL; - - return TRUE; - } - - codePoints = CodeSet_LengthInCodePoints(utf8); - if (codePoints == -1) { - *utf32 = NULL; - - return FALSE; - } - - p = (char *) utf8; - end = p + strlen(utf8); - - ptr = Util_SafeMalloc(sizeof(*ptr) * (codePoints + 1)); - *utf32 = (char *) ptr; - - while (p < end) { - p += CodeSet_GetUtf8(p, end, ptr++); - } - - *ptr = 0; - - return TRUE; -} - - -/* - *----------------------------------------------------------------------------- - * - * CodeSet_UTF32ToUTF8 -- - * - * Convert a UTF32 string into a UTF8 string. The result is returned as a - * dynamically allocated string that the caller is responsible for. 
- * - * Results: - * TRUE Input string was valid, converted string in *utf8 - * FALSE Input string was invalid or internal error - * - * Side effects: - * Allocates memory - * - *----------------------------------------------------------------------------- - */ - - -Bool -CodeSet_UTF32ToUTF8(const char *utf32, // IN: - char **utf8) // OUT: -{ - uint32 i; - uint8 *p; - uint8 *q; - uint32 len; - union { - uint32 word; - uint8 bytes[4]; - } value; - - ASSERT(utf8); - - if (utf32 == NULL) { // NULL is not an error - *utf8 = NULL; - - return TRUE; - } - - /* - * Determine the length of the UTF32 string. A UTF32 string terminates - * with four (4) bytes of zero (0). - */ - - len = 0; - p = (uint8 *) utf32; - - while (TRUE) { - value.bytes[0] = *p++; - value.bytes[1] = *p++; - value.bytes[2] = *p++; - value.bytes[3] = *p++; - - if (value.word == 0) { - break; - } - - len++; - } - - /* - * Now that we know the length, allocate the memory for the UTF8 string. - * The UTF8 string length calculation ensures that there will always be - * sufficient space to represent the UTF32 string. Most of the time this - * will involved allocating too much memory however the memory wastage - * will be very short lived and very small. - */ - - *utf8 = Util_SafeMalloc((4 * len) + 1); // cover the NUL byte - - /* - * Process the UTF32 string, converting each code point into its - * UTF8 equivalent. 
- */ - - p = (uint8 *) utf32; - q = (uint8 *) *utf8; - - for (i = 0; i < len; i++) { - value.bytes[0] = *p++; - value.bytes[1] = *p++; - value.bytes[2] = *p++; - value.bytes[3] = *p++; - - if (value.word < 0x80) { // One byte case (ASCII) - *q++ = value.word; - } else if (value.word < 0x800) { // Two byte case - *q++ = 0xC0 | (value.word >> 6); - *q++ = 0x80 | (value.word & 0x3F); - } else if (value.word < 0x10000) { // Three byte case - *q++ = 0xE0 | (value.word >> 12); - *q++ = 0x80 | ((value.word >> 6) & 0x3F); - *q++ = 0x80 | (value.word & 0x3F); - } else if (value.word < 0x110000) { // Four byte case - *q++ = 0xF0 | (value.word >> 18); - *q++ = 0x80 | ((value.word >> 12) & 0x3F); - *q++ = 0x80 | ((value.word >> 6) & 0x3F); - *q++ = 0x80 | (value.word & 0x3F); - } else { // INVALID VALUE! - free(*utf8); - *utf8 = NULL; - - return FALSE; - } - } - - *q = '\0'; - - return TRUE; -} diff --git a/open-vm-tools/lib/misc/codesetBase.c b/open-vm-tools/lib/misc/codesetBase.c new file mode 100644 index 000000000..dfdecdd75 --- /dev/null +++ b/open-vm-tools/lib/misc/codesetBase.c @@ -0,0 +1,342 @@ +/********************************************************* + * Copyright (C) 2010 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + *********************************************************/ + +/* + * codesetBase.c -- + * + * Character set and encoding conversion functions, without ICU. + */ + +#include <stdlib.h> +#include "vmware.h" +#include "codeset.h" +#include "util.h" + +/* + *----------------------------------------------------------------------------- + * + * CodeSet_GetUtf8 -- + * + * Parse the next UTF-8 sequence. + * + * Results: + * 0 on failure. + * Length of sequence and Unicode character in *uchar on success. + * + * Side effects: + * None. + * + *----------------------------------------------------------------------------- + */ + +int +CodeSet_GetUtf8(const char *string, // IN: string + const char *end, // IN: end of string + uint32 *uchar) // OUT: the Unicode character +{ + uint8 *p = (uint8 *) string; + uint8 *e; + uint32 c; + int len; + ASSERT(string < end); + + c = *p; + + if (c < 0x80) { + // ASCII: U+0000 - U+007F: 1 byte of UTF-8. + len = 1; + goto out; + } + + if ((c < 0xc2) || (c > 0xf4)) { + // 0x80 to 0xbf are not valid first bytes + // 0xc0 and 0xc1 cannot appear in UTF-8, see below + // leading char can not be > 0xf4, illegal as well + return 0; + } + + if (c < 0xe0) { + // U+0080 - U+07FF: 2 bytes of UTF-8. + c -= 0xc0; + len = 2; + } else if (c < 0xf0) { + // U+0800 - U+FFFF: 3 bytes of UTF-8. + c -= 0xe0; + len = 3; + } else { + // U+10000 - U+10FFFF: 4 bytes of UTF-8. + c -= 0xf0; + len = 4; + } + + if ((e = p + len) > (uint8 *) end) { + // input too short + return 0; + } + + while (++p < e) { + if ((*p & 0xc0) != 0x80) { + // bad trailing byte + return 0; + } + c <<= 6; + c += *p - 0x80; + } + + /* + * Enforce shortest encoding. + * UTF-8 mandates that shortest possible encoding is used, + * as otherwise doing UTF-8 => anything => UTF-8 could bypass some + * important tests, like '/' for path separator or \0 for string + * termination. 
+ * + * This test does not work for len == 2, but that case is handled + * by requiring the first byte to be 0xc2 or greater (see above). + */ + + if (c < 1U << (len * 5 - 4)) { + return 0; + } + +out: + if (uchar != NULL) { + *uchar = c; + } + + return len; +} + + +/* + *----------------------------------------------------------------------------- + * + * CodeSet_LengthInCodePoints -- + * + * Return the length of a UTF8 string in code points (the number of + * unicode characters present in the string, not the length of the + * string in bytes). + * + * Like strlen, the length returned does not include the terminating NUL. + * + * Results: + * -1 on error + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +CodeSet_LengthInCodePoints(const char *utf8) // IN: +{ + char *p; + char *end; + uint32 codePoints = 0; + + ASSERT(utf8); + + p = (char *) utf8; + end = p + strlen(utf8); + + while (p < end) { + uint32 utf32; + uint32 len = CodeSet_GetUtf8(p, end, &utf32); + + if (len == 0) { + return -1; + } + + p += len; + codePoints++; + } + + return codePoints; +} + + +/* + *----------------------------------------------------------------------------- + * + * CodeSet_UTF8ToUTF32 -- + * + * Convert a UTF8 string into a UTF32 string. The result is returned as a + * dynamically allocated string that the caller is responsible for. 
+ * + * Results: + * TRUE Input string was valid, converted string in *utf32 + * FALSE Input string was invalid or internal error + * + * Side effects: + * Allocates memory + * + *----------------------------------------------------------------------------- + */ + +Bool +CodeSet_UTF8ToUTF32(const char *utf8, // IN: + char **utf32) // OUT: +{ + char *p; + char *end; + uint32 *ptr; + int codePoints; + + ASSERT(utf32); + + if (utf8 == NULL) { // NULL is not an error + *utf32 = NULL; + + return TRUE; + } + + codePoints = CodeSet_LengthInCodePoints(utf8); + if (codePoints == -1) { + *utf32 = NULL; + + return FALSE; + } + + p = (char *) utf8; + end = p + strlen(utf8); + + ptr = Util_SafeMalloc(sizeof(*ptr) * (codePoints + 1)); + *utf32 = (char *) ptr; + + while (p < end) { + p += CodeSet_GetUtf8(p, end, ptr++); + } + + *ptr = 0; + + return TRUE; +} + + +/* + *----------------------------------------------------------------------------- + * + * CodeSet_UTF32ToUTF8 -- + * + * Convert a UTF32 string into a UTF8 string. The result is returned as a + * dynamically allocated string that the caller is responsible for. + * + * Results: + * TRUE Input string was valid, converted string in *utf8 + * FALSE Input string was invalid or internal error + * + * Side effects: + * Allocates memory + * + *----------------------------------------------------------------------------- + */ + + +Bool +CodeSet_UTF32ToUTF8(const char *utf32, // IN: + char **utf8) // OUT: +{ + uint32 i; + uint8 *p; + uint8 *q; + uint32 len; + union { + uint32 word; + uint8 bytes[4]; + } value; + + ASSERT(utf8); + + if (utf32 == NULL) { // NULL is not an error + *utf8 = NULL; + + return TRUE; + } + + /* + * Determine the length of the UTF32 string. A UTF32 string terminates + * with four (4) bytes of zero (0). 
+ */ + + len = 0; + p = (uint8 *) utf32; + + while (TRUE) { + value.bytes[0] = *p++; + value.bytes[1] = *p++; + value.bytes[2] = *p++; + value.bytes[3] = *p++; + + if (value.word == 0) { + break; + } + + len++; + } + + /* + * Now that we know the length, allocate the memory for the UTF8 string. + * The UTF8 string length calculation ensures that there will always be + * sufficient space to represent the UTF32 string. Most of the time this + * will involve allocating too much memory; however, the memory wastage + * will be very short-lived and very small. + */ + + *utf8 = Util_SafeMalloc((4 * len) + 1); // cover the NUL byte + + /* + * Process the UTF32 string, converting each code point into its + * UTF8 equivalent. + */ + + p = (uint8 *) utf32; + q = (uint8 *) *utf8; + + for (i = 0; i < len; i++) { + value.bytes[0] = *p++; + value.bytes[1] = *p++; + value.bytes[2] = *p++; + value.bytes[3] = *p++; + + if (value.word < 0x80) { // One byte case (ASCII) + *q++ = value.word; + } else if (value.word < 0x800) { // Two byte case + *q++ = 0xC0 | (value.word >> 6); + *q++ = 0x80 | (value.word & 0x3F); + } else if (value.word < 0x10000) { // Three byte case + *q++ = 0xE0 | (value.word >> 12); + *q++ = 0x80 | ((value.word >> 6) & 0x3F); + *q++ = 0x80 | (value.word & 0x3F); + } else if (value.word < 0x110000) { // Four byte case + *q++ = 0xF0 | (value.word >> 18); + *q++ = 0x80 | ((value.word >> 12) & 0x3F); + *q++ = 0x80 | ((value.word >> 6) & 0x3F); + *q++ = 0x80 | (value.word & 0x3F); + } else { // INVALID VALUE! + free(*utf8); + *utf8 = NULL; + + return FALSE; + } + } + + *q = '\0'; + + return TRUE; +} +