git.ipfire.org Git - thirdparty/libarchive.git/commitdiff
Issue 551: Fix the best-effort UTF8 conversion
author Tim Kientzle <kientzle@acm.org>
Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
committer Tim Kientzle <kientzle@acm.org>
Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
If a valid character set conversion is impossible, the code falls back
to a best-effort conversion that preserves ASCII bytes and converts
the rest to Unicode Replacement Characters (if the output is UTF8)
or '?' (otherwise).  This code did not correctly track the remaining
bytes in the output buffer; I've replaced this with slower and simpler
code that utilizes the safe string append functions.

libarchive/archive_string.c

index f6d1b893b8cd01e9a2604a20771869359ce961c0..dcffe6a0a17e8bfb743bb622f3ff5a8e0e993e84 100644 (file)
@@ -131,12 +131,7 @@ struct archive_string_conv {
 #define UNICODE_MAX            0x10FFFF
 #define UNICODE_R_CHAR         0xFFFD  /* Replacement character. */
 /* Set U+FFFD(Replacement character) in UTF-8. */
-#define UTF8_SET_R_CHAR(outp) do {             \
-                       (outp)[0] = 0xef;       \
-                       (outp)[1] = 0xbf;       \
-                       (outp)[2] = 0xbd;       \
-} while (0)
-#define UTF8_R_CHAR_SIZE       3
+const static char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
 
 static struct archive_string_conv *find_sconv_object(struct archive *,
        const char *, const char *);
@@ -2041,7 +2036,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
                        if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
                                size_t rbytes;
                                if (sc->flag & SCONV_TO_UTF8)
-                                       rbytes = UTF8_R_CHAR_SIZE;
+                                       rbytes = sizeof(utf8_replacement_char);
                                else
                                        rbytes = 2;
 
@@ -2057,7 +2052,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
                                            - as->length - to_size;
                                }
                                if (sc->flag & SCONV_TO_UTF8)
-                                       UTF8_SET_R_CHAR(outp);
+                                       memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
                                else if (sc->flag & SCONV_TO_UTF16BE)
                                        archive_be16enc(outp, UNICODE_R_CHAR);
                                else
@@ -2206,9 +2201,7 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
     size_t length, struct archive_string_conv *sc)
 {
        size_t remaining;
-       char *otp;
        const uint8_t *itp;
-       size_t avail;
        int return_value = 0; /* success */
 
        /*
@@ -2227,46 +2220,25 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
         * byte sequence 0xEF 0xBF 0xBD, which encodes code point U+FFFD,
         * a Replacement Character in Unicode.
         */
-       if (archive_string_ensure(as, as->length + length + 1) == NULL)
-               return (-1);
 
        remaining = length;
        itp = (const uint8_t *)_p;
-       otp = as->s + as->length;
-       avail = as->buffer_length - as->length -1;
        while (*itp && remaining > 0) {
-               if (*itp > 127 && (sc->flag & SCONV_TO_UTF8)) {
-                       if (avail < UTF8_R_CHAR_SIZE) {
-                               as->length = otp - as->s;
-                               if (NULL == archive_string_ensure(as,
-                                   as->buffer_length + remaining +
-                                   UTF8_R_CHAR_SIZE))
-                                       return (-1);
-                               otp = as->s + as->length;
-                               avail = as->buffer_length - as->length -1;
+               if (*itp > 127) {
+                       // Non-ASCII: Substitute with suitable replacement
+                       if (sc->flag & SCONV_TO_UTF8) {
+                               if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
+                                       __archive_errx(1, "Out of memory");
+                               }
+                       } else {
+                               archive_strappend_char(as, '?');
                        }
-                       /*
-                        * When coping a string in UTF-8, unknown character
-                        * should be U+FFFD (replacement character).
-                        */
-                       UTF8_SET_R_CHAR(otp);
-                       otp += UTF8_R_CHAR_SIZE;
-                       avail -= UTF8_R_CHAR_SIZE;
-                       itp++;
-                       remaining--;
-                       return_value = -1;
-               } else if (*itp > 127) {
-                       *otp++ = '?';
-                       itp++;
-                       remaining--;
                        return_value = -1;
                } else {
-                       *otp++ = (char)*itp++;
-                       remaining--;
+                       archive_strappend_char(as, *itp);
                }
+               ++itp;
        }
-       as->length = otp - as->s;
-       as->s[as->length] = '\0';
        return (return_value);
 }
 
@@ -2492,6 +2464,9 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
 {
        char *_p = p;
 
+       /* Invalid Unicode char maps to Replacement character */
+       if (uc > UNICODE_MAX)
+               uc = UNICODE_R_CHAR;
        /* Translate code point to UTF8 */
        if (uc <= 0x7f) {
                if (remaining == 0)
@@ -2508,22 +2483,13 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
                *p++ = 0xe0 | ((uc >> 12) & 0x0f);
                *p++ = 0x80 | ((uc >> 6) & 0x3f);
                *p++ = 0x80 | (uc & 0x3f);
-       } else if (uc <= UNICODE_MAX) {
+       } else {
                if (remaining < 4)
                        return (0);
                *p++ = 0xf0 | ((uc >> 18) & 0x07);
                *p++ = 0x80 | ((uc >> 12) & 0x3f);
                *p++ = 0x80 | ((uc >> 6) & 0x3f);
                *p++ = 0x80 | (uc & 0x3f);
-       } else {
-               /*
-                * Undescribed code point should be U+FFFD
-                * (replacement character).
-                */
-               if (remaining < UTF8_R_CHAR_SIZE)
-                       return (0);
-               UTF8_SET_R_CHAR(p);
-               p += UTF8_R_CHAR_SIZE;
        }
        return (p - _p);
 }