Issue 551: Fix the best-effort UTF8 conversion

author Tim Kientzle <kientzle@acm.org>

Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)

committer Tim Kientzle <kientzle@acm.org>

Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
author Tim Kientzle <kientzle@acm.org>
Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
committer Tim Kientzle <kientzle@acm.org>
Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c

index f6d1b893b8cd01e9a2604a20771869359ce961c0..dcffe6a0a17e8bfb743bb622f3ff5a8e0e993e84 100644 (file)
--- a/libarchive/archive_string.c
+++ b/libarchive/archive_string.c
@@ -131,12 +131,7 @@ struct archive_string_conv {
  #define UNICODE_MAX            0x10FFFF
  #define UNICODE_R_CHAR         0xFFFD  /* Replacement character. */
  /* Set U+FFFD(Replacement character) in UTF-8. */
-#define UTF8_SET_R_CHAR(outp) do {             \
-                       (outp)[0] = 0xef;       \
-                       (outp)[1] = 0xbf;       \
-                       (outp)[2] = 0xbd;       \
-} while (0)
-#define UTF8_R_CHAR_SIZE       3
+const static char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
  
  static struct archive_string_conv *find_sconv_object(struct archive *,
         const char *, const char *);
@@ -2041,7 +2036,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
                         if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
                                 size_t rbytes;
                                 if (sc->flag & SCONV_TO_UTF8)
-                                       rbytes = UTF8_R_CHAR_SIZE;
+                                       rbytes = sizeof(utf8_replacement_char);
                                 else
                                         rbytes = 2;
  
@@ -2057,7 +2052,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
                                             - as->length - to_size;
                                 }
                                 if (sc->flag & SCONV_TO_UTF8)
-                                       UTF8_SET_R_CHAR(outp);
+                                       memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
                                 else if (sc->flag & SCONV_TO_UTF16BE)
                                         archive_be16enc(outp, UNICODE_R_CHAR);
                                 else
@@ -2206,9 +2201,7 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
      size_t length, struct archive_string_conv *sc)
  {
         size_t remaining;
-       char *otp;
         const uint8_t *itp;
-       size_t avail;
         int return_value = 0; /* success */
  
         /*
@@ -2227,46 +2220,25 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
          * byte sequence 0xEF 0xBF 0xBD, which encodes code point U+FFFD,
          * the Replacement Character in Unicode.
          */
-       if (archive_string_ensure(as, as->length + length + 1) == NULL)
-               return (-1);
  
         remaining = length;
         itp = (const uint8_t *)_p;
-       otp = as->s + as->length;
-       avail = as->buffer_length - as->length -1;
         while (*itp && remaining > 0) {
-               if (*itp > 127 && (sc->flag & SCONV_TO_UTF8)) {
-                       if (avail < UTF8_R_CHAR_SIZE) {
-                               as->length = otp - as->s;
-                               if (NULL == archive_string_ensure(as,
-                                   as->buffer_length + remaining +
-                                   UTF8_R_CHAR_SIZE))
-                                       return (-1);
-                               otp = as->s + as->length;
-                               avail = as->buffer_length - as->length -1;
+               if (*itp > 127) {
+                       // Non-ASCII: Substitute with suitable replacement
+                       if (sc->flag & SCONV_TO_UTF8) {
+                               if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
+                                       __archive_errx(1, "Out of memory");
+                               }
+                       } else {
+                               archive_strappend_char(as, '?');
                         }
-                       /*
-                        * When coping a string in UTF-8, unknown character
-                        * should be U+FFFD (replacement character).
-                        */
-                       UTF8_SET_R_CHAR(otp);
-                       otp += UTF8_R_CHAR_SIZE;
-                       avail -= UTF8_R_CHAR_SIZE;
-                       itp++;
-                       remaining--;
-                       return_value = -1;
-               } else if (*itp > 127) {
-                       *otp++ = '?';
-                       itp++;
-                       remaining--;
                         return_value = -1;
                 } else {
-                       *otp++ = (char)*itp++;
-                       remaining--;
+                       archive_strappend_char(as, *itp);
                 }
+               ++itp;
         }
-       as->length = otp - as->s;
-       as->s[as->length] = '\0';
         return (return_value);
  }
  
@@ -2492,6 +2464,9 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
  {
         char *_p = p;
  
+       /* Invalid Unicode char maps to Replacement character */
+       if (uc > UNICODE_MAX)
+               uc = UNICODE_R_CHAR;
         /* Translate code point to UTF8 */
         if (uc <= 0x7f) {
                 if (remaining == 0)
@@ -2508,22 +2483,13 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
                 *p++ = 0xe0 | ((uc >> 12) & 0x0f);
                 *p++ = 0x80 | ((uc >> 6) & 0x3f);
                 *p++ = 0x80 | (uc & 0x3f);
-       } else if (uc <= UNICODE_MAX) {
+       } else {
                 if (remaining < 4)
                         return (0);
                 *p++ = 0xf0 | ((uc >> 18) & 0x07);
                 *p++ = 0x80 | ((uc >> 12) & 0x3f);
                 *p++ = 0x80 | ((uc >> 6) & 0x3f);
                 *p++ = 0x80 | (uc & 0x3f);
-       } else {
-               /*
-                * Undescribed code point should be U+FFFD
-                * (replacement character).
-                */
-               if (remaining < UTF8_R_CHAR_SIZE)
-                       return (0);
-               UTF8_SET_R_CHAR(p);
-               p += UTF8_R_CHAR_SIZE;
         }
         return (p - _p);
  }
author	Tim Kientzle <kientzle@acm.org>
	Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)
committer	Tim Kientzle <kientzle@acm.org>
	Sun, 9 Aug 2015 03:52:19 +0000 (20:52 -0700)