From: Daiki Ueno Date: Fri, 2 May 2014 06:58:04 +0000 (+0900) Subject: xgettext: Factor out commonly used mixed_string_buffer X-Git-Tag: v0.19~65 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f597467a;p=thirdparty%2Fgettext.git xgettext: Factor out commonly used mixed_string_buffer * x-python.c (init_mixed_string_buffer) (mixed_string_buffer_append_byte) (mixed_string_buffer_append_unicode_grow) (mixed_string_buffer_append_unicode) (mixed_string_buffer_flush_utf16_surr) (mixed_string_buffer_flush_curr_buffer) (mixed_string_buffer_append, mixed_string_buffer_result) (free_mixed_string_buffer): Move to... * xgettext.c: ...here. (mixed_string_buffer_alloc): Rename from init_mixed_string_buffer. (mixed_string_buffer_append_to_curr_buffer): Rename from mixed_string_buffer_append_byte. (mixed_string_buffer_append_to_utf8_buffer): Rename from mixed_string_buffer_append_unicode. (mixed_string_buffer_grow_utf8_buffer): Rename from mixed_string_buffer_append_unicode_grow. (mixed_string_buffer_append_char): Split from mixed_string_buffer_append. (mixed_string_buffer_append_unicode): Split from mixed_string_buffer_append. (mixed_string_buffer_done): New function merging mixed_string_buffer_result and free_mixed_string_buffer. * xgettext.h (mixed_string_buffer): New struct moved from x-python.c; add logical_file_name and line_number fields. (mixed_string_buffer_alloc): New function declaration. (mixed_string_buffer_append_char): New function declaration. (mixed_string_buffer_append_unicode): New function declaration. (mixed_string_buffer_done): New function declaration. * x-javascript.c (init_mixed_string_buffer) (mixed_string_buffer_append_byte) (mixed_string_buffer_append_unicode_grow) (mixed_string_buffer_append_unicode) (mixed_string_buffer_flush_utf16_surr) (mixed_string_buffer_flush_curr_buffer) (mixed_string_buffer_append, mixed_string_buffer_result) (free_mixed_string_buffer): Remove. 
--- diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index c7c953fff..f4ead255a 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,43 @@ +2014-05-02 Daiki Ueno + + xgettext: Factor out commonly used mixed_string_buffer + * x-python.c (init_mixed_string_buffer) + (mixed_string_buffer_append_byte) + (mixed_string_buffer_append_unicode_grow) + (mixed_string_buffer_append_unicode) + (mixed_string_buffer_flush_utf16_surr) + (mixed_string_buffer_flush_curr_buffer) + (mixed_string_buffer_append, mixed_string_buffer_result) + (free_mixed_string_buffer): Move to... + * xgettext.c: ...here. + (mixed_string_buffer_alloc): Rename from init_mixed_string_buffer. + (mixed_string_buffer_append_to_curr_buffer): Rename from + mixed_string_buffer_append_byte. + (mixed_string_buffer_append_to_utf8_buffer): Rename from + mixed_string_buffer_append_unicode. + (mixed_string_buffer_grow_utf8_buffer): Rename from + mixed_string_buffer_append_unicode_grow. + (mixed_string_buffer_append_char): Split from + mixed_string_buffer_append. + (mixed_string_buffer_append_unicode): Split from + mixed_string_buffer_append. + (mixed_string_buffer_done): New function merging + mixed_string_buffer_result and free_mixed_string_buffer. + * xgettext.h (mixed_string_buffer): New struct moved from + x-python.c; add logical_file_name and line_number fields. + (mixed_string_buffer_alloc): New function declaration. + (mixed_string_buffer_append_char): New function declaration. + (mixed_string_buffer_append_unicode): New function declaration. + (mixed_string_buffer_done): New function declaration. + * x-javascript.c (init_mixed_string_buffer) + (mixed_string_buffer_append_byte) + (mixed_string_buffer_append_unicode_grow) + (mixed_string_buffer_append_unicode) + (mixed_string_buffer_flush_utf16_surr) + (mixed_string_buffer_flush_curr_buffer) + (mixed_string_buffer_append, mixed_string_buffer_result) + (free_mixed_string_buffer): Remove. 
+ 2014-04-30 Daiki Ueno scheme: Recognize GIMP script-fu extension _"abc" diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c index 59bbcbe2e..45f67e22d 100644 --- a/gettext-tools/src/x-javascript.c +++ b/gettext-tools/src/x-javascript.c @@ -714,203 +714,6 @@ phase3_ungetc (int c) IS_UNICODE. */ #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) -/* A string buffer type that allows appending bytes (in the - xgettext_current_source_encoding) or Unicode characters. - Returns the entire string in UTF-8 encoding. */ - -struct mixed_string_buffer -{ - /* The part of the string that has already been converted to UTF-8. */ - char *utf8_buffer; - size_t utf8_buflen; - size_t utf8_allocated; - /* The first half of an UTF-16 surrogate character. */ - unsigned short utf16_surr; - /* The part of the string that is still in the source encoding. */ - char *curr_buffer; - size_t curr_buflen; - size_t curr_allocated; - /* The lexical context. Used only for error message purposes. */ - lexical_context_ty lcontext; -}; - -/* Initialize a 'struct mixed_string_buffer' to empty. */ -static inline void -init_mixed_string_buffer (struct mixed_string_buffer *bp, lexical_context_ty lcontext) -{ - bp->utf8_buffer = NULL; - bp->utf8_buflen = 0; - bp->utf8_allocated = 0; - bp->utf16_surr = 0; - bp->curr_buffer = NULL; - bp->curr_buflen = 0; - bp->curr_allocated = 0; - bp->lcontext = lcontext; -} - -/* Auxiliary function: Append a byte to bp->curr. */ -static inline void -mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c) -{ - if (bp->curr_buflen == bp->curr_allocated) - { - bp->curr_allocated = 2 * bp->curr_allocated + 10; - bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); - } - bp->curr_buffer[bp->curr_buflen++] = c; -} - -/* Auxiliary function: Ensure count more bytes are available in bp->utf8. 
*/ -static inline void -mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count) -{ - if (bp->utf8_buflen + count > bp->utf8_allocated) - { - size_t new_allocated = 2 * bp->utf8_allocated + 10; - if (new_allocated < bp->utf8_buflen + count) - new_allocated = bp->utf8_buflen + count; - bp->utf8_allocated = new_allocated; - bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); - } -} - -/* Auxiliary function: Append a Unicode character to bp->utf8. - uc must be < 0x110000. */ -static inline void -mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc) -{ - unsigned char utf8buf[6]; - int count = u8_uctomb (utf8buf, uc, 6); - - if (count < 0) - /* The caller should have ensured that uc is not out-of-range. */ - abort (); - - mixed_string_buffer_append_unicode_grow (bp, count); - memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); - bp->utf8_buflen += count; -} - -/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ -static inline void -mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) -{ - if (bp->utf16_surr != 0) - { - /* A half surrogate is invalid, therefore use U+FFFD instead. */ - mixed_string_buffer_append_unicode (bp, 0xfffd); - bp->utf16_surr = 0; - } -} - -/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ -static inline void -mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno) -{ - if (bp->curr_buflen > 0) - { - char *curr; - size_t count; - - mixed_string_buffer_append_byte (bp, '\0'); - - /* Convert from the source encoding to UTF-8. */ - curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext, - logical_file_name, lineno); - - /* Append it to bp->utf8_buffer. 
*/ - count = strlen (curr); - mixed_string_buffer_append_unicode_grow (bp, count); - memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); - bp->utf8_buflen += count; - - if (curr != bp->curr_buffer) - free (curr); - bp->curr_buflen = 0; - } -} - -/* Append a character or Unicode character to a 'struct mixed_string_buffer'. */ -static void -mixed_string_buffer_append (struct mixed_string_buffer *bp, int c) -{ - if (IS_UNICODE (c)) - { - /* Append a Unicode character. */ - - /* Switch from multibyte character mode to Unicode character mode. */ - mixed_string_buffer_flush_curr_buffer (bp, line_number); - - /* Test whether this character and the previous one form a Unicode - surrogate character pair. */ - if (bp->utf16_surr != 0 - && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) - { - unsigned short utf16buf[2]; - ucs4_t uc; - - utf16buf[0] = bp->utf16_surr; - utf16buf[1] = UNICODE_VALUE (c); - if (u16_mbtouc (&uc, utf16buf, 2) != 2) - abort (); - - mixed_string_buffer_append_unicode (bp, uc); - bp->utf16_surr = 0; - } - else - { - mixed_string_buffer_flush_utf16_surr (bp); - - if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) - bp->utf16_surr = UNICODE_VALUE (c); - else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)) - { - /* A half surrogate is invalid, therefore use U+FFFD instead. */ - mixed_string_buffer_append_unicode (bp, 0xfffd); - } - else - mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c)); - } - } - else - { - /* Append a single byte. */ - - /* Switch from Unicode character mode to multibyte character mode. */ - mixed_string_buffer_flush_utf16_surr (bp); - - /* When a newline is seen, convert the accumulated multibyte sequence. - This ensures a correct line number in the error message in case of - a conversion error. The "- 1" is to account for the newline. 
*/ - if (c == '\n') - mixed_string_buffer_flush_curr_buffer (bp, line_number - 1); - - mixed_string_buffer_append_byte (bp, (unsigned char) c); - } -} - -/* Return the string buffer's contents. */ -static char * -mixed_string_buffer_result (struct mixed_string_buffer *bp) -{ - /* Flush all into bp->utf8_buffer. */ - mixed_string_buffer_flush_utf16_surr (bp); - mixed_string_buffer_flush_curr_buffer (bp, line_number); - /* NUL-terminate it. */ - mixed_string_buffer_append_unicode_grow (bp, 1); - bp->utf8_buffer[bp->utf8_buflen] = '\0'; - /* Return it. */ - return bp->utf8_buffer; -} - -/* Free the memory pointed to by a 'struct mixed_string_buffer'. */ -static inline void -free_mixed_string_buffer (struct mixed_string_buffer *bp) -{ - free (bp->utf8_buffer); - free (bp->curr_buffer); -} - /* ========================== Reading of tokens. ========================== */ @@ -1387,29 +1190,36 @@ phase5_get (token_ty *tp) /* Strings. */ { - struct mixed_string_buffer literal; + struct mixed_string_buffer *bp; int quote_char; case '"': case '\'': quote_char = c; lexical_context = lc_string; /* Start accumulating the string. 
*/ - init_mixed_string_buffer (&literal, lc_string); + bp = mixed_string_buffer_alloc (lexical_context, + logical_file_name, + line_number); for (;;) { int uc = phase7_getuc (quote_char); + bp->line_number = line_number; + if (uc == P7_EOF || uc == P7_STRING_END) break; if (IS_UNICODE (uc)) - assert (UNICODE_VALUE (uc) >= 0 - && UNICODE_VALUE (uc) < 0x110000); - - mixed_string_buffer_append (&literal, uc); + { + assert (UNICODE_VALUE (uc) >= 0 + && UNICODE_VALUE (uc) < 0x110000); + mixed_string_buffer_append_unicode (bp, + UNICODE_VALUE (uc)); + } + else + mixed_string_buffer_append_char (bp, uc); } - tp->string = xstrdup (mixed_string_buffer_result (&literal)); - free_mixed_string_buffer (&literal); + tp->string = mixed_string_buffer_done (bp); tp->comment = add_reference (savable_comment); lexical_context = lc_outside; tp->type = last_token_type = token_type_string; diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c index 4ebe0d714..da7468328 100644 --- a/gettext-tools/src/x-python.c +++ b/gettext-tools/src/x-python.c @@ -825,203 +825,6 @@ phase3_ungetc (int c) IS_UNICODE. */ #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) -/* A string buffer type that allows appending bytes (in the - xgettext_current_source_encoding) or Unicode characters. - Returns the entire string in UTF-8 encoding. */ - -struct mixed_string_buffer -{ - /* The part of the string that has already been converted to UTF-8. */ - char *utf8_buffer; - size_t utf8_buflen; - size_t utf8_allocated; - /* The first half of an UTF-16 surrogate character. */ - unsigned short utf16_surr; - /* The part of the string that is still in the source encoding. */ - char *curr_buffer; - size_t curr_buflen; - size_t curr_allocated; - /* The lexical context. Used only for error message purposes. */ - lexical_context_ty lcontext; -}; - -/* Initialize a 'struct mixed_string_buffer' to empty. 
*/ -static inline void -init_mixed_string_buffer (struct mixed_string_buffer *bp, lexical_context_ty lcontext) -{ - bp->utf8_buffer = NULL; - bp->utf8_buflen = 0; - bp->utf8_allocated = 0; - bp->utf16_surr = 0; - bp->curr_buffer = NULL; - bp->curr_buflen = 0; - bp->curr_allocated = 0; - bp->lcontext = lcontext; -} - -/* Auxiliary function: Append a byte to bp->curr. */ -static inline void -mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c) -{ - if (bp->curr_buflen == bp->curr_allocated) - { - bp->curr_allocated = 2 * bp->curr_allocated + 10; - bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); - } - bp->curr_buffer[bp->curr_buflen++] = c; -} - -/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ -static inline void -mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count) -{ - if (bp->utf8_buflen + count > bp->utf8_allocated) - { - size_t new_allocated = 2 * bp->utf8_allocated + 10; - if (new_allocated < bp->utf8_buflen + count) - new_allocated = bp->utf8_buflen + count; - bp->utf8_allocated = new_allocated; - bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); - } -} - -/* Auxiliary function: Append a Unicode character to bp->utf8. - uc must be < 0x110000. */ -static inline void -mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc) -{ - unsigned char utf8buf[6]; - int count = u8_uctomb (utf8buf, uc, 6); - - if (count < 0) - /* The caller should have ensured that uc is not out-of-range. */ - abort (); - - mixed_string_buffer_append_unicode_grow (bp, count); - memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); - bp->utf8_buflen += count; -} - -/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ -static inline void -mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) -{ - if (bp->utf16_surr != 0) - { - /* A half surrogate is invalid, therefore use U+FFFD instead. 
*/ - mixed_string_buffer_append_unicode (bp, 0xfffd); - bp->utf16_surr = 0; - } -} - -/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ -static inline void -mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno) -{ - if (bp->curr_buflen > 0) - { - char *curr; - size_t count; - - mixed_string_buffer_append_byte (bp, '\0'); - - /* Convert from the source encoding to UTF-8. */ - curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext, - logical_file_name, lineno); - - /* Append it to bp->utf8_buffer. */ - count = strlen (curr); - mixed_string_buffer_append_unicode_grow (bp, count); - memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); - bp->utf8_buflen += count; - - if (curr != bp->curr_buffer) - free (curr); - bp->curr_buflen = 0; - } -} - -/* Append a character or Unicode character to a 'struct mixed_string_buffer'. */ -static void -mixed_string_buffer_append (struct mixed_string_buffer *bp, int c) -{ - if (IS_UNICODE (c)) - { - /* Append a Unicode character. */ - - /* Switch from multibyte character mode to Unicode character mode. */ - mixed_string_buffer_flush_curr_buffer (bp, line_number); - - /* Test whether this character and the previous one form a Unicode - surrogate character pair. */ - if (bp->utf16_surr != 0 - && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) - { - unsigned short utf16buf[2]; - ucs4_t uc; - - utf16buf[0] = bp->utf16_surr; - utf16buf[1] = UNICODE_VALUE (c); - if (u16_mbtouc (&uc, utf16buf, 2) != 2) - abort (); - - mixed_string_buffer_append_unicode (bp, uc); - bp->utf16_surr = 0; - } - else - { - mixed_string_buffer_flush_utf16_surr (bp); - - if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) - bp->utf16_surr = UNICODE_VALUE (c); - else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)) - { - /* A half surrogate is invalid, therefore use U+FFFD instead. 
*/ - mixed_string_buffer_append_unicode (bp, 0xfffd); - } - else - mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c)); - } - } - else - { - /* Append a single byte. */ - - /* Switch from Unicode character mode to multibyte character mode. */ - mixed_string_buffer_flush_utf16_surr (bp); - - /* When a newline is seen, convert the accumulated multibyte sequence. - This ensures a correct line number in the error message in case of - a conversion error. The "- 1" is to account for the newline. */ - if (c == '\n') - mixed_string_buffer_flush_curr_buffer (bp, line_number - 1); - - mixed_string_buffer_append_byte (bp, (unsigned char) c); - } -} - -/* Return the string buffer's contents. */ -static char * -mixed_string_buffer_result (struct mixed_string_buffer *bp) -{ - /* Flush all into bp->utf8_buffer. */ - mixed_string_buffer_flush_utf16_surr (bp); - mixed_string_buffer_flush_curr_buffer (bp, line_number); - /* NUL-terminate it. */ - mixed_string_buffer_append_unicode_grow (bp, 1); - bp->utf8_buffer[bp->utf8_buflen] = '\0'; - /* Return it. */ - return bp->utf8_buffer; -} - -/* Free the memory pointed to by a 'struct mixed_string_buffer'. */ -static inline void -free_mixed_string_buffer (struct mixed_string_buffer *bp) -{ - free (bp->utf8_buffer); - free (bp->curr_buffer); -} - /* ========================== Reading of tokens. ========================== */ @@ -1526,7 +1329,7 @@ phase5_get (token_ty *tp) /* Strings. */ { - struct mixed_string_buffer literal; + struct mixed_string_buffer *bp; int quote_char; bool interpret_ansic; bool interpret_unicode; @@ -1598,23 +1401,30 @@ phase5_get (token_ty *tp) } backslash_counter = 0; /* Start accumulating the string. 
*/ - init_mixed_string_buffer (&literal, lc_string); + bp = mixed_string_buffer_alloc (lexical_context, + logical_file_name, + line_number); for (;;) { int uc = phase7_getuc (quote_char, triple, interpret_ansic, interpret_unicode, &backslash_counter); + bp->line_number = line_number; + if (uc == P7_EOF || uc == P7_STRING_END) break; if (IS_UNICODE (uc)) - assert (UNICODE_VALUE (uc) >= 0 - && UNICODE_VALUE (uc) < 0x110000); - - mixed_string_buffer_append (&literal, uc); + { + assert (UNICODE_VALUE (uc) >= 0 + && UNICODE_VALUE (uc) < 0x110000); + mixed_string_buffer_append_unicode (bp, + UNICODE_VALUE (uc)); + } + else + mixed_string_buffer_append_char (bp, uc); } - tp->string = xstrdup (mixed_string_buffer_result (&literal)); - free_mixed_string_buffer (&literal); + tp->string = mixed_string_buffer_done (bp); tp->comment = add_reference (savable_comment); lexical_context = lc_outside; tp->type = token_type_string; diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c index 42fdc316e..c1697c231 100644 --- a/gettext-tools/src/xgettext.c +++ b/gettext-tools/src/xgettext.c @@ -66,6 +66,7 @@ #include "color.h" #include "format.h" #include "propername.h" +#include "unistr.h" #include "gettext.h" /* A convenience macro. I don't like writing gettext() every time. */ @@ -3068,6 +3069,186 @@ arglist_parser_done (struct arglist_parser *ap, int argnum) } +struct mixed_string_buffer * +mixed_string_buffer_alloc (lexical_context_ty lcontext, + const char *logical_file_name, + int line_number) +{ + struct mixed_string_buffer *bp = XMALLOC (struct mixed_string_buffer); + bp->utf8_buffer = NULL; + bp->utf8_buflen = 0; + bp->utf8_allocated = 0; + bp->utf16_surr = 0; + bp->curr_buffer = NULL; + bp->curr_buflen = 0; + bp->curr_allocated = 0; + bp->lcontext = lcontext; + bp->logical_file_name = logical_file_name; + bp->line_number = line_number; + return bp; +} + +/* Auxiliary function: Append a byte to bp->curr. 
*/ +static inline void +mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp, + unsigned char c) +{ + if (bp->curr_buflen == bp->curr_allocated) + { + bp->curr_allocated = 2 * bp->curr_allocated + 10; + bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); + } + bp->curr_buffer[bp->curr_buflen++] = c; +} + +/* Auxiliary function: Ensure count more bytes are available in bp->utf8_buffer. */ +static inline void +mixed_string_buffer_grow_utf8_buffer (struct mixed_string_buffer *bp, + size_t count) +{ + if (bp->utf8_buflen + count > bp->utf8_allocated) + { + size_t new_allocated = 2 * bp->utf8_allocated + 10; + if (new_allocated < bp->utf8_buflen + count) + new_allocated = bp->utf8_buflen + count; + bp->utf8_allocated = new_allocated; + bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); + } +} + +/* Auxiliary function: Append the UTF-8 encoding of a Unicode character to bp->utf8_buffer. + uc must be < 0x110000; aborts if u8_uctomb rejects it. */ +static inline void +mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp, + ucs4_t uc) +{ + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, uc, 6); + + if (count < 0) + /* The caller should have ensured that uc is not out-of-range. */ + abort (); + + mixed_string_buffer_grow_utf8_buffer (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); + bp->utf8_buflen += count; +} + +/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ +static inline void +mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) +{ + if (bp->utf16_surr != 0) + { + /* A half surrogate is invalid, therefore use U+FFFD instead. */ + mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd); + bp->utf16_surr = 0; + } +} + +/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. 
*/ +static inline void +mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, + int line_number) +{ + if (bp->curr_buflen > 0) + { + char *curr; + size_t count; + + mixed_string_buffer_append_to_curr_buffer (bp, '\0'); + + /* Convert from the source encoding to UTF-8. */ + curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext, + bp->logical_file_name, + line_number); + + /* Append it to bp->utf8_buffer. */ + count = strlen (curr); + mixed_string_buffer_grow_utf8_buffer (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); + bp->utf8_buflen += count; + + if (curr != bp->curr_buffer) + free (curr); + bp->curr_buflen = 0; + } +} + +void +mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c) +{ + /* Switch from Unicode character mode to multibyte character mode. */ + mixed_string_buffer_flush_utf16_surr (bp); + + /* When a newline is seen, convert the accumulated multibyte sequence. + This ensures a correct line number in the error message in case of + a conversion error. The "- 1" is to account for the newline. */ + if (c == '\n') + mixed_string_buffer_flush_curr_buffer (bp, bp->line_number - 1); + + mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c); +} + +void +mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c) +{ + /* Switch from multibyte character mode to Unicode character mode. */ + mixed_string_buffer_flush_curr_buffer (bp, bp->line_number); + + /* Test whether this character and the previous one form a Unicode + surrogate character pair. 
*/ + if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000)) + { + unsigned short utf16buf[2]; + ucs4_t uc; + + utf16buf[0] = bp->utf16_surr; + utf16buf[1] = c; + if (u16_mbtouc (&uc, utf16buf, 2) != 2) + abort (); + + mixed_string_buffer_append_to_utf8_buffer (bp, uc); + bp->utf16_surr = 0; + } + else + { + mixed_string_buffer_flush_utf16_surr (bp); + + if (c >= 0xd800 && c < 0xdc00) + bp->utf16_surr = c; + else if (c >= 0xdc00 && c < 0xe000) + { + /* A half surrogate is invalid, therefore use U+FFFD instead. */ + mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd); + } + else + mixed_string_buffer_append_to_utf8_buffer (bp, c); + } +} + +char * +mixed_string_buffer_done (struct mixed_string_buffer *bp) +{ + char *utf8_buffer; + + /* Flush all into bp->utf8_buffer. */ + mixed_string_buffer_flush_utf16_surr (bp); + mixed_string_buffer_flush_curr_buffer (bp, bp->line_number); + /* NUL-terminate it. */ + mixed_string_buffer_grow_utf8_buffer (bp, 1); + bp->utf8_buffer[bp->utf8_buflen] = '\0'; + + /* Free curr_buffer and bp itself; utf8_buffer is kept and handed to the caller. */ + utf8_buffer = bp->utf8_buffer; + free (bp->curr_buffer); + free (bp); + + /* Return it. The caller now owns this heap buffer and must free() it. */ + return utf8_buffer; +} + + static message_ty * construct_header () { diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h index 2f8a08486..81e82667f 100644 --- a/gettext-tools/src/xgettext.h +++ b/gettext-tools/src/xgettext.h @@ -334,6 +334,47 @@ extern bool arglist_parser_decidedp (struct arglist_parser *ap, int argnum); extern void arglist_parser_done (struct arglist_parser *ap, int argnum); +/* A string buffer type that allows appending bytes (in the + xgettext_current_source_encoding) or Unicode characters. + Returns the entire string in UTF-8 encoding. */ + +struct mixed_string_buffer +{ + /* The part of the string that has already been converted to UTF-8. */ + char *utf8_buffer; + size_t utf8_buflen; + size_t utf8_allocated; + /* The first half of an UTF-16 surrogate character. 
*/ + unsigned short utf16_surr; + /* The part of the string that is still in the source encoding. */ + char *curr_buffer; + size_t curr_buflen; + size_t curr_allocated; + /* The lexical context (used only for error message purposes), and the file name and line number passed along with it to from_current_source_encoding. */ + lexical_context_ty lcontext; + const char *logical_file_name; + int line_number; +}; + +/* Creates a fresh, empty mixed_string_buffer. LOGICAL_FILE_NAME is stored by reference, not copied. */ +extern struct mixed_string_buffer * + mixed_string_buffer_alloc (lexical_context_ty lcontext, + const char *logical_file_name, + int line_number); + +/* Appends a character (a byte in the xgettext_current_source_encoding) to a mixed_string_buffer. */ +extern void mixed_string_buffer_append_char (struct mixed_string_buffer *bp, + int c); + +/* Appends a Unicode character to a mixed_string_buffer. C may also be one half of a UTF-16 surrogate pair; a high half followed by a low half is combined, a lone half becomes U+FFFD. */ +extern void mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, + int c); + +/* Frees mixed_string_buffer and returns the accumulated string as a + UTF-8 string. The result is heap-allocated and owned by the caller; BP must not be used afterwards. */ +extern char * mixed_string_buffer_done (struct mixed_string_buffer *bp); + + #ifdef __cplusplus } #endif