From: Bruno Haible Date: Sat, 21 Jun 2025 00:53:23 +0000 (+0200) Subject: xgettext: Shell: Recognize \u and \U escape sequences in $'...' strings. X-Git-Tag: v0.26~95 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=81c37f4901fc6b69eadabe2ab622629db2941010;p=thirdparty%2Fgettext.git xgettext: Shell: Recognize \u and \U escape sequences in $'...' strings. * gettext-tools/src/xg-mixed-string.h (mixed_string_remove_prefix, mixed_string_buffer_equals, mixed_string_buffer_startswith, mixed_string_buffer_cloned_result): New declarations. * gettext-tools/src/xg-mixed-string.c (mixed_string_remove_prefix): New function. (mixed_string_buffer_is_empty): Also test the absence of utf16_surr. (mixed_string_buffer_equals, mixed_string_buffer_startswith, mixed_string_buffer_cloned_result): New functions. * gettext-tools/src/x-sh.c (struct token): Remove type. (init_token, free_token, grow_token, string_of_token): Remove functions. (struct word): Change type of 'token' field. (free_word): Update. (string_of_word, substring_of_word): Remove functions. (read_word): Call mixed_string_buffer_init instead of init_token, mixed_string_buffer_destroy instead of free_token, mixed_string_buffer_append_char instead of grow_token. Update accesses to wp->token. Recognize \u and \U escape sequences in $'...' strings. (read_command): Call mixed_string_buffer_cloned_result instead of string_of_word. Update accesses to inner.token. * gettext-tools/tests/xgettext-sh-7: Use --from-code option. * gettext-tools/tests/xgettext-sh-9: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it. * NEWS: Mention this and the previous change. --- diff --git a/NEWS b/NEWS index 7e24a48a5..3edfb4983 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,9 @@ Version 0.26 - July 2025 are no longer flagged as format strings by default, unless they occur in a context that requires a format string. You can override this heuristic by using a comment of the form /* xgettext: c-format */. 
+ * Shell: + - xgettext now recognizes the \c, \u, and \U escape sequences in dollar- + single-quoted strings $'...'. # Bug fixes: - The AM_GNU_GETTEXT macro now rejects the dysfunctional gettext() function diff --git a/gettext-tools/src/x-sh.c b/gettext-tools/src/x-sh.c index c826dcf9b..84a4ec0a7 100644 --- a/gettext-tools/src/x-sh.c +++ b/gettext-tools/src/x-sh.c @@ -237,59 +237,6 @@ phase1_ungetc (int c) } -/* ========================== Reading of tokens. ========================== */ - - -/* A token consists of a sequence of characters. */ -struct token -{ - int allocated; /* number of allocated 'token_char's */ - int charcount; /* number of used 'token_char's */ - char *chars; /* the token's constituents */ -}; - -/* Initialize a 'struct token'. */ -static inline void -init_token (struct token *tp) -{ - tp->allocated = 10; - tp->chars = XNMALLOC (tp->allocated, char); - tp->charcount = 0; -} - -/* Free the memory pointed to by a 'struct token'. */ -static inline void -free_token (struct token *tp) -{ - free (tp->chars); -} - -/* Ensure there is enough room in the token for one more character. */ -static inline void -grow_token (struct token *tp) -{ - if (tp->charcount == tp->allocated) - { - tp->allocated *= 2; - tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); - } -} - -/* Convert a struct token * to a char*. */ -static char * -string_of_token (const struct token *tp) -{ - char *str; - int n; - - n = tp->charcount; - str = XNMALLOC (n + 1, char); - memcpy (str, tp->chars, n); - str[n] = '\0'; - return str; -} - - /* ========================= Accumulating messages ========================= */ @@ -450,8 +397,8 @@ enum word_type struct word { enum word_type type; - struct token *token; /* for t_string */ - int line_number_at_start; /* for t_string */ + struct mixed_string_buffer *token; /* for t_string */ + int line_number_at_start; /* for t_string */ }; /* Free the memory pointed to by a 'struct word'. 
*/ @@ -460,45 +407,11 @@ free_word (struct word *wp) { if (wp->type == t_string) { - free_token (wp->token); + mixed_string_buffer_destroy (wp->token); free (wp->token); } } -/* Convert a t_string token to a char*. */ -static char * -string_of_word (const struct word *wp) -{ - char *str; - int n; - - if (!(wp->type == t_string)) - abort (); - n = wp->token->charcount; - str = XNMALLOC (n + 1, char); - memcpy (str, wp->token->chars, n); - str[n] = '\0'; - return str; -} - -/* Convert a t_string token to a char*, ignoring the first OFFSET bytes. */ -static char * -substring_of_word (const struct word *wp, size_t offset) -{ - char *str; - int n; - - if (!(wp->type == t_string)) - abort (); - n = wp->token->charcount; - if (!(offset <= n)) - abort (); - str = XNMALLOC (n - offset + 1, char); - memcpy (str, wp->token->chars + offset, n - offset); - str[n - offset] = '\0'; - return str; -} - /* Whitespace recognition. */ @@ -862,8 +775,9 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) } wp->type = t_string; - wp->token = XMALLOC (struct token); - init_token (wp->token); + wp->token = XMALLOC (struct mixed_string_buffer); + mixed_string_buffer_init (wp->token, lc_string, + logical_file_name, line_number); wp->line_number_at_start = line_number; /* True while all characters in the token seen so far are digits. 
*/ all_unquoted_digits = true; @@ -897,7 +811,7 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) phase2_ungetc (c2); wp->type = t_redirect; - free_token (wp->token); + mixed_string_buffer_destroy (wp->token); free (wp->token); last_non_comment_line = line_number; @@ -907,7 +821,9 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9'); - if (all_unquoted_name_characters && wp->token->charcount > 0 && c == '=') + if (all_unquoted_name_characters + && !mixed_string_buffer_is_empty (wp->token) + && c == '=') { wp->type = t_assignment; continue; @@ -916,7 +832,8 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) all_unquoted_name_characters = all_unquoted_name_characters && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' - || (wp->token->charcount > 0 && c >= '0' && c <= '9')); + || (!mixed_string_buffer_is_empty (wp->token) + && c >= '0' && c <= '9')); if (c == '$') { @@ -1145,14 +1062,56 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) c = n; } break; + + case 'u': case 'U': + { + unsigned char buf[8]; + int j; + unsigned int n; + + n = 0; + for (j = 0; j < (c == 'u' ? 
4 : 8); j++) + { + int c1 = phase1_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase1_ungetc (c1); + break; + } + + buf[j] = c1; + } + if (j > 0) + { + if (n < 0x110000) + { + if (wp->type == t_string) + mixed_string_buffer_append_unicode (wp->token, n); + goto done_escape_sequence; + } + if_error (IF_SEVERITY_WARNING, + logical_file_name, line_number, (size_t)(-1), false, + _("invalid Unicode character")); + while (--j >= 0) + phase1_ungetc (buf[j]); + } + phase1_ungetc (c); + c = '\\'; + break; + } } } if (wp->type == t_string) - { - grow_token (wp->token); - wp->token->chars[wp->token->charcount++] = - (unsigned char) c; - } + mixed_string_buffer_append_char (wp->token, + (unsigned char) c); + done_escape_sequence: ; } /* The result is a literal string. Don't change wp->type. */ continue; @@ -1161,13 +1120,14 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) { /* $"...": Bash builtin for internationalized string. 
*/ lex_pos_ty pos; - struct token string; + struct mixed_string_buffer string; saw_opening_singlequote (); open_singlequote_terminator = '"'; pos.file_name = logical_file_name; pos.line_number = line_number; - init_token (&string); + mixed_string_buffer_init (&string, lc_string, + logical_file_name, line_number); for (;;) { c = phase2_getc (); @@ -1178,13 +1138,13 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) saw_closing_singlequote (); break; } - grow_token (&string); - string.chars[string.charcount++] = (unsigned char) c; + mixed_string_buffer_append_char (&string, (unsigned char) c); } - remember_a_message (mlp, NULL, string_of_token (&string), - false, false, region, &pos, + remember_a_message (mlp, NULL, + mixed_string_contents_free1 ( + mixed_string_buffer_result (&string)), + true, false, region, &pos, NULL, savable_comment, false); - free_token (&string); if_error (IF_SEVERITY_WARNING, pos.file_name, pos.line_number, (size_t)(-1), false, @@ -1278,17 +1238,14 @@ read_word (struct word *wp, int looking_for, flag_region_ty *region) break; if (wp->type == t_string) - { - grow_token (wp->token); - wp->token->chars[wp->token->charcount++] = (unsigned char) c; - } + mixed_string_buffer_append_char (wp->token, (unsigned char) c); } phase2_ungetc (c); if (wp->type != t_string) { - free_token (wp->token); + mixed_string_buffer_destroy (wp->token); free (wp->token); } last_non_comment_line = line_number; @@ -1354,8 +1311,10 @@ read_command (int looking_for, flag_region_ty *outer_region) pos.file_name = logical_file_name; pos.line_number = inner.line_number_at_start; - remember_a_message (mlp, NULL, string_of_word (&inner), false, - false, inner_region, &pos, + remember_a_message (mlp, NULL, + mixed_string_contents_free1 ( + mixed_string_buffer_cloned_result (inner.token)), + true, false, inner_region, &pos, NULL, savable_comment, false); } } @@ -1387,7 +1346,9 @@ read_command (int looking_for, flag_region_ty *outer_region) } else if 
(inner.type == t_string) { - char *function_name = string_of_word (&inner); + char *function_name = + mixed_string_contents_free1 ( + mixed_string_buffer_cloned_result (inner.token)); if (strcmp (function_name, "env") == 0) { @@ -1438,12 +1399,7 @@ read_command (int looking_for, flag_region_ty *outer_region) && memcmp (argparser->keyword, "ngettext", 8) == 0)); if (accepts_context && argparser->next_is_msgctxt) { - char *s = string_of_word (&inner); - mixed_string_ty *ms = - mixed_string_alloc_simple (s, lc_string, - logical_file_name, - inner.line_number_at_start); - free (s); + mixed_string_ty *ms = mixed_string_buffer_cloned_result (inner.token); argparser->next_is_msgctxt = false; arglist_parser_remember_msgctxt (argparser, ms, inner_region, @@ -1452,24 +1408,17 @@ read_command (int looking_for, flag_region_ty *outer_region) matters_for_argparser = false; } else if (accepts_context - && ((inner.token->charcount == 2 - && memcmp (inner.token->chars, "-c", 2) == 0) - || (inner.token->charcount == 9 - && memcmp (inner.token->chars, "--context", 9) == 0))) + && (mixed_string_buffer_equals (inner.token, "-c") + || mixed_string_buffer_equals (inner.token, "--context"))) { argparser->next_is_msgctxt = true; matters_for_argparser = false; } else if (accepts_context - && (inner.token->charcount >= 10 - && memcmp (inner.token->chars, "--context=", 10) == 0)) + && mixed_string_buffer_startswith (inner.token, "--context=")) { - char *s = substring_of_word (&inner, 10); - mixed_string_ty *ms = - mixed_string_alloc_simple (s, lc_string, - logical_file_name, - inner.line_number_at_start); - free (s); + mixed_string_ty *ms = mixed_string_buffer_cloned_result (inner.token); + mixed_string_remove_prefix (ms, 10); argparser->next_is_msgctxt = false; arglist_parser_remember_msgctxt (argparser, ms, inner_region, @@ -1478,20 +1427,19 @@ read_command (int looking_for, flag_region_ty *outer_region) matters_for_argparser = false; } else if (accepts_expand - && inner.token->charcount == 
2 - && memcmp (inner.token->chars, "-e", 2) == 0) + && mixed_string_buffer_equals (inner.token, "-e")) { must_expand_arg_strings = true; matters_for_argparser = false; } else { - char *s = string_of_word (&inner); - mixed_string_ty *ms; + mixed_string_ty *ms = mixed_string_buffer_cloned_result (inner.token); /* When '-e' was specified, expand escape sequences in s. */ if (accepts_expand && must_expand_arg_strings) { + char *s = mixed_string_contents (ms); bool expands_backslash_c = (argparser->keyword_len == 7 && memcmp (argparser->keyword, "gettext", 7) == 0); @@ -1502,14 +1450,17 @@ read_command (int looking_for, flag_region_ty *outer_region) /* We can ignore the value of expands_backslash_c, because here we don't support the gettext '-s' option. */ if (expanded != s) - free (s); - s = expanded; + { + mixed_string_ty *expanded_ms = + mixed_string_alloc_utf8 (expanded, ms->lcontext, + ms->logical_file_name, + ms->line_number); + mixed_string_free (ms); + ms = expanded_ms; + } + free (s); } - ms = mixed_string_alloc_simple (s, lc_string, - logical_file_name, - inner.line_number_at_start); - free (s); arglist_parser_remember (argparser, arg, ms, inner_region, logical_file_name, diff --git a/gettext-tools/src/xg-mixed-string.c b/gettext-tools/src/xg-mixed-string.c index a2ce938e1..7a53d11f6 100644 --- a/gettext-tools/src/xg-mixed-string.c +++ b/gettext-tools/src/xg-mixed-string.c @@ -1,6 +1,6 @@ /* Handling strings that are given partially in the source encoding and partially in Unicode. - Copyright (C) 2001-2024 Free Software Foundation, Inc. + Copyright (C) 2001-2025 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -394,6 +394,25 @@ mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2) } } +void +mixed_string_remove_prefix (mixed_string_ty *ms, size_t prefix_length) +{ + if (prefix_length > 0) + { + if (ms->nsegments == 0) + abort (); + struct mixed_string_segment *old_segment0 = ms->segments[0]; + if (!(old_segment0->length >= prefix_length)) + abort (); + struct mixed_string_segment *new_segment0 = + segment_alloc (old_segment0->type, + old_segment0->contents + prefix_length, + old_segment0->length - prefix_length); + free (old_segment0); + ms->segments[0] = new_segment0; + } +} + void mixed_string_buffer_init (struct mixed_string_buffer *bp, @@ -417,7 +436,31 @@ mixed_string_buffer_init (struct mixed_string_buffer *bp, bool mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp) { - return (bp->nsegments == 0 && bp->curr_buflen == 0); + return (bp->nsegments == 0 && bp->curr_buflen == 0 && bp->utf16_surr == 0); +} + +bool +mixed_string_buffer_equals (const struct mixed_string_buffer *bp, + const char *other) +{ + size_t other_len = strlen (other); + return (bp->nsegments == 0 + && bp->curr_buflen == other_len + && (other_len == 0 || memcmp (bp->curr_buffer, other, other_len) == 0) + && bp->utf16_surr == 0); +} + +bool +mixed_string_buffer_startswith (const struct mixed_string_buffer *bp, + const char *prefix) +{ + size_t prefix_len = strlen (prefix); + return prefix_len == 0 + || (bp->nsegments == 0 + ? 
bp->curr_buflen >= prefix_len + && memcmp (bp->curr_buffer, prefix, prefix_len) == 0 + : bp->segments[0]->length >= prefix_len + && memcmp (bp->segments[0]->contents, prefix, prefix_len) == 0); } /* Auxiliary function: Ensure count more bytes are available in @@ -642,3 +685,34 @@ mixed_string_buffer_result (struct mixed_string_buffer *bp) return ms; } } + +mixed_string_ty * +mixed_string_buffer_cloned_result (struct mixed_string_buffer *bp) +{ + mixed_string_buffer_flush_curr (bp); + + { + struct mixed_string *ms = XMALLOC (struct mixed_string); + size_t nsegments = bp->nsegments; + + if (nsegments > 0) + { + size_t i; + + ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); + for (i = 0; i < nsegments; i++) + ms->segments[i] = segment_clone (bp->segments[i]); + } + else + { + assert (bp->segments == NULL); + ms->segments = NULL; + } + ms->nsegments = nsegments; + ms->lcontext = bp->lcontext; + ms->logical_file_name = bp->logical_file_name; + ms->line_number = bp->line_number; + + return ms; + } +} diff --git a/gettext-tools/src/xg-mixed-string.h b/gettext-tools/src/xg-mixed-string.h index 8674c7545..a3474cd66 100644 --- a/gettext-tools/src/xg-mixed-string.h +++ b/gettext-tools/src/xg-mixed-string.h @@ -1,6 +1,6 @@ /* Handling strings that are given partially in the source encoding and partially in Unicode. - Copyright (C) 2001-2018 Free Software Foundation, Inc. + Copyright (C) 2001-2025 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -99,6 +99,10 @@ extern mixed_string_ty * mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2); +/* Removes a known prefix of prefix_length ASCII bytes from a mixed_string. 
*/ +extern void + mixed_string_remove_prefix (mixed_string_ty *ms, size_t prefix_length); + /* A string buffer type that allows appending bytes (in the xgettext_current_source_encoding) or Unicode characters. @@ -134,6 +138,18 @@ extern void extern bool mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp); +/* Determines whether the accumulated string is equal to a given ASCII + string. */ +extern bool + mixed_string_buffer_equals (const struct mixed_string_buffer *bp, + const char *other); + +/* Determines whether the accumulated string starts with a given ASCII + string. */ +extern bool + mixed_string_buffer_startswith (const struct mixed_string_buffer *bp, + const char *prefix); + /* Appends a character to a mixed_string_buffer. */ extern void mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c); @@ -153,6 +169,11 @@ extern void extern mixed_string_ty * mixed_string_buffer_result (struct mixed_string_buffer *bp); +/* Returns the accumulated string. + Does *not* free the memory pointed to by BP. 
*/ +extern mixed_string_ty * + mixed_string_buffer_cloned_result (struct mixed_string_buffer *bp); + #ifdef __cplusplus } diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index babb917d7..e274d9926 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -176,7 +176,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-scheme-format-1 xgettext-scheme-format-2 \ xgettext-scheme-stackovfl-1 xgettext-scheme-stackovfl-2 \ xgettext-sh-1 xgettext-sh-2 xgettext-sh-3 xgettext-sh-4 xgettext-sh-5 \ - xgettext-sh-6 xgettext-sh-7 xgettext-sh-8 \ + xgettext-sh-6 xgettext-sh-7 xgettext-sh-8 xgettext-sh-9 \ xgettext-sh-stackovfl-1 xgettext-sh-stackovfl-2 \ xgettext-sh-stackovfl-3 xgettext-sh-stackovfl-4 \ xgettext-sh-stackovfl-5 \ diff --git a/gettext-tools/tests/xgettext-sh-7 b/gettext-tools/tests/xgettext-sh-7 index 738803e9a..9edaa4417 100755 --- a/gettext-tools/tests/xgettext-sh-7 +++ b/gettext-tools/tests/xgettext-sh-7 @@ -22,7 +22,8 @@ f0oO_=bar gettext 'invocation with a mixed environment variable' EOF : ${XGETTEXT=xgettext} -${XGETTEXT} --omit-header --no-location -d xg-sh-7.tmp xg-sh-7.sh || Exit 1 +${XGETTEXT} --from-code=ISO-8859-1 --omit-header --no-location \ + -d xg-sh-7.tmp xg-sh-7.sh || Exit 1 LC_ALL=C tr -d '\r' < xg-sh-7.tmp.po > xg-sh-7.po || Exit 1 cat <<\EOF > xg-sh-7.ok diff --git a/gettext-tools/tests/xgettext-sh-9 b/gettext-tools/tests/xgettext-sh-9 new file mode 100755 index 000000000..b62f83803 --- /dev/null +++ b/gettext-tools/tests/xgettext-sh-9 @@ -0,0 +1,64 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test of Shell support: non-ASCII dollar-single-quote strings. 
+ +cat <<\EOF > xg-sh-9.sh +gettext $'depth_0_dollar_posix_\xc1mn\301op' +gettext $'depth_0_dollar_bash_\u20accd\U1f603kl' + +echo `gettext $'depth_1_dollar_posix_\xc1mn\301op'` +echo `gettext $'depth_1_dollar_bash_\u20accd\U1f603kl'` + +echo `echo \`gettext $'depth_2_dollar_posix_\xc1mn\301op'\`` +echo `echo \`gettext $'depth_2_dollar_bash_\u20accd\U1f603kl'\`` +EOF + +: ${XGETTEXT=xgettext} +${XGETTEXT} --from-code=ISO-8859-1 --no-location -d xg-sh-9.tmp xg-sh-9.sh \ + || Exit 1 +func_filter_POT_Creation_Date xg-sh-9.tmp.po xg-sh-9.po + +cat <<\EOF > xg-sh-9.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "depth_0_dollar_posix_ÁmnÁop" +msgstr "" + +msgid "depth_0_dollar_bash_€cd😃kl" +msgstr "" + +msgid "depth_1_dollar_posix_ÁmnÁop" +msgstr "" + +msgid "depth_1_dollar_bash_€cd😃kl" +msgstr "" + +msgid "depth_2_dollar_posix_ÁmnÁop" +msgstr "" + +msgid "depth_2_dollar_bash_€cd😃kl" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-sh-9.ok xg-sh-9.po +result=$? + +exit $result