--- /dev/null
+/* xgettext sh backend.
+ Copyright (C) 2003 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "message.h"
+#include "x-sh.h"
+#include "xgettext.h"
+#include "error.h"
+#include "xmalloc.h"
+#include "exit.h"
+#include "hash.h"
+#include "gettext.h"
+
+#define _(s) gettext(s)
+
+/* The sh syntax is defined in POSIX:2001, see
+ http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
+ Summary of sh syntax:
+ - Input is broken into words, which are then subject to
+ - tilde expansion ~...
+ - command substitution `...`
+ - variable substitution $var
+ - arithmetic substitution $((...))
+ - field splitting at whitespace (IFS)
+ - wildcard pattern expansion *?
+ - quote removal
+ - Strings are enclosed in "..."; command substitution, variable
+ substitution and arithmetic substitution are performed here as well.
+ - '...' is a string without substitutions.
+ - The list of resulting words is split into commands by semicolon and
+ newline.
+ - '#' at the beginning of a word introduces a comment until end of line.
+ The parser is implemented in bash-2.05b/parse.y. */
+
+
+/* ====================== Keyword set customization. ====================== */
+
+/* If true, extract every constant string word, not only the arguments of
+ known keywords. Set by x_sh_extract_all (). */
+static bool extract_all = false;
+
+static hash_table keywords; /* keyword name -> argnum1 + (argnum2 << 10), filled by x_sh_keyword () */
+static bool default_keywords = true; /* true until the built-in keywords have been installed or disabled */
+
+
+/* Request that every constant string word of the script be extracted,
+   regardless of the keyword table.  */
+void
+x_sh_extract_all (void)
+{
+  extract_all = true;
+}
+
+
+/* Register one keyword specification.
+   NAME is either NULL, which disables the built-in default keywords, or a
+   specification that split_keywordspec () splits into a name and two
+   argument numbers.  A specification with a colon inside its name part is
+   silently ignored.  */
+void
+x_sh_keyword (const char *name)
+{
+  const char *name_end;
+  const char *first_colon;
+  int singular_pos;
+  int plural_pos;
+
+  if (name == NULL)
+    {
+      default_keywords = false;
+      return;
+    }
+
+  /* Create the table lazily, on the first registration.  */
+  if (keywords.table == NULL)
+    init_hash (&keywords, 100);
+
+  split_keywordspec (name, &name_end, &singular_pos, &plural_pos);
+
+  /* A colon inside [name, name_end) means an invalid parse in
+     split_keywordspec (); such a specification is dropped.  */
+  first_colon = strchr (name, ':');
+  if (first_colon != NULL && first_colon < name_end)
+    return;
+
+  /* Without an explicit position, the first argument is the one taken.  */
+  if (singular_pos == 0)
+    singular_pos = 1;
+
+  /* Pack both positions into the pointer-sized hash value; read_command ()
+     unpacks them with the same 10-bit split.  */
+  insert_entry (&keywords, name, name_end - name,
+                (void *) (long) (singular_pos + (plural_pos << 10)));
+}
+
+/* Finish initializing the keywords hash table.
+   Called after argument processing, before each file is processed.
+   The built-in keywords are registered at most once: the flag that
+   requests them is cleared here after registration, and x_sh_keyword (NULL)
+   clears it too.  */
+static void
+init_keywords (void)
+{
+  static const char *const builtin_specs[] =
+    {
+      "gettext",
+      "ngettext:1,2",
+      "eval_gettext",
+      "eval_ngettext:1,2"
+    };
+  size_t i;
+
+  if (!default_keywords)
+    return;
+
+  for (i = 0; i < sizeof builtin_specs / sizeof builtin_specs[0]; i++)
+    x_sh_keyword (builtin_specs[i]);
+  default_keywords = false;
+}
+
+
+/* ======================== Reading of characters. ======================== */
+
+/* Real filename, used in error messages about the input file. */
+static const char *real_file_name;
+
+/* Logical filename and line number, used to label the extracted messages.
+ The line number starts at 1 for each file, grows with every newline read
+ and shrinks again when a newline is pushed back. */
+static char *logical_file_name;
+static int line_number;
+
+/* The input file stream; set by extract_sh () while one file is processed. */
+static FILE *fp;
+
+
+/* Fetch the next character from the input file.
+   Returns the character read, or EOF at end of input.  A read error is
+   fatal: it is reported through error () with EXIT_FAILURE.  Every newline
+   read increments line_number.  */
+static int
+do_getc (void)
+{
+  int ch = getc (fp);
+
+  if (ch == '\n')
+    line_number++;
+  else if (ch == EOF && ferror (fp))
+    error (EXIT_FAILURE, errno, _("\
+error while reading \"%s\""), real_file_name);
+
+  return ch;
+}
+
+/* Give C, the character last obtained from do_getc (), back to the input
+   stream.  C must not be EOF.  A newline given back is un-counted.  */
+static void
+do_ungetc (int c)
+{
+  line_number -= (c == '\n' ? 1 : 0);
+  ungetc (c, fp);
+}
+
+
+/* Remove backslash followed by newline from the input stream.
+ Cope with potentially 2 characters of pushback. */
+
+/* Maximum used guaranteed to be < 4. */
+static int phase1_pushback[4]; /* stack of characters given back by phase1_ungetc (); last in, first out */
+static int phase1_pushback_length; /* number of entries of phase1_pushback in use */
+
+/* Return the next character with backslash-newline sequences removed.
+   Characters given back with phase1_ungetc () are delivered first, most
+   recent first.  A backslash that is not followed by a newline is returned
+   as is, and the character after it is returned to the stream.  Returns
+   EOF at end of input.  */
+static int
+phase1_getc (void)
+{
+  int c;
+
+  if (phase1_pushback_length != 0)
+    {
+      c = phase1_pushback[--phase1_pushback_length];
+      if (c == '\n')
+        ++line_number;
+      return c;
+    }
+
+  while ((c = do_getc ()) == '\\')
+    {
+      int next = do_getc ();
+
+      if (next != '\n')
+        {
+          /* Not a line continuation: the backslash stays.  */
+          if (next != EOF)
+            do_ungetc (next);
+          return '\\';
+        }
+      /* Backslash-newline: drop both characters and read on.  */
+    }
+  return c;
+}
+
+/* Push C back so that the next phase1_getc () returns it.
+   Pushing back EOF is a no-op.  A pushed-back newline is un-counted here
+   and counted again when it is re-read.  Aborts if the pushback stack is
+   already full instead of writing past the end of the array.  */
+static void
+phase1_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  if (phase1_pushback_length
+      == (int) (sizeof phase1_pushback / sizeof phase1_pushback[0]))
+    abort ();
+
+  if (c == '\n')
+    --line_number;
+  phase1_pushback[phase1_pushback_length++] = c;
+}
+
+
+/* ========================== Reading of tokens. ========================== */
+
+
+/* A token consists of a sequence of characters. */
+struct token
+{
+ int allocated; /* number of chars allocated in 'chars' */
+ int charcount; /* number of chars of 'chars' in use */
+ char *chars; /* the token's constituents; not NUL-terminated */
+};
+
+/* Set up *TP as an empty token with room for 10 characters.  */
+static inline void
+init_token (struct token *tp)
+{
+  enum { initial_capacity = 10 };
+
+  tp->charcount = 0;
+  tp->allocated = initial_capacity;
+  tp->chars = (char *) xmalloc (initial_capacity * sizeof (char));
+}
+
+/* Free the memory pointed to by a 'struct token'.
+ Only the character buffer is released; the struct itself is left to the
+ caller. */
+static inline void
+free_token (struct token *tp)
+{
+ free (tp->chars);
+}
+
+/* Make sure *TP can take one more character, doubling its buffer when
+   every allocated slot is in use.  */
+static inline void
+grow_token (struct token *tp)
+{
+  if (tp->charcount != tp->allocated)
+    return;
+
+  tp->allocated *= 2;
+  tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
+}
+
+/* Return a freshly allocated, NUL-terminated copy of TP's characters.  */
+static char *
+string_of_token (const struct token *tp)
+{
+  int length = tp->charcount;
+  char *copy = (char *) xmalloc (length + 1);
+
+  memcpy (copy, tp->chars, length);
+  copy[length] = '\0';
+  return copy;
+}
+
+
+/* ========================= Accumulating messages ========================= */
+
+
+static message_list_ty *mlp; /* list receiving the extracted messages; set by extract_sh () */
+
+
+/* ========================= Accumulating comments ========================= */
+
+
+static char *buffer; /* text of the comment line being collected */
+static size_t bufmax; /* allocated size of buffer */
+static size_t buflen; /* number of bytes of buffer in use */
+
+/* Begin collecting a new comment line by emptying the buffer.  */
+static inline void
+comment_start (void)
+{
+  buflen = 0;
+}
+
+/* Append the character C to the comment line being collected, enlarging
+   the buffer first when it is full.  */
+static inline void
+comment_add (int c)
+{
+  if (bufmax <= buflen)
+    {
+      size_t new_size = 2 * bufmax + 10;
+
+      buffer = xrealloc (buffer, new_size);
+      bufmax = new_size;
+    }
+  buffer[buflen] = c;
+  buflen++;
+}
+
+/* Finish the comment line being collected: strip trailing blanks and tabs,
+   NUL-terminate it and pass it to xgettext_comment_add ().  */
+static inline void
+comment_line_end (void)
+{
+  while (buflen >= 1
+         && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
+    --buflen;
+
+  /* Make room for the terminating NUL.  */
+  if (buflen >= bufmax)
+    {
+      bufmax = 2 * bufmax + 10;
+      buffer = xrealloc (buffer, bufmax);
+    }
+  buffer[buflen] = '\0';
+  xgettext_comment_add (buffer);
+}
+
+
+/* These are for tracking whether comments count as immediately before
+ keyword. */
+static int last_comment_line; /* line number at the most recent '#' comment; -1 before any */
+static int last_non_comment_line; /* line number recorded after the most recent non-comment word; -1 before any */
+
+
+/* ========================= Debackslashification ========================== */
+
+/* This state tracks the effect of backquotes, double-quotes and single-quotes
+ on the parsing of backslashes. We make a single pass through the input
+ file, keeping the state up to date. This is much faster than accumulating
+ strings and processing them with explicit debackslashification, like the
+ shell does it.
+ The state is changed by the saw_* functions and reset by extract_sh (). */
+
+/* The number of nested `...` or "`...`" constructs. Assumed to be <= 32. */
+static unsigned int nested_backquotes;
+
+/* A bit mask indicating which of the currently open `...` or "`...`"
+ constructs is with double-quotes: "`...`".
+ A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
+ Bit position 0 designates the outermost backquotes nesting,
+ bit position 1 the second-outermost backquotes nesting,
+ ...
+ bit position (nested_backquotes-1) the innermost backquotes nesting. */
+static unsigned int open_doublequotes_mask;
+
+/* A bit indicating whether a double-quote is currently open inside the
+ innermost backquotes nesting. While a further backquote nesting is open,
+ the value is kept in open_doublequotes_mask. */
+static bool open_doublequote;
+
+/* A bit indicating whether a single-quote is currently open inside the
+ innermost backquotes nesting. */
+static bool open_singlequote;
+
+
+/* Functions to update the state. */
+
+/* Record that an opening backquote was read.
+   The current double-quote state is saved in open_doublequotes_mask, at the
+   bit of the nesting level being opened, and the new level starts with no
+   double-quote open.  Aborts if a single-quote is open.  */
+static inline void
+saw_opening_backquote (void)
+{
+  if (open_singlequote)
+    abort ();
+  if (open_doublequote)
+    open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
+  nested_backquotes++;
+  open_doublequote = false;
+}
+
+/* Record that a closing backquote was read.
+   Leaves the innermost nesting level, restores the double-quote state that
+   was saved for it and clears that bit, and all higher ones, in
+   open_doublequotes_mask.  The statement order matters: the level is
+   decremented first so that it indexes the bit that was saved.  */
+static inline void
+saw_closing_backquote (void)
+{
+  nested_backquotes--;
+  open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
+  open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
+  open_singlequote = false; /* just for safety */
+}
+
+/* Record that an opening double-quote was read.
+   Aborts if a single-quote or a double-quote is already open.  */
+static inline void
+saw_opening_doublequote (void)
+{
+  if (open_singlequote || open_doublequote)
+    abort ();
+  open_doublequote = true;
+}
+
+/* Record that a closing double-quote was read.
+   Aborts if a single-quote is open or no double-quote is open.  */
+static inline void
+saw_closing_doublequote (void)
+{
+  if (open_singlequote || !open_doublequote)
+    abort ();
+  open_doublequote = false;
+}
+
+/* Record that an opening single-quote was read.
+   Aborts if a double-quote or a single-quote is already open.  */
+static inline void
+saw_opening_singlequote (void)
+{
+  if (open_doublequote || open_singlequote)
+    abort ();
+  open_singlequote = true;
+}
+
+/* Record that a closing single-quote was read.
+   Aborts if a double-quote is open or no single-quote is open.  */
+static inline void
+saw_closing_singlequote (void)
+{
+  if (open_doublequote || !open_singlequote)
+    abort ();
+  open_singlequote = false;
+}
+
+
+/* ========================== Reading of commands ========================== */
+
+/* We are only interested in constant strings. Other words need not be
+ represented precisely. */
+enum word_type
+{
+ t_string, /* constant string */
+ t_other, /* other string */
+ t_separator, /* command separator: semicolon or newline */
+ t_redirect, /* redirection: one of < > >| << <<- >> <> <& >& */
+ t_backquote, /* closing '`' pseudo word */
+ t_paren, /* closing ')' pseudo word */
+ t_eof /* EOF marker */
+};
+
+struct word /* one word, as filled in by read_word () */
+{
+ enum word_type type;
+ struct token *token; /* for t_string: the heap-allocated contents */
+ int line_number_at_start; /* for t_string: line number when the word began */
+};
+
+/* Release the storage owned by *WP.  Only a t_string word owns a token;
+   for every other type nothing is done.  */
+static inline void
+free_word (struct word *wp)
+{
+  struct token *owned;
+
+  if (wp->type != t_string)
+    return;
+
+  owned = wp->token;
+  free_token (owned);
+  free (owned);
+}
+
+/* Convert a t_string word to a freshly allocated, NUL-terminated char*.
+   Aborts if WP is not of type t_string.  The copying is left to
+   string_of_token () instead of being repeated here.  */
+static char *
+string_of_word (const struct word *wp)
+{
+  if (wp->type != t_string)
+    abort ();
+  return string_of_token (wp->token);
+}
+
+
+/* Whitespace recognition: true for an unquoted space, tab or newline.  */
+
+static inline bool
+is_whitespace (int c)
+{
+  switch (c)
+    {
+    case ' ':
+    case '\t':
+    case '\n':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Operator character recognition: true for the unquoted characters
+   | & ; < > ( )  */
+
+static inline bool
+is_operator_start (int c)
+{
+  switch (c)
+    {
+    case '|':
+    case '&':
+    case ';':
+    case '<':
+    case '>':
+    case '(':
+    case ')':
+      return true;
+    default:
+      return false;
+    }
+}
+
+
+/* Denotation of a quoted character.
+ The distinction between quoted and unquoted character is important only for
+ the special, whitespace and operator characters; it is irrelevant for
+ alphanumeric characters, '\\' and many others.
+ Converting a QUOTED value to 'unsigned char' gives back the plain
+ character. */
+#define QUOTED(c) (UCHAR_MAX + 1 + (c))
+/* Values in the 'unsigned char' range are implicitly unquoted. Among these,
+ the following are important:
+ '"' opening or closing double quote
+ '\'' opening or closing single quote
+ '$' the unknown result of a dollar expansion
+ '`' does not occur - replaced with OPENING_BACKQUOTE or
+ CLOSING_BACKQUOTE
+ */
+#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
+#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
+
+/* Maximum used guaranteed to be < 4. */
+static int phase2_pushback[4]; /* stack of values given back by phase2_ungetc (); last in, first out */
+static int phase2_pushback_length; /* number of entries of phase2_pushback in use */
+
+/* Forward declaration of local functions. */
+static void phase2_ungetc (int c);
+
+/* Return the next character, with backslashes removed.
+   The result is QUOTED(c) for some unsigned char c, if the next character
+   is escaped sufficiently often to make it a regular constituent character,
+   or simply an 'unsigned char' if it has its special meaning (of special,
+   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
+   EOF.
+   Values given back with phase2_ungetc () are returned first and unchanged.
+   A character that was only read ahead here is given back with
+   phase1_ungetc (), so that it is classified again, under the quoting state
+   then in force, when it is finally consumed.
+   It's the caller's responsibility to update the state.  */
+static int
+phase2_getc (void)
+{
+  int c;
+
+  if (phase2_pushback_length)
+    {
+      c = phase2_pushback[--phase2_pushback_length];
+      if (c == '\n')
+        ++line_number;
+      return c;
+    }
+
+  c = phase1_getc ();
+  if (c == EOF)
+    return c;
+  if (c == '\'')
+    return (open_doublequote ? QUOTED (c) : c);
+  if (!open_singlequote)
+    {
+      if (c == '"' || c == '$')
+        return c;
+      if (c == '`')
+        return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
+    }
+  if (c == '\\')
+    {
+      /* Number of debackslashification passes that are active at the
+         current point.  */
+      unsigned int debackslashify =
+        nested_backquotes + (open_singlequote ? 0 : 1);
+      /* Normal number of backslashes that yield a single backslash in the
+         final output.  */
+      unsigned int expected_count =
+        (unsigned int) 1 << debackslashify;
+      /* Number of backslashes found.  */
+      unsigned int count;
+
+      for (count = 1; count < expected_count; count++)
+        {
+          c = phase1_getc ();
+          if (c != '\\')
+            break;
+        }
+      if (count == expected_count)
+        return '\\';
+
+      /* The count of backslashes is > 0 and < expected_count, therefore the
+         result depends on c, the first character after the backslashes.
+         Note: The formulas below don't necessarily have a logic; they were
+         empirically determined such that 1. the xgettext-30 test succeeds,
+         2. the behaviour for count == 0 would correspond to the one without
+         any backslash.  */
+      if (c == '\'')
+        {
+          if (!open_singlequote && count > (expected_count >> 1))
+            {
+              phase1_ungetc (c);
+              return '\\';
+            }
+          else
+            return (open_doublequote ? QUOTED (c) : c);
+        }
+      else if (c == '"')
+        {
+          /* Each debackslashification pass converts \\ to \ and \" to ";
+             passes corresponding to `...` drop a lone " whereas passes
+             corresponding to "`...`" leave it alone.  Therefore, the
+             minimum number of backslashes needed to get one double-quote
+             in the end is  open_doublequotes_mask + 1.  */
+          if (open_singlequote)
+            {
+              if (count > open_doublequotes_mask)
+                {
+                  /* Give the double-quote back to phase 1, like every other
+                     branch does.  Handing it to phase2_ungetc () would
+                     deliver it unquoted while a single-quote is open, and
+                     the caller would then abort in
+                     saw_opening_doublequote ().  */
+                  phase1_ungetc (c);
+                  return '\\';
+                }
+              else
+                return QUOTED (c);
+            }
+          else
+            {
+              if (count > open_doublequotes_mask)
+                return QUOTED (c);
+              else
+                /* Some of the count values <= open_doublequotes_mask are
+                   actually invalid here, but we assume a syntactically
+                   correct input file anyway.  */
+                return c;
+            }
+        }
+      else if (c == '`')
+        {
+          /* FIXME: This code looks fishy.  */
+          if (count == expected_count - 1)
+            return c;
+          else
+            /* Some of the count values < expected_count - 1 are
+               actually invalid here, but we assume a syntactically
+               correct input file anyway.  */
+            if (nested_backquotes > 0 && !open_singlequote
+                && count >= (expected_count >> 2))
+              return OPENING_BACKQUOTE;
+            else
+              return CLOSING_BACKQUOTE;
+        }
+      else if (c == '$')
+        {
+          if (open_singlequote)
+            return QUOTED (c);
+          if (count >= (expected_count >> 1))
+            return QUOTED (c);
+          else
+            return c;
+        }
+      else
+        {
+          /* When not followed by a quoting character or backslash or dollar,
+             a backslash survives a debackslashification pass unmodified.
+             Therefore each debackslashification pass performs a
+               count := (count + 1) >> 1
+             operation.  Therefore the minimum number of backslashes needed
+             to get one backslash in the end is  (expected_count >> 1) + 1.  */
+          if (open_doublequote || open_singlequote)
+            {
+              if (count > 0)
+                {
+                  phase1_ungetc (c);
+                  return '\\';
+                }
+              else
+                return QUOTED (c);
+            }
+          else
+            {
+              if (count > (expected_count >> 1))
+                {
+                  phase1_ungetc (c);
+                  return '\\';
+                }
+              else if (count > 0)
+                return QUOTED (c);
+              else
+                return c;
+            }
+        }
+    }
+
+  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
+}
+
+/* Push C back so that the next phase2_getc () returns it unchanged.
+   Pushing back EOF is a no-op.  A pushed-back newline is un-counted here
+   and counted again when it is re-read.  Aborts if the pushback stack is
+   already full instead of writing past the end of the array.  */
+static void
+phase2_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  if (phase2_pushback_length
+      == (int) (sizeof phase2_pushback / sizeof phase2_pushback[0]))
+    abort ();
+
+  if (c == '\n')
+    --line_number;
+  phase2_pushback[phase2_pushback_length++] = c;
+}
+
+
+/* Forward declaration of local functions. */
+static enum word_type read_command_list (int looking_for);
+
+
+
+/* Read the next word and describe it in *WP.
+ 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+ or '\0'.
+ On return wp->type tells what was read. Only for t_string are wp->token
+ (heap-allocated, to be released with free_word) and
+ wp->line_number_at_start filled in.
+ Comment lines met before the word are passed to xgettext_comment_add,
+ nested command substitutions are parsed recursively through
+ read_command_list, and a $"..." string is recorded as a message right
+ away. */
+static void
+read_word (struct word *wp, int looking_for)
+{
+ int c;
+ bool all_unquoted_digits; /* true as long as the word read so far consists of unquoted digits only */
+
+ do
+ {
+ c = phase2_getc ();
+ if (c == '#')
+ {
+ /* Skip a comment up to end of line. */
+ last_comment_line = line_number;
+ comment_start ();
+ for (;;)
+ {
+ c = phase1_getc ();
+ if (c == EOF || c == '\n')
+ break;
+ comment_add (c);
+ }
+ comment_line_end ();
+ }
+ if (c == '\n')
+ {
+ /* Comments assumed to be grouped with a message must immediately
+ precede it, with no non-whitespace token on a line between
+ both. */
+ if (last_non_comment_line > last_comment_line)
+ xgettext_comment_reset ();
+ wp->type = t_separator;
+ return;
+ }
+ }
+ while (is_whitespace (c));
+
+ if (c == EOF)
+ {
+ wp->type = t_eof;
+ return;
+ }
+
+ if (c == '<' || c == '>')
+ {
+ /* Recognize the redirection operators < > >| << <<- >> <> <& >& */
+ int c2 = phase2_getc ();
+ if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+ {
+ if (c == '<' && c2 == '<')
+ {
+ int c3 = phase2_getc ();
+ if (c3 != '-')
+ phase2_ungetc (c3);
+ }
+ }
+ else
+ phase2_ungetc (c2);
+ wp->type = t_redirect;
+ return;
+ }
+
+ if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
+ {
+ saw_closing_backquote ();
+ wp->type = t_backquote;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ if (looking_for == ')' && c == ')')
+ {
+ wp->type = t_paren;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ if (is_operator_start (c))
+ {
+ wp->type = (c == ';' ? t_separator : t_other);
+ return;
+ }
+
+ wp->type = t_string; /* assume a constant string until something non-constant is seen */
+ wp->token = (struct token *) xmalloc (sizeof (struct token));
+ init_token (wp->token);
+ wp->line_number_at_start = line_number;
+ all_unquoted_digits = true;
+
+ for (;; c = phase2_getc ())
+ {
+ if (c == EOF)
+ break;
+
+ if (all_unquoted_digits && (c == '<' || c == '>'))
+ {
+ /* Recognize the redirection operators < > >| << <<- >> <> <& >&
+ prefixed with a nonempty sequence of unquoted digits. */
+ int c2 = phase2_getc ();
+ if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+ {
+ if (c == '<' && c2 == '<')
+ {
+ int c3 = phase2_getc ();
+ if (c3 != '-')
+ phase2_ungetc (c3);
+ }
+ }
+ else
+ phase2_ungetc (c2);
+
+ wp->type = t_redirect;
+ free_token (wp->token);
+ free (wp->token);
+
+ last_non_comment_line = line_number;
+
+ return;
+ }
+
+ all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
+
+ if (c == '$')
+ {
+ int c2 = phase2_getc ();
+ if (c2 == '(')
+ {
+ int c3 = phase2_getc ();
+ if (c3 == '(')
+ {
+ /* Arithmetic expression. Skip until the matching closing
+ parenthesis. */
+ unsigned int depth = 2; /* the two '(' of "$((" are open already */
+
+ do
+ {
+ c = phase2_getc ();
+ if (c == '(')
+ depth++;
+ else if (c == ')')
+ if (--depth == 0)
+ break;
+ }
+ while (c != EOF);
+ }
+ else
+ {
+ /* Command substitution. */
+ phase2_ungetc (c3);
+ read_command_list (')');
+ }
+ }
+ else if (c2 == '\'' && !open_singlequote)
+ {
+ /* Bash builtin for string with ANSI-C escape sequences. */
+ saw_opening_singlequote ();
+ for (;;)
+ {
+ c = phase2_getc ();
+ if (c == EOF)
+ break;
+ if (c == '\'')
+ {
+ saw_closing_singlequote ();
+ break;
+ }
+ if (c == '\\')
+ {
+ c = phase2_getc (); /* NOTE(review): while a single-quote is open, phase2_getc () appears to return ordinary characters as QUOTED (c), so the letter and digit cases below look unreachable - confirm */
+ switch (c)
+ {
+ default:
+ phase2_ungetc (c);
+ c = '\\';
+ break;
+
+ case '\\':
+ break;
+ case '\'':
+ /* Don't call saw_closing_singlequote () here. */
+ break;
+
+ case 'a':
+ c = '\a';
+ break;
+ case 'b':
+ c = '\b';
+ break;
+ case 'e':
+ c = 0x1b; /* ESC */
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'v':
+ c = '\v';
+ break;
+
+ case 'x':
+ c = phase2_getc ();
+ if ((c >= '0' && c <= '9')
+ || (c >= 'A' && c <= 'F')
+ || (c >= 'a' && c <= 'f'))
+ {
+ int n;
+
+ if (c >= '0' && c <= '9')
+ n = c - '0';
+ else if (c >= 'A' && c <= 'F')
+ n = 10 + c - 'A';
+ else if (c >= 'a' && c <= 'f')
+ n = 10 + c - 'a';
+ else
+ abort ();
+
+ c = phase2_getc ();
+ if ((c >= '0' && c <= '9')
+ || (c >= 'A' && c <= 'F')
+ || (c >= 'a' && c <= 'f'))
+ {
+ if (c >= '0' && c <= '9')
+ n = n * 16 + c - '0';
+ else if (c >= 'A' && c <= 'F')
+ n = n * 16 + 10 + c - 'A';
+ else if (c >= 'a' && c <= 'f')
+ n = n * 16 + 10 + c - 'a';
+ else
+ abort ();
+ }
+ else
+ phase2_ungetc (c);
+
+ c = n;
+ }
+ else
+ {
+ phase2_ungetc (c);
+ phase2_ungetc ('x');
+ c = '\\';
+ }
+ break;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ {
+ int n = c - '0';
+
+ c = phase2_getc ();
+ if (c >= '0' && c <= '7')
+ {
+ n = n * 8 + c - '0';
+
+ c = phase2_getc ();
+ if (c >= '0' && c <= '7')
+ n = n * 8 + c - '0';
+ else
+ phase2_ungetc (c);
+ }
+ else
+ phase2_ungetc (c);
+
+ c = n;
+ }
+ break;
+ }
+ }
+ if (wp->type == t_string)
+ {
+ grow_token (wp->token);
+ wp->token->chars[wp->token->charcount++] =
+ (unsigned char) c; /* the conversion drops the QUOTED offset */
+ }
+ }
+ /* The result is a literal string. Don't change wp->type. */
+ continue;
+ }
+ else if (c2 == '"' && !open_doublequote)
+ {
+ /* Bash builtin for internationalized string. */
+ lex_pos_ty pos;
+ struct token string;
+
+ saw_opening_doublequote ();
+ pos.file_name = logical_file_name;
+ pos.line_number = line_number;
+ init_token (&string);
+ for (;;)
+ {
+ c = phase2_getc ();
+ if (c == EOF)
+ break;
+ if (c == '"')
+ {
+ saw_closing_doublequote ();
+ break;
+ }
+ grow_token (&string);
+ string.chars[string.charcount++] = (unsigned char) c;
+ }
+ remember_a_message (mlp, string_of_token (&string), &pos);
+ free_token (&string);
+ /* The result at runtime is not constant. Therefore we
+ change wp->type. */
+ }
+ else
+ phase2_ungetc (c2);
+ wp->type = t_other;
+ continue;
+ }
+
+ if (c == '\'')
+ {
+ if (!open_singlequote)
+ {
+ /* Handle an opening single quote. */
+ saw_opening_singlequote ();
+ }
+ else
+ {
+ /* Handle a closing single quote. */
+ saw_closing_singlequote ();
+ }
+ continue;
+ }
+
+ if (c == '"')
+ {
+ if (!open_doublequote)
+ {
+ /* Handle an opening double quote. */
+ saw_opening_doublequote ();
+ }
+ else
+ {
+ /* Handle a closing double quote. */
+ saw_closing_doublequote ();
+ }
+ continue;
+ }
+
+ if (c == OPENING_BACKQUOTE)
+ {
+ /* Handle an opening backquote. */
+ saw_opening_backquote ();
+
+ read_command_list (CLOSING_BACKQUOTE);
+
+ wp->type = t_other;
+ continue;
+ }
+ if (c == CLOSING_BACKQUOTE)
+ break;
+
+ if (!open_singlequote && !open_doublequote
+ && (is_whitespace (c) || is_operator_start (c)))
+ break;
+
+ if (wp->type == t_string)
+ {
+ grow_token (wp->token);
+ wp->token->chars[wp->token->charcount++] = (unsigned char) c; /* the conversion drops the QUOTED offset */
+ }
+ }
+
+ phase2_ungetc (c); /* the terminating character belongs to the next word; a no-op for EOF */
+
+ if (wp->type != t_string)
+ {
+ free_token (wp->token);
+ free (wp->token);
+ }
+ last_non_comment_line = line_number;
+}
+
+
+/* Read the next command.
+ 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+ or '\0'.
+ Returns the type of the word that terminated the command: t_separator,
+ t_backquote, t_paren or t_eof.
+ When extract_all is set, every t_string word is recorded as a message.
+ Otherwise a word found in the keywords table starts an argument count,
+ and the t_string words at the positions stored for that keyword are
+ recorded as msgid and plural msgid. */
+static enum word_type
+read_command (int looking_for)
+{
+ /* Read the words that make up the command.
+ Here we completely ignore field splitting at whitespace and wildcard
+ expansions; i.e. we assume that the source is written in such a way that
+ every word in the program determines exactly one word in the resulting
+ command.
+ But we do not require that the 'gettext'/'ngettext' command is the
+ first in the command; this is because 1. we want to allow for prefixes
+ like "$verbose" that may expand to nothing, and 2. it's a big effort
+ to know where a command starts in a $(for ...) or $(case ...) compound
+ command. */
+ int arg = 0; /* Current argument number. */
+ bool arg_of_redirect = false; /* True right after a redirection operator. */
+ int argnum1 = -1; /* First string position. */
+ int argnum2 = -1; /* Plural string position. */
+ message_ty *plural_mp = NULL; /* Remember the msgid. */
+
+ for (;;)
+ {
+ struct word inner;
+
+ read_word (&inner, looking_for);
+
+ /* Recognize end of command. */
+ if (inner.type == t_separator
+ || inner.type == t_backquote || inner.type == t_paren
+ || inner.type == t_eof)
+ return inner.type; /* none of these types owns a token, so nothing needs freeing */
+
+ if (extract_all)
+ {
+ if (inner.type == t_string)
+ {
+ lex_pos_ty pos;
+
+ pos.file_name = logical_file_name;
+ pos.line_number = inner.line_number_at_start;
+ remember_a_message (mlp, string_of_word (&inner), &pos);
+ }
+ }
+ else
+ {
+ if (arg_of_redirect)
+ {
+ /* Ignore arguments of redirection operators. */
+ arg_of_redirect = false;
+ }
+ else if (inner.type == t_redirect)
+ {
+ /* Ignore this word and the following one. */
+ arg_of_redirect = true;
+ }
+ else
+ {
+ if (argnum1 < 0 && argnum2 < 0)
+ {
+ /* This is the function position. */
+ arg = 0;
+ if (inner.type == t_string)
+ {
+ char *function_name = string_of_word (&inner);
+ void *keyword_value;
+
+ if (find_entry (&keywords,
+ function_name, strlen (function_name),
+ &keyword_value)
+ == 0)
+ {
+ argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); /* low 10 bits, as packed by x_sh_keyword () */
+ argnum2 = (int) (long) keyword_value >> 10; /* remaining bits; presumably 0 when the keyword has no plural argument */
+ }
+
+ free (function_name);
+ }
+ }
+ else
+ {
+ /* These are the argument positions.
+ Extract a string if we have reached the right
+ argument position. */
+ if (arg == argnum1)
+ {
+ if (inner.type == t_string)
+ {
+ lex_pos_ty pos;
+ message_ty *mp;
+
+ pos.file_name = logical_file_name;
+ pos.line_number = inner.line_number_at_start;
+ mp = remember_a_message (mlp, string_of_word (&inner), &pos);
+ if (argnum2 > 0)
+ plural_mp = mp;
+ }
+ }
+ else if (arg == argnum2)
+ {
+ if (inner.type == t_string && plural_mp != NULL)
+ {
+ lex_pos_ty pos;
+
+ pos.file_name = logical_file_name;
+ pos.line_number = inner.line_number_at_start;
+ remember_a_message_plural (plural_mp, string_of_word (&inner), &pos);
+ }
+ }
+
+ if (arg >= argnum1 && arg >= argnum2)
+ {
+ /* Stop looking for arguments of the last function_name. */
+ argnum1 = -1;
+ argnum2 = -1;
+ plural_mp = NULL;
+ }
+ }
+
+ arg++;
+ }
+ }
+
+ free_word (&inner);
+ }
+}
+
+
+/* Read commands one after the other for as long as each of them ends with
+   a command separator.
+   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+   or '\0'; it is passed on to read_command ().
+   Returns the type of the first terminating word that is not a
+   separator.  */
+static enum word_type
+read_command_list (int looking_for)
+{
+  enum word_type terminator;
+
+  do
+    terminator = read_command (looking_for);
+  while (terminator == t_separator);
+
+  return terminator;
+}
+
+
+/* Extract the messages of one sh script.
+   F is the open input stream, REAL_FILENAME the name used in error
+   messages, LOGICAL_FILENAME the name recorded in message positions, and
+   MDLP the domain list whose first domain receives the messages.
+   All parser state is reset first, including both pushback stacks, so that
+   nothing carries over from a file processed earlier.
+   The copy made of LOGICAL_FILENAME is not freed here: it was handed out as
+   pos.file_name for the messages recorded during the run (presumably they
+   keep pointing to it - confirm before adding a free).  */
+void
+extract_sh (FILE *f,
+            const char *real_filename, const char *logical_filename,
+            msgdomain_list_ty *mdlp)
+{
+  mlp = mdlp->item[0]->messages;
+
+  fp = f;
+  real_file_name = real_filename;
+  logical_file_name = xstrdup (logical_filename);
+  line_number = 1;
+
+  /* Drop anything a previous run may have left pushed back.  */
+  phase1_pushback_length = 0;
+  phase2_pushback_length = 0;
+
+  last_comment_line = -1;
+  last_non_comment_line = -1;
+
+  nested_backquotes = 0;
+  open_doublequotes_mask = 0;
+  open_doublequote = false;
+  open_singlequote = false;
+
+  init_keywords ();
+
+  /* Eat tokens until eof is seen.  */
+  read_command_list ('\0');
+
+  fp = NULL;
+  real_file_name = NULL;
+  logical_file_name = NULL;
+  line_number = 0;
+}