From b440926984299740093f463c7f2328b2efd03f68 Mon Sep 17 00:00:00 2001 From: Bruno Haible <bruno@clisp.org> Date: Wed, 3 Sep 2003 10:27:17 +0000 Subject: [PATCH] sh/bash script parser. --- gettext-tools/src/x-sh.c | 1242 ++++++++++++++++++++++++++++++++++++++ gettext-tools/src/x-sh.h | 33 + 2 files changed, 1275 insertions(+) create mode 100644 gettext-tools/src/x-sh.c create mode 100644 gettext-tools/src/x-sh.h diff --git a/gettext-tools/src/x-sh.c b/gettext-tools/src/x-sh.c new file mode 100644 index 000000000..be721f62e --- /dev/null +++ b/gettext-tools/src/x-sh.c @@ -0,0 +1,1242 @@ +/* xgettext sh backend. + Copyright (C) 2003 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2003. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <errno.h> +#include <limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "x-sh.h" +#include "xgettext.h" +#include "error.h" +#include "xmalloc.h" +#include "exit.h" +#include "hash.h" +#include "gettext.h" + +#define _(s) gettext(s) + +/* The sh syntax is defined in POSIX:2001, see + http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html + Summary of sh syntax: + - Input is broken into words, which are then subject to + - tilde expansion ~...
+ - command substitution `...` + - variable substitution $var + - arithmetic substitution $((...)) + - field splitting at whitespace (IFS) + - wildcard pattern expansion *? + - quote removal + - Strings are enclosed in "..."; command substitution, variable + substitution and arithmetic substitution are performed here as well. + - '...' is a string without substitutions. + - The list of resulting words is split into commands by semicolon and + newline. + - '#' at the beginning of a word introduces a comment until end of line. + The parser is implemented in bash-2.05b/parse.y. */ + + +/* ====================== Keyword set customization. ====================== */ + +/* If true, extract every constant string word, not only the arguments of + keywords. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_sh_extract_all () +{ + extract_all = true; +} + + +void +x_sh_keyword (const char *name) +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + int argnum1; + int argnum2; + const char *colon; + + if (keywords.table == NULL) + init_hash (&keywords, 100); + + split_keywordspec (name, &end, &argnum1, &argnum2); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). + A valid keyword is stored with its argument positions packed into the + hash value as argnum1 + (argnum2 << 10); an argnum1 of 0 is replaced + by 1. */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + { + if (argnum1 == 0) + argnum1 = 1; + insert_entry (&keywords, name, end - name, + (void *) (long) (argnum1 + (argnum2 << 10))); + } + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. + Unless x_sh_keyword (NULL) has cleared default_keywords, this registers + gettext, ngettext:1,2, eval_gettext and eval_ngettext:1,2, and then clears + default_keywords so that the registration happens only once. */ +static void +init_keywords () +{ + if (default_keywords) + { + x_sh_keyword ("gettext"); + x_sh_keyword ("ngettext:1,2"); + x_sh_keyword ("eval_gettext"); + x_sh_keyword ("eval_ngettext:1,2"); + default_keywords = false; + } +} + + +/* ======================== Reading of characters.
======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* Fetch the next character from the input file. + Increments line_number after reading a newline, and reports a read error + through error () with EXIT_FAILURE. */ +static int +do_getc () +{ + int c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("\ +error while reading \"%s\""), real_file_name); + } + else if (c == '\n') + line_number++; + + return c; +} + +/* Put back the last fetched character, not EOF. + Decrements line_number again if that character is a newline. */ +static void +do_ungetc (int c) +{ + if (c == '\n') + line_number--; + ungetc (c, fp); +} + + +/* Remove backslash followed by newline from the input stream. + Cope with potentially 2 characters of pushback. */ + +/* Maximum used guaranteed to be < 4. */ +static int phase1_pushback[4]; +static int phase1_pushback_length; + +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + { + c = phase1_pushback[--phase1_pushback_length]; + if (c == '\n') + ++line_number; + return c; + } + for (;;) + { + c = do_getc (); + if (c != '\\') + return c; + c = do_getc (); + if (c != '\n') + { + if (c != EOF) + do_ungetc (c); + return '\\'; + } + } +} + +static void +phase1_ungetc (int c) +{ + switch (c) + { + case EOF: + break; + + case '\n': + --line_number; + /* FALLTHROUGH */ + + default: + phase1_pushback[phase1_pushback_length++] = c; + break; + } +} + + +/* ========================== Reading of tokens. ========================== */ + + +/* A token consists of a sequence of characters, kept in a growable buffer + that is not NUL-terminated. */ +struct token +{ + int allocated; /* number of allocated chars */ + int charcount; /* number of used chars */ + char *chars; /* the token's constituents */ +}; + +/* Initialize a 'struct token'.
*/ +static inline void +init_token (struct token *tp) +{ + tp->allocated = 10; + tp->chars = (char *) xmalloc (tp->allocated * sizeof (char)); + tp->charcount = 0; +} + +/* Free the character buffer of a 'struct token'; the struct itself is not + freed. */ +static inline void +free_token (struct token *tp) +{ + free (tp->chars); +} + +/* Ensure there is enough room in the token for one more character. + The buffer size is doubled when it is full. */ +static inline void +grow_token (struct token *tp) +{ + if (tp->charcount == tp->allocated) + { + tp->allocated *= 2; + tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); + } +} + +/* Convert a struct token * to a char*. + The result is a freshly allocated, NUL-terminated copy of the token's + characters. */ +static char * +string_of_token (const struct token *tp) +{ + char *str; + int n; + + n = tp->charcount; + str = (char *) xmalloc (n + 1); + memcpy (str, tp->chars, n); + str[n] = '\0'; + return str; +} + + +/* ========================= Accumulating messages ========================= */ + + +static message_list_ty *mlp; + + +/* ========================= Accumulating comments ========================= */ + + +static char *buffer; +static size_t bufmax; +static size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (int c) +{ + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end () +{ + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + xgettext_comment_add (buffer); +} + + +/* These are for tracking whether comments count as immediately before + keyword. Both hold values of line_number and are assigned in read_word. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* ========================= Debackslashification ========================== */ + +/* This state tracks the effect of backquotes, double-quotes and single-quotes + on the parsing of backslashes.
We make a single pass through the input + file, keeping the state up to date. This is much faster than accumulating + strings and processing them with explicit debackslashification, like the + shell does it. */ + +/* The number of nested `...` or "`...`" constructs. Assumed to be small: + the value is used as a shift count on unsigned int values. + NOTE(review): the bound was originally stated as <= 32, but a shift by 32 + would already be too large for a 32-bit unsigned int - confirm the + intended limit. */ +static unsigned int nested_backquotes; + +/* A bit mask indicating which of the currently open `...` or "`...`" + constructs are inside double-quotes: "`...`". + A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`. + Bit position 0 designates the outermost backquotes nesting, + bit position 1 the second-outermost backquotes nesting, + ... + bit position (nested_backquotes-1) the innermost backquotes nesting. */ +static unsigned int open_doublequotes_mask; + +/* A bit indicating whether a double-quote is currently open inside the + innermost backquotes nesting. */ +static bool open_doublequote; + +/* A bit indicating whether a single-quote is currently open inside the + innermost backquotes nesting. */ +static bool open_singlequote; + + +/* Functions to update the state.
*/ + +static inline void +saw_opening_backquote () +{ + if (open_singlequote) + abort (); + if (open_doublequote) + open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes; + nested_backquotes++; + open_doublequote = false; +} + +static inline void +saw_closing_backquote () +{ + nested_backquotes--; + open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1; + open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1; + open_singlequote = false; /* just for safety */ +} + +static inline void +saw_opening_doublequote () +{ + if (open_singlequote || open_doublequote) + abort (); + open_doublequote = true; +} + +static inline void +saw_closing_doublequote () +{ + if (open_singlequote || !open_doublequote) + abort (); + open_doublequote = false; +} + +static inline void +saw_opening_singlequote () +{ + if (open_doublequote || open_singlequote) + abort (); + open_singlequote = true; +} + +static inline void +saw_closing_singlequote () +{ + if (open_doublequote || !open_singlequote) + abort (); + open_singlequote = false; +} + + +/* ========================== Reading of commands ========================== */ + +/* We are only interested in constant strings. Other words need not be + represented precisely. */ +enum word_type +{ + t_string, /* constant string */ + t_other, /* other word, not a constant string */ + t_separator, /* command separator: semicolon or newline */ + t_redirect, /* redirection: one of < > >| << <<- >> <> <& >& */ + t_backquote, /* closing '`' pseudo word */ + t_paren, /* closing ')' pseudo word */ + t_eof /* EOF marker */ +}; + +struct word +{ + enum word_type type; + struct token *token; /* for t_string */ + int line_number_at_start; /* for t_string */ +}; + +/* Free the memory pointed to by a 'struct word'. + Only a t_string word owns a token; for other types nothing is freed. */ +static inline void +free_word (struct word *wp) +{ + if (wp->type == t_string) + { + free_token (wp->token); + free (wp->token); + } +} + +/* Convert a t_string token to a char*.
*/ +static char * +string_of_word (const struct word *wp) +{ + char *str; + int n; + + if (!(wp->type == t_string)) + abort (); + n = wp->token->charcount; + str = (char *) xmalloc (n + 1); + memcpy (str, wp->token->chars, n); + str[n] = '\0'; + return str; +} + + +/* Whitespace recognition: space, tab or newline. */ + +static inline bool +is_whitespace (int c) +{ + return (c == ' ' || c == '\t' || c == '\n'); +} + +/* Operator character recognition: one of | & ; < > ( ) */ + +static inline bool +is_operator_start (int c) +{ + return (c == '|' || c == '&' || c == ';' || c == '<' || c == '>' + || c == '(' || c == ')'); +} + + +/* Denotation of a quoted character. + The distinction between quoted and unquoted character is important only for + the special, whitespace and operator characters; it is irrelevant for + alphanumeric characters, '\\' and many others. */ +#define QUOTED(c) (UCHAR_MAX + 1 + (c)) +/* Values in the 'unsigned char' range are implicitly unquoted. Among these, + the following are important: + '"' opening or closing double quote + '\'' opening or closing single quote + '$' the unknown result of a dollar expansion + '`' does not occur - replaced with OPENING_BACKQUOTE or + CLOSING_BACKQUOTE + */ +#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`') +#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`') + +/* Maximum used guaranteed to be < 4. */ +static int phase2_pushback[4]; +static int phase2_pushback_length; + +/* Forward declaration of local functions. */ +static void phase2_ungetc (int c); + +/* Return the next character, with backslashes removed. + The result is QUOTED(c) for some unsigned char c, if the next character + is escaped sufficiently often to make it a regular constituent character, + or simply an 'unsigned char' if it has its special meaning (of special, + whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE, + EOF. + It's the caller's responsibility to update the state.
*/ +static int +phase2_getc () +{ + int c; + + if (phase2_pushback_length) + { + c = phase2_pushback[--phase2_pushback_length]; + if (c == '\n') + ++line_number; + return c; + } + + c = phase1_getc (); + if (c == EOF) + return c; + if (c == '\'') + return (open_doublequote ? QUOTED (c) : c); + if (!open_singlequote) + { + if (c == '"' || c == '$') + return c; + if (c == '`') + return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE); + } + if (c == '\\') + { + /* Number of debackslashification passes that are active at the + current point. */ + unsigned int debackslahify = + nested_backquotes + (open_singlequote ? 0 : 1); + /* Normal number of backslashes that yield a single backslash in the + final output. */ + unsigned int expected_count = + (unsigned int) 1 << debackslahify; + /* Number of backslashes found. */ + unsigned int count; + + for (count = 1; count < expected_count; count++) + { + c = phase1_getc (); + if (c != '\\') + break; + } + if (count == expected_count) + return '\\'; + + /* The count of backslashes is > 0 and < expected_count, therefore the + result depends on c, the first character after the backslashes. + Note: The formulas below don't necessarily have a logic; they were + empirically determined such that 1. the xgettext-30 test succeeds, + 2. the behaviour for count == 0 would correspond to the one without + any backslash. */ + if (c == '\'') + { + if (!open_singlequote && count > (expected_count >> 1)) + { + phase1_ungetc (c); + return '\\'; + } + else + return (open_doublequote ? QUOTED (c) : c); + } + else if (c == '"') + { + /* Each debackslashification pass converts \\ to \ and \" to "; + passes corresponding to `...` drop a lone " whereas passes + corresponding to "`...`" leave it alone. Therefore, the + minimum number of backslashes needed to get one double-quote + in the end is open_doublequotes_mask + 1.
*/ + if (open_singlequote) + { + if (count > open_doublequotes_mask) + { + phase2_ungetc (c); + return '\\'; + } + else + return QUOTED (c); + } + else + { + if (count > open_doublequotes_mask) + return QUOTED (c); + else + /* Some of the count values <= open_doublequotes_mask are + actually invalid here, but we assume a syntactically + correct input file anyway. */ + return c; + } + } + else if (c == '`') + { + /* FIXME: This code looks fishy. */ + if (count == expected_count - 1) + return c; + else + /* Some of the count values < expected_count - 1 are + actually invalid here, but we assume a syntactically + correct input file anyway. */ + if (nested_backquotes > 0 && !open_singlequote + && count >= (expected_count >> 2)) + return OPENING_BACKQUOTE; + else + return CLOSING_BACKQUOTE; + } + else if (c == '$') + { + if (open_singlequote) + return QUOTED (c); + if (count >= (expected_count >> 1)) + return QUOTED (c); + else + return c; + } + else + { + /* When not followed by a quoting character or backslash or dollar, + a backslash survives a debackslashification pass unmodified. + Therefore each debackslashification pass performs a + count := (count + 1) >> 1 + operation. Therefore the minimum number of backslashes needed + to get one backslash in the end is (expected_count >> 1) + 1. */ + if (open_doublequote || open_singlequote) + { + if (count > 0) + { + phase1_ungetc (c); + return '\\'; + } + else + return QUOTED (c); + } + else + { + if (count > (expected_count >> 1)) + { + phase1_ungetc (c); + return '\\'; + } + else if (count > 0) + return QUOTED (c); + else + return c; + } + } + } + + return (open_singlequote || open_doublequote ? QUOTED (c) : c); +} + +static void +phase2_ungetc (int c) +{ + switch (c) + { + case EOF: + break; + + case '\n': + --line_number; + /* FALLTHROUGH */ + + default: + phase2_pushback[phase2_pushback_length++] = c; + break; + } +} + + +/* Forward declaration of local functions.
*/ +static enum word_type read_command_list (int looking_for); + + + +/* Read the next word. + 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' + or '\0'. + The word's type is stored in wp->type. Only for a t_string word are + wp->token and wp->line_number_at_start valid on return; such a token is + released with free_word. */ +static void +read_word (struct word *wp, int looking_for) +{ + int c; + bool all_unquoted_digits; + + do + { + c = phase2_getc (); + if (c == '#') + { + /* Skip a comment up to end of line, handing its text to + xgettext_comment_add via comment_line_end. */ + last_comment_line = line_number; + comment_start (); + for (;;) + { + c = phase1_getc (); + if (c == EOF || c == '\n') + break; + comment_add (c); + } + comment_line_end (); + } + if (c == '\n') + { + /* Comments assumed to be grouped with a message must immediately + precede it, with no non-whitespace token on a line between + both. */ + if (last_non_comment_line > last_comment_line) + xgettext_comment_reset (); + wp->type = t_separator; + return; + } + } + while (is_whitespace (c)); + + if (c == EOF) + { + wp->type = t_eof; + return; + } + + if (c == '<' || c == '>') + { + /* Recognize the redirection operators < > >| << <<- >> <> <& >& */ + int c2 = phase2_getc (); + if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') + { + if (c == '<' && c2 == '<') + { + int c3 = phase2_getc (); + if (c3 != '-') + phase2_ungetc (c3); + } + } + else + phase2_ungetc (c2); + wp->type = t_redirect; + return; + } + + if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE) + { + saw_closing_backquote (); + wp->type = t_backquote; + last_non_comment_line = line_number; + return; + } + + if (looking_for == ')' && c == ')') + { + wp->type = t_paren; + last_non_comment_line = line_number; + return; + } + + if (is_operator_start (c)) + { + wp->type = (c == ';' ?
t_separator : t_other); + return; + } + + wp->type = t_string; + wp->token = (struct token *) xmalloc (sizeof (struct token)); + init_token (wp->token); + wp->line_number_at_start = line_number; + all_unquoted_digits = true; + + for (;; c = phase2_getc ()) + { + if (c == EOF) + break; + + if (all_unquoted_digits && (c == '<' || c == '>')) + { + /* Recognize the redirection operators < > >| << <<- >> <> <& >& + prefixed with a nonempty sequence of unquoted digits. */ + int c2 = phase2_getc (); + if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') + { + if (c == '<' && c2 == '<') + { + int c3 = phase2_getc (); + if (c3 != '-') + phase2_ungetc (c3); + } + } + else + phase2_ungetc (c2); + + wp->type = t_redirect; + free_token (wp->token); + free (wp->token); + + last_non_comment_line = line_number; + + return; + } + + all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9'); + + if (c == '$') + { + int c2 = phase2_getc (); + if (c2 == '(') + { + int c3 = phase2_getc (); + if (c3 == '(') + { + /* Arithmetic expression. Skip until the matching closing + parenthesis. */ + unsigned int depth = 2; + + do + { + c = phase2_getc (); + if (c == '(') + depth++; + else if (c == ')') + if (--depth == 0) + break; + } + while (c != EOF); + } + else + { + /* Command substitution. */ + phase2_ungetc (c3); + read_command_list (')'); + } + } + else if (c2 == '\'' && !open_singlequote) + { + /* Bash builtin for string with ANSI-C escape sequences. + NOTE(review): while open_singlequote is set, phase2_getc + appears to return ordinary characters as QUOTED(c), which + would not match the unquoted case labels below ('a', 'n', + 'x', digits, ...) - confirm that these escapes are really + decoded. */ + saw_opening_singlequote (); + for (;;) + { + c = phase2_getc (); + if (c == EOF) + break; + if (c == '\'') + { + saw_closing_singlequote (); + break; + } + if (c == '\\') + { + c = phase2_getc (); + switch (c) + { + default: + phase2_ungetc (c); + c = '\\'; + break; + + case '\\': + break; + case '\'': + /* Don't call saw_closing_singlequote () here.
*/ + break; + + case 'a': + c = '\a'; + break; + case 'b': + c = '\b'; + break; + case 'e': + c = 0x1b; /* ESC */ + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + + case 'x': + c = phase2_getc (); + if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'F') + || (c >= 'a' && c <= 'f')) + { + int n; + + if (c >= '0' && c <= '9') + n = c - '0'; + else if (c >= 'A' && c <= 'F') + n = 10 + c - 'A'; + else if (c >= 'a' && c <= 'f') + n = 10 + c - 'a'; + else + abort (); + + c = phase2_getc (); + if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'F') + || (c >= 'a' && c <= 'f')) + { + if (c >= '0' && c <= '9') + n = n * 16 + c - '0'; + else if (c >= 'A' && c <= 'F') + n = n * 16 + 10 + c - 'A'; + else if (c >= 'a' && c <= 'f') + n = n * 16 + 10 + c - 'a'; + else + abort (); + } + else + phase2_ungetc (c); + + c = n; + } + else + { + phase2_ungetc (c); + phase2_ungetc ('x'); + c = '\\'; + } + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + { + int n = c - '0'; + + c = phase2_getc (); + if (c >= '0' && c <= '7') + { + n = n * 8 + c - '0'; + + c = phase2_getc (); + if (c >= '0' && c <= '7') + n = n * 8 + c - '0'; + else + phase2_ungetc (c); + } + else + phase2_ungetc (c); + + c = n; + } + break; + } + } + if (wp->type == t_string) + { + grow_token (wp->token); + wp->token->chars[wp->token->charcount++] = + (unsigned char) c; + } + } + /* The result is a literal string. Don't change wp->type. */ + continue; + } + else if (c2 == '"' && !open_doublequote) + { + /* Bash builtin for internationalized string.
The text up to the closing double quote is collected and + recorded as a message of its own. */ + lex_pos_ty pos; + struct token string; + + saw_opening_doublequote (); + pos.file_name = logical_file_name; + pos.line_number = line_number; + init_token (&string); + for (;;) + { + c = phase2_getc (); + if (c == EOF) + break; + if (c == '"') + { + saw_closing_doublequote (); + break; + } + grow_token (&string); + string.chars[string.charcount++] = (unsigned char) c; + } + remember_a_message (mlp, string_of_token (&string), &pos); + free_token (&string); + /* The result at runtime is not constant. Therefore we + change wp->type. */ + } + else + phase2_ungetc (c2); + wp->type = t_other; + continue; + } + + if (c == '\'') + { + if (!open_singlequote) + { + /* Handle an opening single quote. */ + saw_opening_singlequote (); + } + else + { + /* Handle a closing single quote. */ + saw_closing_singlequote (); + } + continue; + } + + if (c == '"') + { + if (!open_doublequote) + { + /* Handle an opening double quote. */ + saw_opening_doublequote (); + } + else + { + /* Handle a closing double quote. */ + saw_closing_doublequote (); + } + continue; + } + + if (c == OPENING_BACKQUOTE) + { + /* Handle an opening backquote. */ + saw_opening_backquote (); + + read_command_list (CLOSING_BACKQUOTE); + + wp->type = t_other; + continue; + } + if (c == CLOSING_BACKQUOTE) + break; + + if (!open_singlequote && !open_doublequote + && (is_whitespace (c) || is_operator_start (c))) + break; + + if (wp->type == t_string) + { + grow_token (wp->token); + wp->token->chars[wp->token->charcount++] = (unsigned char) c; + } + } + + phase2_ungetc (c); + + if (wp->type != t_string) + { + free_token (wp->token); + free (wp->token); + } + last_non_comment_line = line_number; +} + + +/* Read the next command. + 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' + or '\0'. + Returns the type of the word that terminated the command. + Messages found in the command are recorded in mlp on the way. */ +static enum word_type +read_command (int looking_for) +{ + /* Read the words that make up the command.
+ Here we completely ignore field splitting at whitespace and wildcard + expansions; i.e. we assume that the source is written in such a way that + every word in the program determines exactly one word in the resulting + command. + But we do not require that the 'gettext'/'ngettext' command is the + first in the command; this is because 1. we want to allow for prefixes + like "$verbose" that may expand to nothing, and 2. it's a big effort + to know where a command starts in a $(for ...) or $(case ...) compound + command. */ + int arg = 0; /* Current argument number. */ + bool arg_of_redirect = false; /* True right after a redirection operator. */ + int argnum1 = -1; /* First string position. */ + int argnum2 = -1; /* Plural string position. */ + message_ty *plural_mp = NULL; /* Remember the msgid. */ + + for (;;) + { + struct word inner; + + read_word (&inner, looking_for); + + /* Recognize end of command. */ + if (inner.type == t_separator + || inner.type == t_backquote || inner.type == t_paren + || inner.type == t_eof) + return inner.type; + + if (extract_all) + { + if (inner.type == t_string) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + remember_a_message (mlp, string_of_word (&inner), &pos); + } + } + else + { + if (arg_of_redirect) + { + /* Ignore arguments of redirection operators. */ + arg_of_redirect = false; + } + else if (inner.type == t_redirect) + { + /* Ignore this word and the following one. */ + arg_of_redirect = true; + } + else + { + if (argnum1 < 0 && argnum2 < 0) + { + /* This is the function position. + Look the word up in the keywords table; the stored value + holds argnum1 in its low 10 bits and argnum2 above them.
*/ + arg = 0; + if (inner.type == t_string) + { + char *function_name = string_of_word (&inner); + void *keyword_value; + + if (find_entry (&keywords, + function_name, strlen (function_name), + &keyword_value) + == 0) + { + argnum1 = (int) (long) keyword_value & ((1 << 10) - 1); + argnum2 = (int) (long) keyword_value >> 10; + } + + free (function_name); + } + } + else + { + /* These are the argument positions. + Extract a string if we have reached the right + argument position. */ + if (arg == argnum1) + { + if (inner.type == t_string) + { + lex_pos_ty pos; + message_ty *mp; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + mp = remember_a_message (mlp, string_of_word (&inner), &pos); + if (argnum2 > 0) + plural_mp = mp; + } + } + else if (arg == argnum2) + { + if (inner.type == t_string && plural_mp != NULL) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = inner.line_number_at_start; + remember_a_message_plural (plural_mp, string_of_word (&inner), &pos); + } + } + + if (arg >= argnum1 && arg >= argnum2) + { + /* Stop looking for arguments of the last function_name. */ + argnum1 = -1; + argnum2 = -1; + plural_mp = NULL; + } + } + + arg++; + } + } + + free_word (&inner); + } +} + + +/* Read a list of commands. + 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' + or '\0'. + Returns the type of the word that terminated the command list.
*/ +static enum word_type +read_command_list (int looking_for) +{ + for (;;) + { + enum word_type terminator; + + terminator = read_command (looking_for); + if (terminator != t_separator) + return terminator; + } +} + + +void +extract_sh (FILE *f, + const char *real_filename, const char *logical_filename, + msgdomain_list_ty *mdlp) +{ + mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + nested_backquotes = 0; + open_doublequotes_mask = 0; + open_doublequote = false; + open_singlequote = false; + + init_keywords (); + + /* Eat tokens until eof is seen. + The copy made for logical_file_name is not freed afterwards. + NOTE(review): presumably the recorded message positions keep pointing + at it - confirm. */ + read_command_list ('\0'); + + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} diff --git a/gettext-tools/src/x-sh.h b/gettext-tools/src/x-sh.h new file mode 100644 index 000000000..84b8b2793 --- /dev/null +++ b/gettext-tools/src/x-sh.h @@ -0,0 +1,33 @@ +/* xgettext sh backend. + Copyright (C) 2003 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2003. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ + + +#define EXTENSIONS_SH \ + { "sh", "Shell" }, \ + { "bash", "Shell" }, \ + +#define SCANNERS_SH \ + { "Shell", extract_sh, &formatstring_sh, NULL }, \ + +/* Scan a shell script file and add its translatable strings to mdlp. + The messages are added to the message list of the first domain, + mdlp->item[0]. real_filename is used in error messages, logical_filename + labels the extracted messages. */ +extern void extract_sh (FILE *fp, const char *real_filename, + const char *logical_filename, + msgdomain_list_ty *mdlp); + +extern void x_sh_keyword (const char *keyword); +extern void x_sh_extract_all (void); -- 2.47.3