sh/bash script parser.

author Bruno Haible <bruno@clisp.org>

Wed, 3 Sep 2003 10:27:17 +0000 (10:27 +0000)

committer Bruno Haible <bruno@clisp.org>

Tue, 23 Jun 2009 10:10:57 +0000 (12:10 +0200)
author Bruno Haible <bruno@clisp.org>
Wed, 3 Sep 2003 10:27:17 +0000 (10:27 +0000)
committer Bruno Haible <bruno@clisp.org>
Tue, 23 Jun 2009 10:10:57 +0000 (12:10 +0200)
diff --git a/gettext-tools/src/x-sh.c b/gettext-tools/src/x-sh.c

new file mode 100644 (file)

index 0000000..be721f6
--- /dev/null
+++ b/gettext-tools/src/x-sh.c
@@ -0,0 +1,1242 @@
+/* xgettext sh backend.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "message.h"
+#include "x-sh.h"
+#include "xgettext.h"
+#include "error.h"
+#include "xmalloc.h"
+#include "exit.h"
+#include "hash.h"
+#include "gettext.h"
+
+#define _(s) gettext(s)
+
+/* The sh syntax is defined in POSIX:2001, see
+     http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
+   Summary of sh syntax:
+   - Input is broken into words, which are then subject to
+     - tilde expansion ~...
+     - command substitution `...`
+     - variable substitution $var
+     - arithmetic substitution $((...))
+     - field splitting at whitespace (IFS)
+     - wildcard pattern expansion *?
+     - quote removal
+   - Strings are enclosed in "..."; command substitution, variable
+     substitution and arithmetic substitution are performed here as well.
+   - '...' is a string without substitutions.
+   - The list of resulting words is split into commands by semicolon and
+     newline.
+   - '#' at the beginning of a word introduces a comment until end of line.
+   The parser is implemented in bash-2.05b/parse.y.  */
+
+
+/* ====================== Keyword set customization.  ====================== */
+
+/* If true, extract all string words, not only the arguments of keywords.  */
+static bool extract_all = false;
+
+static hash_table keywords;    /* maps keyword names to encoded argument positions */
+static bool default_keywords = true;   /* false once the defaults are disabled or installed */
+
+
+/* Switch the backend into the mode where every constant string word is
+   extracted, independently of the keyword table.  */
+void
+x_sh_extract_all ()
+{
+  extract_all = true;
+}
+
+
+/* Register the keyword specification NAME.
+   A NULL NAME only disables the default keywords.  Otherwise NAME is split
+   by split_keywordspec () and the part before its end marker is entered
+   into the keywords table together with the two argument positions, packed
+   as  argnum1 + (argnum2 << 10).  */
+void
+x_sh_keyword (const char *name)
+{
+  const char *spec_end;
+  const char *first_colon;
+  int singular_pos;
+  int plural_pos;
+
+  if (name == NULL)
+    {
+      default_keywords = false;
+      return;
+    }
+
+  /* The table is created lazily, on the first real keyword.  */
+  if (keywords.table == NULL)
+    init_hash (&keywords, 100);
+
+  split_keywordspec (name, &spec_end, &singular_pos, &plural_pos);
+
+  /* The characters between name and spec_end should form a valid C
+     identifier.  A colon among them means an invalid parse in
+     split_keywordspec(); such a specification is ignored.  */
+  first_colon = strchr (name, ':');
+  if (first_colon != NULL && first_colon < spec_end)
+    return;
+
+  /* No explicit position means the first argument.  */
+  if (singular_pos == 0)
+    singular_pos = 1;
+  insert_entry (&keywords, name, spec_end - name,
+                (void *) (long) (singular_pos + (plural_pos << 10)));
+}
+
+/* Finish initializing the keywords hash table.
+   Called after argument processing, before each file is processed.
+   The default keywords are entered only while default_keywords is still
+   set; the flag is then cleared so that this happens at most once.  */
+static void
+init_keywords ()
+{
+  if (!default_keywords)
+    return;
+
+  x_sh_keyword ("gettext");
+  x_sh_keyword ("ngettext:1,2");
+  x_sh_keyword ("eval_gettext");
+  x_sh_keyword ("eval_ngettext:1,2");
+  default_keywords = false;
+}
+
+
+/* ======================== Reading of characters.  ======================== */
+
+/* Real filename, used in error messages about the input file.  */
+static const char *real_file_name;
+
+/* Logical filename and line number, used to label the extracted messages.  */
+static char *logical_file_name;
+static int line_number;        /* adjusted by every getc/ungetc layer on '\n' */
+
+/* The input file stream.  */
+static FILE *fp;
+
+
+/* Fetch the next character from the input file.
+   A newline advances line_number.  A read error terminates the program
+   through error (); a plain end of file is returned as EOF.  */
+static int
+do_getc ()
+{
+  int ch = getc (fp);
+
+  if (ch == '\n')
+    line_number++;
+  else if (ch == EOF && ferror (fp))
+    error (EXIT_FAILURE, errno, _("\
+error while reading \"%s\""), real_file_name);
+
+  return ch;
+}
+
+/* Put back the last fetched character, not EOF.
+   Undoes the line count when that character was a newline.  */
+static void
+do_ungetc (int c)
+{
+  line_number -= (c == '\n');
+  ungetc (c, fp);
+}
+
+
+/* Phase 1: Remove backslash followed by newline from the input stream.
+   Cope with potentially 2 characters of pushback.  */
+
+/* Pushback stack of phase 1.  Maximum used guaranteed to be < 4.  */
+static int phase1_pushback[4];
+static int phase1_pushback_length;     /* number of characters currently pushed back */
+
+/* Return the next character with every backslash-newline pair removed.
+   Pushed-back characters are served first, most recent first.  */
+static int
+phase1_getc ()
+{
+  if (phase1_pushback_length != 0)
+    {
+      int pending = phase1_pushback[--phase1_pushback_length];
+
+      if (pending == '\n')
+        ++line_number;
+      return pending;
+    }
+
+  for (;;)
+    {
+      int ch;
+      int next;
+
+      ch = do_getc ();
+      if (ch != '\\')
+        return ch;
+
+      next = do_getc ();
+      if (next == '\n')
+        /* Backslash-newline: drop both characters and keep reading.  */
+        continue;
+
+      /* A lone backslash: hand it out and keep its successor for later.  */
+      if (next != EOF)
+        do_ungetc (next);
+      return '\\';
+    }
+}
+
+/* Push C back so that the next phase1_getc () returns it again.
+   EOF is silently ignored; a newline takes back its line count.  */
+static void
+phase1_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  if (c == '\n')
+    --line_number;
+  phase1_pushback[phase1_pushback_length++] = c;
+}
+
+
+/* ========================== Reading of tokens.  ========================== */
+
+
+/* A token consists of a sequence of characters.  */
+struct token
+{
+  int allocated;               /* number of allocated chars */
+  int charcount;               /* number of used chars */
+  char *chars;                 /* the token's constituents, without terminating NUL */
+};
+
+/* Initialize a 'struct token': empty, with room for 10 characters.  */
+static inline void
+init_token (struct token *tp)
+{
+  tp->charcount = 0;
+  tp->allocated = 10;
+  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
+}
+
+/* Release the character storage owned by a 'struct token'.
+   The struct itself is not freed.  */
+static inline void
+free_token (struct token *tp)
+{
+  free (tp->chars);
+}
+
+/* Ensure there is enough room in the token for one more character.
+   The storage is doubled whenever it is completely used.  */
+static inline void
+grow_token (struct token *tp)
+{
+  if (tp->charcount != tp->allocated)
+    return;
+
+  tp->allocated *= 2;
+  tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
+}
+
+/* Convert a struct token * to a char*.
+   The result is a freshly allocated, NUL-terminated copy of the token's
+   characters.  */
+static char *
+string_of_token (const struct token *tp)
+{
+  int len = tp->charcount;
+  char *copy = (char *) xmalloc (len + 1);
+
+  memcpy (copy, tp->chars, len);
+  copy[len] = '\0';
+  return copy;
+}
+
+
+/* ========================= Accumulating messages ========================= */
+
+
+static message_list_ty *mlp;   /* list passed to remember_a_message () for every extracted string */
+
+
+/* ========================= Accumulating comments ========================= */
+
+
+static char *buffer;   /* text of the comment line being collected */
+static size_t bufmax;  /* allocated size of buffer */
+static size_t buflen;  /* number of characters used in buffer */
+
+/* Begin collecting a new comment line: forget the previous contents.  */
+static inline void
+comment_start ()
+{
+  buflen = 0;
+}
+
+/* Append the character C to the comment line, enlarging the buffer when
+   it is full.  */
+static inline void
+comment_add (int c)
+{
+  if (buflen >= bufmax)
+    {
+      size_t grown = 2 * bufmax + 10;
+
+      buffer = xrealloc (buffer, grown);
+      bufmax = grown;
+    }
+  buffer[buflen] = c;
+  buflen++;
+}
+
+/* Finish the current comment line: strip trailing spaces and tabs,
+   NUL-terminate it and hand it to xgettext_comment_add ().  */
+static inline void
+comment_line_end ()
+{
+  for (; buflen >= 1; --buflen)
+    {
+      char last = buffer[buflen - 1];
+
+      if (last != ' ' && last != '\t')
+        break;
+    }
+
+  /* Make room for the terminating NUL.  */
+  if (buflen >= bufmax)
+    {
+      size_t grown = 2 * bufmax + 10;
+
+      buffer = xrealloc (buffer, grown);
+      bufmax = grown;
+    }
+  buffer[buflen] = '\0';
+  xgettext_comment_add (buffer);
+}
+
+
+/* These are for tracking whether comments count as immediately before
+   a keyword.  */
+static int last_comment_line;          /* line_number at the most recent '#' comment */
+static int last_non_comment_line;      /* line_number after the most recent non-comment word */
+
+
+/* ========================= Debackslashification ========================== */
+
+/* This state tracks the effect of backquotes, double-quotes and single-quotes
+   on the parsing of backslashes.  We make a single pass through the input
+   file, keeping the state up to date.  This is much faster than accumulating
+   strings and processing them with explicit debackslashification, like the
+   shell does it.  */
+
+/* The number of nested `...` or "`...`" constructs.  Used as a shift count on 'unsigned int' values, hence assumed to be small.  */
+static unsigned int nested_backquotes;
+
+/* A bit mask indicating which of the currently open `...` or "`...`"
+   constructs is with double-quotes: "`...`".
+   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
+   Bit position 0 designates the outermost backquotes nesting,
+   bit position 1 the second-outermost backquotes nesting,
+   ...
+   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
+static unsigned int open_doublequotes_mask;
+
+/* A flag indicating whether a double-quote is currently open inside the
+   innermost backquotes nesting.  */
+static bool open_doublequote;
+
+/* A flag indicating whether a single-quote is currently open inside the
+   innermost backquotes nesting.  */
+static bool open_singlequote;
+
+
+/* Functions to update the state.  */
+
+/* Enter a backquote nesting level.  If a double-quote was open, this is
+   recorded in open_doublequotes_mask for the level being left; inside the
+   new level no double-quote is open.  Aborts inside single-quotes.  */
+static inline void
+saw_opening_backquote ()
+{
+  if (open_singlequote)
+    abort ();
+
+  if (open_doublequote)
+    {
+      open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
+      open_doublequote = false;
+    }
+  nested_backquotes++;
+}
+
+/* Leave the innermost backquote nesting level and restore the
+   double-quote state that was in effect when it was entered.  */
+static inline void
+saw_closing_backquote ()
+{
+  unsigned int level = --nested_backquotes;
+
+  open_doublequote = ((open_doublequotes_mask >> level) & 1) != 0;
+  /* Drop the bit of the level just closed (and anything above it).  */
+  open_doublequotes_mask &= ((unsigned int) 1 << level) - 1;
+  open_singlequote = false; /* just for safety */
+}
+
+/* Record an opening double-quote.  Aborts if any quote is already open.  */
+static inline void
+saw_opening_doublequote ()
+{
+  if (open_singlequote)
+    abort ();
+  if (open_doublequote)
+    abort ();
+  open_doublequote = true;
+}
+
+/* Record a closing double-quote.  Aborts unless exactly a double-quote
+   is open.  */
+static inline void
+saw_closing_doublequote ()
+{
+  if (open_singlequote)
+    abort ();
+  if (!open_doublequote)
+    abort ();
+  open_doublequote = false;
+}
+
+/* Record an opening single-quote.  Aborts if any quote is already open.  */
+static inline void
+saw_opening_singlequote ()
+{
+  if (open_doublequote)
+    abort ();
+  if (open_singlequote)
+    abort ();
+  open_singlequote = true;
+}
+
+/* Record a closing single-quote.  Aborts unless exactly a single-quote
+   is open.  */
+static inline void
+saw_closing_singlequote ()
+{
+  if (open_doublequote)
+    abort ();
+  if (!open_singlequote)
+    abort ();
+  open_singlequote = false;
+}
+
+
+/* ========================== Reading of commands ========================== */
+
+/* We are only interested in constant strings.  Other words need not be
+   represented precisely.  */
+enum word_type
+{
+  t_string,    /* constant string */
+  t_other,     /* other string */
+  t_separator, /* command separator: semicolon or newline */
+  t_redirect,  /* redirection: one of < > >| << <<- >> <> <& >& */
+  t_backquote, /* closing '`' pseudo word */
+  t_paren,     /* closing ')' pseudo word */
+  t_eof                /* EOF marker */
+};
+
+struct word
+{
+  enum word_type type;
+  struct token *token;         /* for t_string: heap-allocated, released by free_word */
+  int line_number_at_start;    /* for t_string */
+};
+
+/* Free the memory pointed to by a 'struct word'.
+   Only words of type t_string own a token.  */
+static inline void
+free_word (struct word *wp)
+{
+  if (wp->type != t_string)
+    return;
+
+  free_token (wp->token);
+  free (wp->token);
+}
+
+/* Convert a t_string word to a char*.
+   Aborts for any other word type.  The result is a freshly allocated,
+   NUL-terminated copy of the word's characters.  */
+static char *
+string_of_word (const struct word *wp)
+{
+  if (wp->type != t_string)
+    abort ();
+  return string_of_token (wp->token);
+}
+
+
+/* Whitespace recognition: space, tab and newline.  */
+
+static inline bool
+is_whitespace (int c)
+{
+  switch (c)
+    {
+    case ' ':
+    case '\t':
+    case '\n':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Operator character recognition: the characters that can begin a shell
+   operator.  */
+
+static inline bool
+is_operator_start (int c)
+{
+  switch (c)
+    {
+    case '|':
+    case '&':
+    case ';':
+    case '<':
+    case '>':
+    case '(':
+    case ')':
+      return true;
+    default:
+      return false;
+    }
+}
+
+
+/* Denotation of a quoted character.
+   The distinction between quoted and unquoted characters is important only for
+   the special, whitespace and operator characters; it is irrelevant for
+   alphanumeric characters, '\\' and many others.  */
+#define QUOTED(c) (UCHAR_MAX + 1 + (c))
+/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
+   the following are important:
+     '"'         opening or closing double quote
+     '\''        opening or closing single quote
+     '$'         the unknown result of a dollar expansion
+     '`'         does not occur - replaced with OPENING_BACKQUOTE or
+                 CLOSING_BACKQUOTE
+ */
+#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
+#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
+
+/* Pushback stack of phase 2.  Maximum used guaranteed to be < 4.  */
+static int phase2_pushback[4];
+static int phase2_pushback_length;     /* number of characters currently pushed back */
+
+/* Forward declaration of local functions.  */
+static void phase2_ungetc (int c);
+
+/* Return the next character, with backslashes removed.
+   The result is QUOTED(c) for some unsigned char c, if the next character
+   is escaped sufficiently often to make it a regular constituent character,
+   or simply an 'unsigned char' if it has its special meaning (of special,
+   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
+   EOF.
+   A character that follows a returned backslash and still has to be
+   classified is pushed back with phase1_ungetc, so that it passes through
+   this function again and is quoted according to the current state.
+   It's the caller's responsibility to update the state.  */
+static int
+phase2_getc ()
+{
+  int c;
+
+  if (phase2_pushback_length)
+    {
+      c = phase2_pushback[--phase2_pushback_length];
+      if (c == '\n')
+       ++line_number;
+      return c;
+    }
+
+  c = phase1_getc ();
+  if (c == EOF)
+    return c;
+  if (c == '\'')
+    return (open_doublequote ? QUOTED (c) : c);
+  if (!open_singlequote)
+    {
+      if (c == '"' || c == '$')
+       return c;
+      if (c == '`')
+       return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
+    }
+  if (c == '\\')
+    {
+      /* Number of debackslashification passes that are active at the
+        current point.  */
+      unsigned int debackslashify =
+       nested_backquotes + (open_singlequote ? 0 : 1);
+      /* Normal number of backslashes that yield a single backslash in the
+        final output.  */
+      unsigned int expected_count =
+       (unsigned int) 1 << debackslashify;
+      /* Number of backslashes found.  */
+      unsigned int count;
+
+      for (count = 1; count < expected_count; count++)
+       {
+         c = phase1_getc ();
+         if (c != '\\')
+           break;
+       }
+      if (count == expected_count)
+       return '\\';
+
+      /* The count of backslashes is > 0 and < expected_count, therefore the
+        result depends on c, the first character after the backslashes.
+        Note: The formulas below don't necessarily have a logic; they were
+        empirically determined such that 1. the xgettext-30 test succeeds,
+        2. the behaviour for count == 0 would correspond to the one without
+        any backslash.  */
+      if (c == '\'')
+       {
+         if (!open_singlequote && count > (expected_count >> 1))
+           {
+             phase1_ungetc (c);
+             return '\\';
+           }
+         else
+           return (open_doublequote ? QUOTED (c) : c);
+       }
+      else if (c == '"')
+       {
+         /* Each debackslashification pass converts \\ to \ and \" to ";
+            passes corresponding to `...` drop a lone " whereas passes
+            corresponding to "`...`" leave it alone.  Therefore, the
+            minimum number of backslashes needed to get one double-quote
+            in the end is  open_doublequotes_mask + 1.  */
+         if (open_singlequote)
+           {
+             if (count > open_doublequotes_mask)
+               {
+                 /* Push back at phase 1, like the other branches: the
+                    double-quote is then re-read with open_singlequote
+                    still set and comes out as QUOTED ('"').  Pushing it
+                    back at phase 2 would hand an unquoted '"' to the
+                    caller while a single-quote is open.  */
+                 phase1_ungetc (c);
+                 return '\\';
+               }
+             else
+               return QUOTED (c);
+           }
+         else
+           {
+             if (count > open_doublequotes_mask)
+               return QUOTED (c);
+             else
+               /* Some of the count values <= open_doublequotes_mask are
+                  actually invalid here, but we assume a syntactically
+                  correct input file anyway.  */
+               return c;
+           }
+       }
+      else if (c == '`')
+       {
+         /* FIXME: This code looks fishy.  */
+         if (count == expected_count - 1)
+           return c;
+         else
+           /* Some of the count values < expected_count - 1 are
+              actually invalid here, but we assume a syntactically
+              correct input file anyway.  */
+           if (nested_backquotes > 0 && !open_singlequote
+               && count >= (expected_count >> 2))
+             return OPENING_BACKQUOTE;
+           else
+             return CLOSING_BACKQUOTE;
+       }
+      else if (c == '$')
+       {
+         if (open_singlequote)
+           return QUOTED (c);
+         if (count >= (expected_count >> 1))
+           return QUOTED (c);
+         else
+           return c;
+       }
+      else
+       {
+         /* When not followed by a quoting character or backslash or dollar,
+            a backslash survives a debackslashification pass unmodified.
+            Therefore each debackslashification pass performs a
+              count := (count + 1) >> 1
+            operation.  Therefore the minimum number of backslashes needed
+            to get one backslash in the end is  (expected_count >> 1) + 1.  */
+         if (open_doublequote || open_singlequote)
+           {
+             if (count > 0)
+               {
+                 phase1_ungetc (c);
+                 return '\\';
+               }
+             else
+               return QUOTED (c);
+           }
+         else
+           {
+             if (count > (expected_count >> 1))
+               {
+                 phase1_ungetc (c);
+                 return '\\';
+               }
+             else if (count > 0)
+               return QUOTED (c);
+             else
+               return c;
+           }
+       }
+    }
+
+  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
+}
+
+/* Push C back so that the next phase2_getc () returns it unchanged.
+   EOF is silently ignored; a newline takes back its line count.  */
+static void
+phase2_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  if (c == '\n')
+    --line_number;
+  phase2_pushback[phase2_pushback_length++] = c;
+}
+
+
+/* Forward declaration of local functions.  */
+static enum word_type read_command_list (int looking_for);
+
+
+
+/* Read the next word and describe it in *WP.
+   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+   or '\0'.  On return wp->token is allocated only if wp->type is t_string.  */
+static void
+read_word (struct word *wp, int looking_for)
+{
+  int c;
+  bool all_unquoted_digits;
+
+  do
+    {
+      c = phase2_getc ();
+      if (c == '#')
+       {
+         /* Read a comment up to end of line, collecting its text.  */
+         last_comment_line = line_number;
+         comment_start ();
+         for (;;)
+           {
+             c = phase1_getc ();
+             if (c == EOF || c == '\n')
+               break;
+             comment_add (c);
+           }
+         comment_line_end ();
+       }
+      if (c == '\n')
+       {
+         /* Comments assumed to be grouped with a message must immediately
+            precede it, with no non-whitespace token on a line between
+            both.  */
+         if (last_non_comment_line > last_comment_line)
+           xgettext_comment_reset ();
+         wp->type = t_separator;
+         return;
+       }
+    }
+  while (is_whitespace (c));
+
+  if (c == EOF)
+    {
+      wp->type = t_eof;
+      return;
+    }
+
+  if (c == '<' || c == '>')
+    {
+      /* Recognize the redirection operators < > >| << <<- >> <> <& >&  */
+      int c2 = phase2_getc ();
+      if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+       {
+         if (c == '<' && c2 == '<')
+           {
+             int c3 = phase2_getc ();
+             if (c3 != '-')
+               phase2_ungetc (c3);
+           }
+       }
+      else
+       phase2_ungetc (c2);
+      wp->type = t_redirect;
+      return;
+    }
+
+  if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
+    {
+      saw_closing_backquote ();
+      wp->type = t_backquote;
+      last_non_comment_line = line_number;
+      return;
+    }
+
+  if (looking_for == ')' && c == ')')
+    {
+      wp->type = t_paren;
+      last_non_comment_line = line_number;
+      return;
+    }
+
+  if (is_operator_start (c))
+    {
+      wp->type = (c == ';' ? t_separator : t_other);
+      return;
+    }
+
+  wp->type = t_string;
+  wp->token = (struct token *) xmalloc (sizeof (struct token));
+  init_token (wp->token);
+  wp->line_number_at_start = line_number;
+  all_unquoted_digits = true;
+
+  for (;; c = phase2_getc ())
+    {
+      if (c == EOF)
+       break;
+
+      if (all_unquoted_digits && (c == '<' || c == '>'))
+       {
+         /* Recognize the redirection operators < > >| << <<- >> <> <& >&
+            prefixed with a nonempty sequence of unquoted digits.  */
+         int c2 = phase2_getc ();
+         if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+           {
+             if (c == '<' && c2 == '<')
+               {
+                 int c3 = phase2_getc ();
+                 if (c3 != '-')
+                   phase2_ungetc (c3);
+               }
+           }
+         else
+           phase2_ungetc (c2);
+
+         wp->type = t_redirect;
+         free_token (wp->token);
+         free (wp->token);
+
+         last_non_comment_line = line_number;
+
+         return;
+       }
+
+      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
+
+      if (c == '$')
+       {
+         int c2 = phase2_getc ();
+         if (c2 == '(')
+           {
+             int c3 = phase2_getc ();
+             if (c3 == '(')
+               {
+                 /* Arithmetic expression.  Skip until the matching closing
+                    parenthesis.  */
+                 unsigned int depth = 2;       /* both '(' of "$((" are already open */
+
+                 do
+                   {
+                     c = phase2_getc ();
+                     if (c == '(')
+                       depth++;
+                     else if (c == ')')
+                       if (--depth == 0)
+                         break;
+                   }
+                 while (c != EOF);
+               }
+             else
+               {
+                 /* Command substitution.  */
+                 phase2_ungetc (c3);
+                 read_command_list (')');
+               }
+           }
+         else if (c2 == '\'' && !open_singlequote)
+           {
+             /* Bash builtin for string with ANSI-C escape sequences.  */
+             saw_opening_singlequote ();
+             for (;;)
+               {
+                 c = phase2_getc ();
+                 if (c == EOF)
+                   break;
+                 if (c == '\'')
+                   {
+                     saw_closing_singlequote ();
+                     break;
+                   }
+                 if (c == '\\')
+                   {
+                     c = phase2_getc ();
+                     switch (c)
+                       {
+                       default:
+                         phase2_ungetc (c);
+                         c = '\\';
+                         break;
+
+                       case '\\':
+                         break;
+                       case '\'':
+                         /* Don't call saw_closing_singlequote () here.  */
+                         break;
+
+                       case 'a':
+                         c = '\a';
+                         break;
+                       case 'b':
+                         c = '\b';
+                         break;
+                       case 'e':
+                         c = 0x1b; /* ESC */
+                         break;
+                       case 'f':
+                         c = '\f';
+                         break;
+                       case 'n':
+                         c = '\n';
+                         break;
+                       case 'r':
+                         c = '\r';
+                         break;
+                       case 't':
+                         c = '\t';
+                         break;
+                       case 'v':
+                         c = '\v';
+                         break;
+
+                       case 'x':
+                         c = phase2_getc ();
+                         if ((c >= '0' && c <= '9')
+                             || (c >= 'A' && c <= 'F')
+                             || (c >= 'a' && c <= 'f'))
+                           {
+                             int n;
+
+                             if (c >= '0' && c <= '9')
+                               n = c - '0';
+                             else if (c >= 'A' && c <= 'F')
+                               n = 10 + c - 'A';
+                             else if (c >= 'a' && c <= 'f')
+                               n = 10 + c - 'a';
+                             else
+                               abort ();
+
+                             c = phase2_getc ();
+                             if ((c >= '0' && c <= '9')
+                                 || (c >= 'A' && c <= 'F')
+                                 || (c >= 'a' && c <= 'f'))
+                               {
+                                 if (c >= '0' && c <= '9')
+                                   n = n * 16 + c - '0';
+                                 else if (c >= 'A' && c <= 'F')
+                                   n = n * 16 + 10 + c - 'A';
+                                 else if (c >= 'a' && c <= 'f')
+                                   n = n * 16 + 10 + c - 'a';
+                                 else
+                                   abort ();
+                               }
+                             else
+                               phase2_ungetc (c);
+
+                             c = n;
+                           }
+                         else
+                           {
+                             phase2_ungetc (c);
+                             phase2_ungetc ('x');
+                             c = '\\';
+                           }
+                         break;
+
+                       case '0': case '1': case '2': case '3':
+                       case '4': case '5': case '6': case '7':
+                         {
+                           int n = c - '0';
+
+                           c = phase2_getc ();
+                           if (c >= '0' && c <= '7')
+                             {
+                               n = n * 8 + c - '0';
+
+                               c = phase2_getc ();
+                               if (c >= '0' && c <= '7')
+                                 n = n * 8 + c - '0';
+                               else
+                                 phase2_ungetc (c);
+                             }
+                           else
+                             phase2_ungetc (c);
+
+                           c = n;
+                         }
+                         break;
+                       }
+                   }
+                 if (wp->type == t_string)
+                   {
+                     grow_token (wp->token);
+                     wp->token->chars[wp->token->charcount++] =
+                       (unsigned char) c;
+                   }
+               }
+             /* The result is a literal string.  Don't change wp->type.  */
+             continue;
+           }
+         else if (c2 == '"' && !open_doublequote)
+           {
+             /* Bash builtin for internationalized string.  */
+             lex_pos_ty pos;
+             struct token string;
+
+             saw_opening_doublequote ();
+             pos.file_name = logical_file_name;
+             pos.line_number = line_number;
+             init_token (&string);
+             for (;;)
+               {
+                 c = phase2_getc ();
+                 if (c == EOF)
+                   break;
+                 if (c == '"')
+                   {
+                     saw_closing_doublequote ();
+                     break;
+                   }
+                 grow_token (&string);
+                 string.chars[string.charcount++] = (unsigned char) c;
+               }
+             remember_a_message (mlp, string_of_token (&string), &pos);
+             free_token (&string);
+             /* The result at runtime is not constant. Therefore we
+                change wp->type.  */
+           }
+         else
+           phase2_ungetc (c2);
+         wp->type = t_other;
+         continue;
+       }
+
+      if (c == '\'')
+       {
+         if (!open_singlequote)
+           {
+             /* Handle an opening single quote.  */
+             saw_opening_singlequote ();
+           }
+         else
+           {
+             /* Handle a closing single quote.  */
+             saw_closing_singlequote ();
+           }
+         continue;
+       }
+
+      if (c == '"')
+       {
+         if (!open_doublequote)
+           {
+             /* Handle an opening double quote.  */
+             saw_opening_doublequote ();
+           }
+         else
+           {
+             /* Handle a closing double quote.  */
+             saw_closing_doublequote ();
+           }
+         continue;
+       }
+
+      if (c == OPENING_BACKQUOTE)
+       {
+         /* Handle an opening backquote.  */
+         saw_opening_backquote ();
+
+         read_command_list (CLOSING_BACKQUOTE);
+
+         wp->type = t_other;
+         continue;
+       }
+      if (c == CLOSING_BACKQUOTE)
+       break;
+
+      if (!open_singlequote && !open_doublequote
+         && (is_whitespace (c) || is_operator_start (c)))
+       break;
+
+      if (wp->type == t_string)
+       {
+         grow_token (wp->token);
+         wp->token->chars[wp->token->charcount++] = (unsigned char) c;
+       }
+    }
+
+  phase2_ungetc (c);   /* leave the terminating character for the next read */
+
+  if (wp->type != t_string)
+    {
+      free_token (wp->token);
+      free (wp->token);
+    }
+  last_non_comment_line = line_number;
+}
+
+
+/* Read the next command.
+   'looking_for' names the parse terminator that is handed on to read_word:
+   CLOSING_BACKQUOTE, ')' or '\0'.
+   The result is the type of the word at which the command ended.  */
+static enum word_type
+read_command (int looking_for)
+{
+  /* A command is consumed word by word.  Field splitting at whitespace and
+     wildcard expansion are deliberately not modelled: each word of the
+     source is taken to yield exactly one word of the resulting command.
+     The 'gettext'/'ngettext' keyword need not be the first word, though:
+     1. a prefix such as "$verbose" may expand to nothing, and 2. finding
+     the real start of a command inside a $(for ...) or $(case ...) compound
+     command would be a big effort.  Therefore every word is looked up as a
+     possible keyword as long as no keyword's arguments are awaited.  */
+  int position = 0;                /* Argument number of the current word.  */
+  bool redirect_target_next = false; /* A redirection operator was just seen.  */
+  int msgid_pos = -1;              /* Argument number of the msgid.  */
+  int msgid_plural_pos = -1;       /* Argument number of the plural msgid.  */
+  message_ty *msgid_mp = NULL;     /* Message still waiting for its plural.  */
+
+  for (;;)
+    {
+      struct word w;
+
+      read_word (&w, looking_for);
+
+      switch (w.type)
+        {
+        case t_separator:
+        case t_backquote:
+        case t_paren:
+        case t_eof:
+          /* The command ends here; report what ended it.  */
+          return w.type;
+        default:
+          break;
+        }
+
+      if (extract_all)
+        {
+          /* Every string word is a message, wherever it stands.  */
+          if (w.type == t_string)
+            {
+              lex_pos_ty pos;
+
+              pos.file_name = logical_file_name;
+              pos.line_number = w.line_number_at_start;
+              remember_a_message (mlp, string_of_word (&w), &pos);
+            }
+        }
+      else if (redirect_target_next)
+        {
+          /* This word is the operand of a redirection operator: skip it.  */
+          redirect_target_next = false;
+        }
+      else if (w.type == t_redirect)
+        {
+          /* Skip the operator itself and remember to skip its operand.  */
+          redirect_target_next = true;
+        }
+      else if (msgid_pos >= 0 || msgid_plural_pos >= 0)
+        {
+          /* A keyword was seen earlier and this word is one of its
+             arguments.  Extract it if it sits at a wanted position.  */
+          if (position == msgid_pos)
+            {
+              if (w.type == t_string)
+                {
+                  lex_pos_ty pos;
+                  message_ty *mp;
+
+                  pos.file_name = logical_file_name;
+                  pos.line_number = w.line_number_at_start;
+                  mp = remember_a_message (mlp, string_of_word (&w), &pos);
+                  /* Only keep the message around if a plural is expected.  */
+                  if (msgid_plural_pos > 0)
+                    msgid_mp = mp;
+                }
+            }
+          else if (position == msgid_plural_pos
+                   && w.type == t_string && msgid_mp != NULL)
+            {
+              lex_pos_ty pos;
+
+              pos.file_name = logical_file_name;
+              pos.line_number = w.line_number_at_start;
+              remember_a_message_plural (msgid_mp, string_of_word (&w), &pos);
+            }
+
+          if (position >= msgid_pos && position >= msgid_plural_pos)
+            {
+              /* All wanted positions are behind us: forget the keyword, so
+                 that the next word is treated as a function name again.  */
+              msgid_pos = -1;
+              msgid_plural_pos = -1;
+              msgid_mp = NULL;
+            }
+
+          position++;
+        }
+      else
+        {
+          /* Function position: see whether this word is a known keyword.  */
+          if (w.type == t_string)
+            {
+              char *name = string_of_word (&w);
+              void *entry;
+
+              if (find_entry (&keywords, name, strlen (name), &entry) == 0)
+                {
+                  /* The table value packs both positions into one integer:
+                     the low 10 bits hold the msgid position, the bits above
+                     them the plural position.  */
+                  int packed = (int) (long) entry;
+
+                  msgid_pos = packed & ((1 << 10) - 1);
+                  msgid_plural_pos = packed >> 10;
+                }
+
+              free (name);
+            }
+
+          /* The function name counts as argument 0, so the next word is
+             argument 1.  */
+          position = 1;
+        }
+
+      free_word (&w);
+    }
+}
+
+
+/* Read a list of commands.
+   'looking_for' names the parse terminator that is handed on to
+   read_command: CLOSING_BACKQUOTE, ')' or '\0'.
+   The result is the type of the word at which the command list ended.  */
+static enum word_type
+read_command_list (int looking_for)
+{
+  enum word_type ended_by;
+
+  /* A plain separator merely ends one command; any other terminator ends
+     the whole list.  */
+  do
+    ended_by = read_command (looking_for);
+  while (ended_by == t_separator);
+
+  return ended_by;
+}
+
+
+/* Scan the shell script readable from stream F and add its translatable
+   strings to the message list of the first domain of MDLP.
+   REAL_FILENAME and LOGICAL_FILENAME are stored in the scanner's file-scope
+   state for the duration of the scan; LOGICAL_FILENAME is the name recorded
+   in the positions of extracted messages.  */
+void
+extract_sh (FILE *f,
+            const char *real_filename, const char *logical_filename,
+            msgdomain_list_ty *mdlp)
+{
+  /* Input source and position tracking.  */
+  fp = f;
+  real_file_name = real_filename;
+  logical_file_name = xstrdup (logical_filename);
+  line_number = 1;
+  last_comment_line = -1;
+  last_non_comment_line = -1;
+
+  /* Start outside of any backquote or quote nesting.  */
+  open_singlequote = open_doublequote = false;
+  open_doublequotes_mask = 0;
+  nested_backquotes = 0;
+
+  /* Destination of the extracted messages.  */
+  mlp = mdlp->item[0]->messages;
+
+  init_keywords ();
+
+  /* Consume commands until end of file; '\0' means that no closing
+     delimiter is being looked for.  */
+  read_command_list ('\0');
+
+  /* Detach the scanner state from this file.  The copy of the logical file
+     name is not freed here (NOTE(review): presumably because recorded
+     message positions may still refer to it -- confirm).  */
+  line_number = 0;
+  logical_file_name = NULL;
+  real_file_name = NULL;
+  fp = NULL;
+}
diff --git a/gettext-tools/src/x-sh.h b/gettext-tools/src/x-sh.h

new file mode 100644 (file)

index 0000000..84b8b27
--- /dev/null
+++ b/gettext-tools/src/x-sh.h
@@ -0,0 +1,33 @@
+/* xgettext sh backend.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+/* EXTENSIONS_SH and SCANNERS_SH each expand to comma-terminated initializer
+   entries (NOTE(review): presumably spliced into xgettext's file extension
+   and scanner tables -- confirm in xgettext.c).  EXTENSIONS_SH maps the
+   file name extensions "sh" and "bash" to the language name "Shell";
+   SCANNERS_SH associates the language name "Shell" with extract_sh and
+   formatstring_sh.  */
+#define EXTENSIONS_SH \
+  { "sh",    "Shell"   },                                              \
+  { "bash",  "Shell"   },                                              \
+
+#define SCANNERS_SH \
+  { "Shell",      extract_sh, &formatstring_sh, NULL },                        \
+
+/* Scan a shell script file and add its translatable strings to mdlp.
+   fp is the stream the script is read from.  logical_filename is the file
+   name recorded in the positions of the extracted messages; real_filename
+   is kept alongside it during the scan (NOTE(review): presumably for
+   diagnostics -- confirm).  The messages are added to the first domain of
+   mdlp.  */
+extern void extract_sh (FILE *fp, const char *real_filename,
+                       const char *logical_filename,
+                       msgdomain_list_ty *mdlp);
+
+extern void x_sh_keyword (const char *keyword); /* NOTE(review): presumably
+   registers KEYWORD in the keyword table consulted during extraction; the
+   definition is not visible here -- confirm in x-sh.c.  */
+extern void x_sh_extract_all (void); /* NOTE(review): presumably switches on
+   the mode in which every string word is extracted -- confirm in x-sh.c.  */
author	Bruno Haible <bruno@clisp.org>
	Wed, 3 Sep 2003 10:27:17 +0000 (10:27 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 23 Jun 2009 10:10:57 +0000 (12:10 +0200)
gettext-tools/src/x-sh.c	[new file with mode: 0644]	patch \| blob
gettext-tools/src/x-sh.h	[new file with mode: 0644]	patch \| blob