From b440926984299740093f463c7f2328b2efd03f68 Mon Sep 17 00:00:00 2001
From: Bruno Haible <bruno@clisp.org>
Date: Wed, 3 Sep 2003 10:27:17 +0000
Subject: [PATCH] sh/bash script parser.

---
 gettext-tools/src/x-sh.c | 1242 ++++++++++++++++++++++++++++++++++++++
 gettext-tools/src/x-sh.h |   33 +
 2 files changed, 1275 insertions(+)
 create mode 100644 gettext-tools/src/x-sh.c
 create mode 100644 gettext-tools/src/x-sh.h

diff --git a/gettext-tools/src/x-sh.c b/gettext-tools/src/x-sh.c
new file mode 100644
index 000000000..be721f62e
--- /dev/null
+++ b/gettext-tools/src/x-sh.c
@@ -0,0 +1,1242 @@
+/* xgettext sh backend.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "message.h"
+#include "x-sh.h"
+#include "xgettext.h"
+#include "error.h"
+#include "xmalloc.h"
+#include "exit.h"
+#include "hash.h"
+#include "gettext.h"
+
+#define _(s) gettext(s)
+
+/* The sh syntax is defined in POSIX:2001, see
+     http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
+   Summary of sh syntax:
+   - Input is broken into words, which are then subject to
+     - tilde expansion ~...
+     - command substitution `...`
+     - variable substitution $var
+     - arithmetic substitution $((...))
+     - field splitting at whitespace (IFS)
+     - wildcard pattern expansion *?
+     - quote removal
+   - Strings are enclosed in "..."; command substitution, variable
+     substitution and arithmetic substitution are performed here as well.
+   - '...' is a string without substitutions.
+   - The list of resulting words is split into commands by semicolon and
+     newline.
+   - '#' at the beginning of a word introduces a comment until end of line.
+   The reference parser for this syntax is bash-2.05b/parse.y.  */
+
+
+/* ====================== Keyword set customization.  ====================== */
+
+/* If true, extract all constant strings, not only keyword arguments.  */
+static bool extract_all = false;
+
+static hash_table keywords;		/* keyword name -> argnum1 + (argnum2 << 10) */
+static bool default_keywords = true;	/* false once defaults are installed or disabled */
+
+/* Request that every constant string word be extracted, keyword or not.  */
+void
+x_sh_extract_all ()
+{
+  extract_all = true;
+}
+
+/* Register the keyword spec NAME (e.g. "ngettext:1,2"); NULL disables defaults.  */
+void
+x_sh_keyword (const char *name)
+{
+  if (name == NULL)
+    default_keywords = false;
+  else
+    {
+      const char *end;
+      int argnum1;
+      int argnum2;
+      const char *colon;
+
+      if (keywords.table == NULL)
+	init_hash (&keywords, 100);
+
+      split_keywordspec (name, &end, &argnum1, &argnum2);
+
+      /* The characters between name and end should form a valid C identifier.
+	 A colon before end means an invalid parse in split_keywordspec().  */
+      colon = strchr (name, ':');
+      if (colon == NULL || colon >= end)
+	{
+	  if (argnum1 == 0)
+	    argnum1 = 1;	/* no position given: use the first argument */
+	  insert_entry (&keywords, name, end - name,
+			(void *) (long) (argnum1 + (argnum2 << 10)));	/* unpacked in read_command */
+	}
+    }
+}
+
+/* Install the default keywords unless they were disabled or already installed.
+   Called after argument processing, before each file is processed.  */
+static void
+init_keywords ()
+{
+  if (default_keywords)
+    {
+      x_sh_keyword ("gettext");
+      x_sh_keyword ("ngettext:1,2");
+      x_sh_keyword ("eval_gettext");
+      x_sh_keyword ("eval_ngettext:1,2");
+      default_keywords = false;	/* do not register them again for the next file */
+    }
+}
+
+
+/* ======================== Reading of characters.  ======================== */
+
+/* Real filename, used in error messages about the input file.  */
+static const char *real_file_name;
+
+/* Logical filename and line number, used to label the extracted messages.  */
+static char *logical_file_name;
+static int line_number;		/* starts at 1; tracks newlines read and pushed back */
+
+/* The input file stream; set by extract_sh for the duration of a scan.  */
+static FILE *fp;
+
+
+/* Fetch the next character from fp (or EOF), keeping line_number current.  */
+static int
+do_getc ()
+{
+  int c = getc (fp);
+
+  if (c == EOF)
+    {
+      if (ferror (fp))	/* EOF caused by a read error rather than end of file */
+	error (EXIT_FAILURE, errno, _("\
+error while reading \"%s\""), real_file_name);
+    }
+  else if (c == '\n')
+   line_number++;
+
+  return c;
+}
+
+/* Put back the last fetched character C (not EOF), undoing its line count.  */
+static void
+do_ungetc (int c)
+{
+  if (c == '\n')
+    line_number--;
+  ungetc (c, fp);
+}
+
+
+/* Phase 1: remove backslash followed by newline from the input stream.
+   Cope with potentially 2 characters of pushback.  */
+
+/* Pushback stack for phase 1.  Maximum used guaranteed to be < 4.  */
+static int phase1_pushback[4];
+static int phase1_pushback_length;	/* number of entries in use */
+
+static int
+phase1_getc ()
+{
+  int c;
+
+  if (phase1_pushback_length)	/* serve pushed-back characters first */
+    {
+      c = phase1_pushback[--phase1_pushback_length];
+      if (c == '\n')
+	++line_number;	/* phase1_ungetc decremented it */
+      return c;
+    }
+  for (;;)
+    {
+      c = do_getc ();
+      if (c != '\\')
+	return c;
+      c = do_getc ();
+      if (c != '\n')
+	{
+	  /* A backslash not followed by newline is returned as is.  */
+	  if (c != EOF)
+	    do_ungetc (c);
+	  return '\\';
+	}
+      /* Backslash-newline: both characters are dropped; keep reading.  */
+    }
+}
+
+static void
+phase1_ungetc (int c)	/* push C back so that phase1_getc returns it next */
+{
+  switch (c)
+    {
+    case EOF:	/* EOF is never stored */
+      break;
+
+    case '\n':
+      --line_number;	/* phase1_getc counts it again when it is re-read */
+      /* FALLTHROUGH */
+
+    default:
+      phase1_pushback[phase1_pushback_length++] = c;
+      break;
+    }
+}
+
+
+/* ========================== Reading of tokens.  ========================== */
+
+
+/* A token consists of a sequence of characters.  */
+struct token
+{
+  int allocated;		/* capacity of chars[], in characters */
+  int charcount;		/* number of characters in use */
+  char *chars;			/* the token's constituents; not NUL-terminated */
+};
+
+/* Initialize *TP as an empty token with room for 10 characters.  */
+static inline void
+init_token (struct token *tp)
+{
+  tp->allocated = 10;
+  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
+  tp->charcount = 0;
+}
+
+/* Free the character buffer of *TP.  The struct itself is not freed.  */
+static inline void
+free_token (struct token *tp)
+{
+  free (tp->chars);
+}
+
+/* Ensure there is room in *TP for one more character, doubling when full.  */
+static inline void
+grow_token (struct token *tp)
+{
+  if (tp->charcount == tp->allocated)
+    {
+      tp->allocated *= 2;
+      tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
+    }
+}
+
+/* Return a freshly xmalloc'ed, NUL-terminated copy of the characters of TP.  */
+static char *
+string_of_token (const struct token *tp)
+{
+  char *str;
+  int n;
+
+  n = tp->charcount;
+  str = (char *) xmalloc (n + 1);	/* one extra byte for the terminator */
+  memcpy (str, tp->chars, n);
+  str[n] = '\0';
+  return str;
+}
+
+
+/* ========================= Accumulating messages ========================= */
+
+/* Message list that receives the extracted strings; set by extract_sh.  */
+static message_list_ty *mlp;
+
+
+/* ========================= Accumulating comments ========================= */
+
+/* Growable buffer holding the text of the comment line being read.  */
+static char *buffer;
+static size_t bufmax;	/* allocated size of buffer */
+static size_t buflen;	/* number of bytes used in buffer */
+
+static inline void
+comment_start ()	/* begin accumulating a new, empty comment line */
+{
+  buflen = 0;
+}
+
+static inline void
+comment_add (int c)	/* append character C to the current comment line */
+{
+  if (buflen >= bufmax)
+    {
+      bufmax = 2 * bufmax + 10;	/* also handles the initial bufmax of 0 */
+      buffer = xrealloc (buffer, bufmax);
+    }
+  buffer[buflen++] = c;
+}
+
+static inline void
+comment_line_end ()	/* finish the comment line and pass it to xgettext_comment_add */
+{
+  while (buflen >= 1
+	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
+    --buflen;	/* strip trailing spaces and tabs */
+  if (buflen >= bufmax)	/* make room for the terminating NUL */
+    {
+      bufmax = 2 * bufmax + 10;
+      buffer = xrealloc (buffer, bufmax);
+    }
+  buffer[buflen] = '\0';
+  xgettext_comment_add (buffer);
+}
+
+
+/* These are for tracking whether comments count as immediately before a
+   keyword.  Both are reset to -1 by extract_sh.  */
+static int last_comment_line;		/* line_number when the last '#' comment started */
+static int last_non_comment_line;	/* line_number after the last word that recorded it */
+
+
+/* ========================= Debackslashification ========================== */
+
+/* This state tracks the effect of backquotes, double-quotes and single-quotes
+   on the parsing of backslashes.  We make a single pass through the input
+   file, keeping the state up to date.  This is much faster than accumulating
+   strings and processing them with explicit debackslashification, like the
+   shell does it.  */
+
+/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.  */
+static unsigned int nested_backquotes;
+
+/* A bit mask indicating which of the currently open `...` or "`...`"
+   constructs are inside double-quotes: "`...`".
+   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
+   Bit position 0 designates the outermost backquotes nesting,
+   bit position 1 the second-outermost backquotes nesting,
+   ...
+   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
+static unsigned int open_doublequotes_mask;
+
+/* A flag indicating whether a double-quote is currently open inside the
+   innermost backquotes nesting.  */
+static bool open_doublequote;
+
+/* A flag indicating whether a single-quote is currently open inside the
+   innermost backquotes nesting.  */
+static bool open_singlequote;
+
+
+/* Functions to update the state.  */
+
+static inline void
+saw_opening_backquote ()	/* enter one more level of backquote nesting */
+{
+  if (open_singlequote)
+    abort ();	/* invariant: never called while a single-quote is open */
+  if (open_doublequote)
+    open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;	/* this level is "`...`" */
+  nested_backquotes++;
+  open_doublequote = false;	/* the new innermost level starts with no open double-quote */
+}
+
+static inline void
+saw_closing_backquote ()	/* leave the innermost backquote nesting */
+{
+  nested_backquotes--;
+  open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;	/* restore the saved bit */
+  open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;	/* clear that bit and above */
+  open_singlequote = false; /* just for safety */
+}
+
+static inline void
+saw_opening_doublequote ()	/* record that a double-quote was opened */
+{
+  if (open_singlequote || open_doublequote)
+    abort ();	/* invariant: no quote may be open already */
+  open_doublequote = true;
+}
+
+static inline void
+saw_closing_doublequote ()	/* record that the open double-quote was closed */
+{
+  if (open_singlequote || !open_doublequote)
+    abort ();	/* invariant: exactly a double-quote must be open */
+  open_doublequote = false;
+}
+
+static inline void
+saw_opening_singlequote ()	/* record that a single-quote was opened */
+{
+  if (open_doublequote || open_singlequote)
+    abort ();	/* invariant: no quote may be open already */
+  open_singlequote = true;
+}
+
+static inline void
+saw_closing_singlequote ()	/* record that the open single-quote was closed */
+{
+  if (open_doublequote || !open_singlequote)
+    abort ();	/* invariant: exactly a single-quote must be open */
+  open_singlequote = false;
+}
+
+
+/* ========================== Reading of commands ========================== */
+
+/* We are only interested in constant strings.  Other words need not be
+   represented precisely.  */
+enum word_type
+{
+  t_string,	/* constant string */
+  t_other,	/* other string */
+  t_separator,	/* command separator: semicolon or newline */
+  t_redirect,	/* redirection: one of < > >| << <<- >> <> <& >& */
+  t_backquote,	/* closing '`' pseudo word */
+  t_paren,	/* closing ')' pseudo word */
+  t_eof		/* EOF marker */
+};
+
+struct word
+{
+  enum word_type type;		/* kind of word, as classified by read_word */
+  struct token *token;		/* for t_string: heap-allocated contents */
+  int line_number_at_start;	/* for t_string: line_number when the word began */
+};
+
+/* Free the token owned by *WP, if it is a t_string.  WP itself is not freed.  */
+static inline void
+free_word (struct word *wp)
+{
+  if (wp->type == t_string)
+    {
+      free_token (wp->token);	/* the character buffer */
+      free (wp->token);		/* the struct token itself */
+    }
+}
+
+/* Return a freshly xmalloc'ed, NUL-terminated copy of a t_string word.  */
+static char *
+string_of_word (const struct word *wp)
+{
+  char *str;
+  int n;
+
+  if (!(wp->type == t_string))
+    abort ();	/* only t_string words carry a token */
+  n = wp->token->charcount;
+  str = (char *) xmalloc (n + 1);
+  memcpy (str, wp->token->chars, n);
+  str[n] = '\0';
+  return str;
+}
+
+
+/* Whitespace recognition: true for unquoted space, tab and newline.  */
+
+static inline bool
+is_whitespace (int c)
+{
+  return (c == ' ' || c == '\t' || c == '\n');
+}
+
+/* Operator character recognition: true for unquoted | & ; < > ( and ).  */
+
+static inline bool
+is_operator_start (int c)
+{
+  return (c == '|' || c == '&' || c == ';' || c == '<' || c == '>'
+	  || c == '(' || c == ')');
+}
+
+
+/* Denotation of a quoted character.
+   The distinction between quoted and unquoted character is important only for
+   the special, whitespace and operator characters; it is irrelevant for
+   alphanumeric characters, '\\' and many others.  */
+#define QUOTED(c) (UCHAR_MAX + 1 + (c))
+/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
+   the following are important:
+     '"'         opening or closing double quote
+     '\''        opening or closing single quote
+     '$'         the unknown result of a dollar expansion
+     '`'         does not occur - replaced with OPENING_BACKQUOTE or
+                 CLOSING_BACKQUOTE
+ */
+#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')	/* above the QUOTED range */
+#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')	/* above OPENING_BACKQUOTE */
+
+/* Pushback stack for phase 2.  Maximum used guaranteed to be < 4.  */
+static int phase2_pushback[4];
+static int phase2_pushback_length;	/* number of entries in use */
+
+/* Forward declaration of local functions.  */
+static void phase2_ungetc (int c);
+
+/* Return the next character, with backslashes removed.
+   The result is QUOTED(c) for some unsigned char c, if the next character
+   is escaped sufficiently often to make it a regular constituent character,
+   or simply an 'unsigned char' if it has its special meaning (of special,
+   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
+   EOF.
+   It's the caller's responsibility to update the state.  */
+static int
+phase2_getc ()
+{
+  int c;
+
+  if (phase2_pushback_length)	/* serve pushed-back characters first */
+    {
+      c = phase2_pushback[--phase2_pushback_length];
+      if (c == '\n')
+	++line_number;	/* phase2_ungetc decremented it */
+      return c;
+    }
+
+  c = phase1_getc ();
+  if (c == EOF)
+    return c;
+  if (c == '\'')	/* inside "..." a single quote is an ordinary character */
+    return (open_doublequote ? QUOTED (c) : c);
+  if (!open_singlequote)
+    {
+      if (c == '"' || c == '$')
+	return c;
+      if (c == '`')
+	return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
+    }
+  if (c == '\\')
+    {
+      /* Number of debackslashification passes that are active at the
+	 current point.  */
+      unsigned int debackslashify =
+	nested_backquotes + (open_singlequote ? 0 : 1);
+      /* Normal number of backslashes that yield a single backslash in the
+	 final output.  */
+      unsigned int expected_count =
+	(unsigned int) 1 << debackslashify;
+      /* Number of backslashes found.  */
+      unsigned int count;
+
+      for (count = 1; count < expected_count; count++)
+	{
+	  c = phase1_getc ();
+	  if (c != '\\')
+	    break;
+	}
+      if (count == expected_count)
+	return '\\';
+
+      /* The count of backslashes is > 0 and < expected_count, therefore the
+	 result depends on c, the first character after the backslashes.
+	 Note: The formulas below don't necessarily have a logic; they were
+	 empirically determined such that 1. the xgettext-30 test succeeds,
+	 2. the behaviour for count == 0 would correspond to the one without
+	 any backslash.  */
+      if (c == '\'')
+	{
+	  if (!open_singlequote && count > (expected_count >> 1))
+	    {
+	      phase1_ungetc (c);
+	      return '\\';
+	    }
+	  else
+	    return (open_doublequote ? QUOTED (c) : c);
+	}
+      else if (c == '"')
+	{
+	  /* Each debackslashification pass converts \\ to \ and \" to ";
+	     passes corresponding to `...` drop a lone " whereas passes
+	     corresponding to "`...`" leave it alone.  Therefore, the
+	     minimum number of backslashes needed to get one double-quote
+	     in the end is  open_doublequotes_mask + 1.  */
+	  if (open_singlequote)
+	    {
+	      if (count > open_doublequotes_mask)
+		{
+		  phase1_ungetc (c);	/* c came from phase 1; re-scan it there so it comes back QUOTED */
+		  return '\\';
+		}
+	      else
+		return QUOTED (c);
+	    }
+	  else
+	    {
+	      if (count > open_doublequotes_mask)
+		return QUOTED (c);
+	      else
+		/* Some of the count values <= open_doublequotes_mask are
+		   actually invalid here, but we assume a syntactically
+		   correct input file anyway.  */
+		return c;
+	    }
+	}
+      else if (c == '`')
+	{
+	  /* FIXME: This code looks fishy.  */
+	  if (count == expected_count - 1)
+	    return c;
+	  else
+	    /* Some of the count values < expected_count - 1 are
+	       actually invalid here, but we assume a syntactically
+	       correct input file anyway.  */
+	    if (nested_backquotes > 0 && !open_singlequote
+		&& count >= (expected_count >> 2))
+	      return OPENING_BACKQUOTE;
+	    else
+	      return CLOSING_BACKQUOTE;
+	}
+      else if (c == '$')
+	{
+	  if (open_singlequote)
+	    return QUOTED (c);
+	  if (count >= (expected_count >> 1))
+	    return QUOTED (c);
+	  else
+	    return c;
+	}
+      else
+	{
+	  /* When not followed by a quoting character or backslash or dollar,
+	     a backslash survives a debackslashification pass unmodified.
+	     Therefore each debackslashification pass performs a
+	       count := (count + 1) >> 1
+	     operation.  Therefore the minimum number of backslashes needed
+	     to get one backslash in the end is  (expected_count >> 1) + 1.  */
+	  if (open_doublequote || open_singlequote)
+	    {
+	      if (count > 0)
+		{
+		  phase1_ungetc (c);
+		  return '\\';
+		}
+	      else
+		return QUOTED (c);
+	    }
+	  else
+	    {
+	      if (count > (expected_count >> 1))
+		{
+		  phase1_ungetc (c);
+		  return '\\';
+		}
+	      else if (count > 0)
+		return QUOTED (c);
+	      else
+		return c;
+	    }
+	}
+    }
+
+  /* An ordinary character: quoted iff some quote is currently open.  */
+  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
+}
+
+static void
+phase2_ungetc (int c)	/* push C back so that phase2_getc returns it next, unprocessed */
+{
+  switch (c)
+    {
+    case EOF:	/* EOF is never stored */
+      break;
+
+    case '\n':
+      --line_number;	/* phase2_getc counts it again when it is re-read */
+      /* FALLTHROUGH */
+
+    default:
+      phase2_pushback[phase2_pushback_length++] = c;
+      break;
+    }
+}
+
+
+/* Forward declaration of local functions.  */
+static enum word_type read_command_list (int looking_for);
+
+
+
+/* Read the next word into *WP: its type, plus token and start line if t_string.
+   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+   or '\0'.  */
+static void
+read_word (struct word *wp, int looking_for)
+{
+  int c;
+  bool all_unquoted_digits;
+
+  do
+    {
+      c = phase2_getc ();
+      if (c == '#')
+	{
+	  /* Skip a comment up to end of line.  */
+	  last_comment_line = line_number;
+	  comment_start ();
+	  for (;;)
+	    {
+	      c = phase1_getc ();	/* phase 1: no quote processing inside a comment */
+	      if (c == EOF || c == '\n')
+		break;
+	      comment_add (c);
+	    }
+	  comment_line_end ();
+	}
+      if (c == '\n')
+	{
+	  /* Comments assumed to be grouped with a message must immediately
+	     precede it, with no non-whitespace token on a line between
+	     both.  */
+	  if (last_non_comment_line > last_comment_line)
+	    xgettext_comment_reset ();
+	  wp->type = t_separator;
+	  return;
+	}
+    }
+  while (is_whitespace (c));
+
+  if (c == EOF)
+    {
+      wp->type = t_eof;
+      return;
+    }
+
+  if (c == '<' || c == '>')
+    {
+      /* Recognize the redirection operators < > >| << <<- >> <> <& >&  */
+      int c2 = phase2_getc ();
+      if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+	{
+	  if (c == '<' && c2 == '<')
+	    {
+	      int c3 = phase2_getc ();
+	      if (c3 != '-')
+		phase2_ungetc (c3);
+	    }
+	}
+      else
+	phase2_ungetc (c2);
+      wp->type = t_redirect;
+      return;
+    }
+
+  if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
+    {
+      saw_closing_backquote ();
+      wp->type = t_backquote;
+      last_non_comment_line = line_number;
+      return;
+    }
+
+  if (looking_for == ')' && c == ')')
+    {
+      wp->type = t_paren;
+      last_non_comment_line = line_number;
+      return;
+    }
+
+  if (is_operator_start (c))
+    {
+      wp->type = (c == ';' ? t_separator : t_other);
+      return;
+    }
+
+  /* An ordinary word.  It stays t_string until a non-constant part is seen.  */
+  wp->type = t_string;
+  wp->token = (struct token *) xmalloc (sizeof (struct token));
+  init_token (wp->token);
+  wp->line_number_at_start = line_number;
+  all_unquoted_digits = true;
+
+  for (;; c = phase2_getc ())
+    {
+      if (c == EOF)
+	break;
+
+      if (all_unquoted_digits && (c == '<' || c == '>'))
+	{
+	  /* Recognize the redirection operators < > >| << <<- >> <> <& >&
+	     prefixed with a nonempty sequence of unquoted digits.  */
+	  int c2 = phase2_getc ();
+	  if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
+	    {
+	      if (c == '<' && c2 == '<')
+		{
+		  int c3 = phase2_getc ();
+		  if (c3 != '-')
+		    phase2_ungetc (c3);
+		}
+	    }
+	  else
+	    phase2_ungetc (c2);
+
+	  wp->type = t_redirect;
+	  free_token (wp->token);	/* the digits collected so far are discarded */
+	  free (wp->token);
+
+	  last_non_comment_line = line_number;
+
+	  return;
+	}
+
+      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
+
+      if (c == '$')
+	{
+	  int c2 = phase2_getc ();
+	  if (c2 == '(')
+	    {
+	      int c3 = phase2_getc ();
+	      if (c3 == '(')
+		{
+		  /* Arithmetic expression.  Skip until the matching closing
+		     parenthesis.  */
+		  unsigned int depth = 2;	/* both parentheses of "$((" are open */
+
+		  do
+		    {
+		      c = phase2_getc ();
+		      if (c == '(')
+			depth++;
+		      else if (c == ')')
+			if (--depth == 0)
+			  break;
+		    }
+		  while (c != EOF);
+		}
+	      else
+		{
+		  /* Command substitution.  */
+		  phase2_ungetc (c3);
+		  read_command_list (')');
+		}
+	    }
+	  else if (c2 == '\'' && !open_singlequote)
+	    {
+	      /* Bash builtin for string with ANSI-C escape sequences.  */
+	      saw_opening_singlequote ();
+	      for (;;)
+		{
+		  c = phase2_getc ();
+		  if (c == EOF)
+		    break;
+		  if (c == '\'')
+		    {
+		      saw_closing_singlequote ();
+		      break;
+		    }
+		  if (c == '\\')
+		    {
+		      c = phase2_getc ();
+		      switch (c)	/* NOTE(review): with a single-quote open, phase2_getc appears to return QUOTED(c) for ordinary characters, so the letter and digit cases may never match - confirm */
+			{
+			default:
+			  phase2_ungetc (c);
+			  c = '\\';
+			  break;
+
+			case '\\':
+			  break;
+			case '\'':
+			  /* Don't call saw_closing_singlequote () here.  */
+			  break;
+
+			case 'a':
+			  c = '\a';
+			  break;
+			case 'b':
+			  c = '\b';
+			  break;
+			case 'e':
+			  c = 0x1b; /* ESC */
+			  break;
+			case 'f':
+			  c = '\f';
+			  break;
+			case 'n':
+			  c = '\n';
+			  break;
+			case 'r':
+			  c = '\r';
+			  break;
+			case 't':
+			  c = '\t';
+			  break;
+			case 'v':
+			  c = '\v';
+			  break;
+
+			case 'x':	/* \x followed by one or two hexadecimal digits */
+			  c = phase2_getc ();
+			  if ((c >= '0' && c <= '9')
+			      || (c >= 'A' && c <= 'F')
+			      || (c >= 'a' && c <= 'f'))
+			    {
+			      int n;
+
+			      if (c >= '0' && c <= '9')
+				n = c - '0';
+			      else if (c >= 'A' && c <= 'F')
+				n = 10 + c - 'A';
+			      else if (c >= 'a' && c <= 'f')
+				n = 10 + c - 'a';
+			      else
+				abort ();
+
+			      c = phase2_getc ();
+			      if ((c >= '0' && c <= '9')
+				  || (c >= 'A' && c <= 'F')
+				  || (c >= 'a' && c <= 'f'))
+				{
+				  if (c >= '0' && c <= '9')
+				    n = n * 16 + c - '0';
+				  else if (c >= 'A' && c <= 'F')
+				    n = n * 16 + 10 + c - 'A';
+				  else if (c >= 'a' && c <= 'f')
+				    n = n * 16 + 10 + c - 'a';
+				  else
+				    abort ();
+				}
+			      else
+				phase2_ungetc (c);
+
+			      c = n;
+			    }
+			  else
+			    {
+			      phase2_ungetc (c);
+			      phase2_ungetc ('x');
+			      c = '\\';
+			    }
+			  break;
+
+			case '0': case '1': case '2': case '3':
+			case '4': case '5': case '6': case '7':
+			  {
+			    int n = c - '0';	/* up to three octal digits in total */
+
+			    c = phase2_getc ();
+			    if (c >= '0' && c <= '7')
+			      {
+				n = n * 8 + c - '0';
+
+				c = phase2_getc ();
+				if (c >= '0' && c <= '7')
+				  n = n * 8 + c - '0';
+				else
+				  phase2_ungetc (c);
+			      }
+			    else
+			      phase2_ungetc (c);
+
+			    c = n;
+			  }
+			  break;
+			}
+		    }
+		  if (wp->type == t_string)
+		    {
+		      grow_token (wp->token);
+		      wp->token->chars[wp->token->charcount++] =
+			(unsigned char) c;	/* the cast drops the QUOTED offset */
+		    }
+		}
+	      /* The result is a literal string.  Don't change wp->type.  */
+	      continue;
+	    }
+	  else if (c2 == '"' && !open_doublequote)
+	    {
+	      /* Bash builtin for internationalized string.  */
+	      lex_pos_ty pos;
+	      struct token string;
+
+	      saw_opening_doublequote ();
+	      pos.file_name = logical_file_name;
+	      pos.line_number = line_number;
+	      init_token (&string);
+	      for (;;)
+		{
+		  c = phase2_getc ();
+		  if (c == EOF)
+		    break;
+		  if (c == '"')
+		    {
+		      saw_closing_doublequote ();
+		      break;
+		    }
+		  grow_token (&string);
+		  string.chars[string.charcount++] = (unsigned char) c;
+		}
+	      remember_a_message (mlp, string_of_token (&string), &pos);	/* extracted regardless of keywords */
+	      free_token (&string);
+	      /* The result at runtime is not constant. Therefore we
+		 change wp->type.  */
+	    }
+	  else
+	    phase2_ungetc (c2);
+	  wp->type = t_other;
+	  continue;
+	}
+
+      if (c == '\'')
+	{
+	  if (!open_singlequote)
+	    {
+	      /* Handle an opening single quote.  */
+	      saw_opening_singlequote ();
+	    }
+	  else
+	    {
+	      /* Handle a closing single quote.  */
+	      saw_closing_singlequote ();
+	    }
+	  continue;
+	}
+
+      if (c == '"')
+	{
+	  if (!open_doublequote)
+	    {
+	      /* Handle an opening double quote.  */
+	      saw_opening_doublequote ();
+	    }
+	  else
+	    {
+	      /* Handle a closing double quote.  */
+	      saw_closing_doublequote ();
+	    }
+	  continue;
+	}
+
+      if (c == OPENING_BACKQUOTE)
+	{
+	  /* Handle an opening backquote.  */
+	  saw_opening_backquote ();
+
+	  read_command_list (CLOSING_BACKQUOTE);
+
+	  wp->type = t_other;
+	  continue;
+	}
+      if (c == CLOSING_BACKQUOTE)
+	break;
+
+      if (!open_singlequote && !open_doublequote
+	  && (is_whitespace (c) || is_operator_start (c)))
+	break;
+
+      if (wp->type == t_string)
+	{
+	  grow_token (wp->token);
+	  wp->token->chars[wp->token->charcount++] = (unsigned char) c;
+	}
+    }
+
+  phase2_ungetc (c);	/* the terminating character is left for the next call */
+
+  if (wp->type != t_string)
+    {
+      free_token (wp->token);	/* only t_string words keep their token */
+      free (wp->token);
+    }
+  last_non_comment_line = line_number;
+}
+
+
+/* Read the next command, extracting messages from keyword arguments.
+   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+   or '\0'.
+   Returns the type of the word that terminated the command.  */
+static enum word_type
+read_command (int looking_for)
+{
+  /* Read the words that make up the command.
+     Here we completely ignore field splitting at whitespace and wildcard
+     expansions; i.e. we assume that the source is written in such a way that
+     every word in the program determines exactly one word in the resulting
+     command.
+     But we do not require that the 'gettext'/'ngettext' command is the
+     first in the command; this is because 1. we want to allow for prefixes
+     like "$verbose" that may expand to nothing, and 2. it's a big effort
+     to know where a command starts in a $(for ...) or $(case ...) compound
+     command.  */
+  int arg = 0;			/* Current argument number.  */
+  bool arg_of_redirect = false;	/* True right after a redirection operator.  */
+  int argnum1 = -1;		/* First string position.  */
+  int argnum2 = -1;		/* Plural string position.  */
+  message_ty *plural_mp = NULL;	/* Remember the msgid.  */
+
+  for (;;)
+    {
+      struct word inner;
+
+      read_word (&inner, looking_for);
+
+      /* Recognize end of command.  */
+      if (inner.type == t_separator
+	  || inner.type == t_backquote || inner.type == t_paren
+	  || inner.type == t_eof)
+	return inner.type;	/* none of these types owns a token */
+
+      if (extract_all)
+	{
+	  if (inner.type == t_string)
+	    {
+	      lex_pos_ty pos;
+
+	      pos.file_name = logical_file_name;
+	      pos.line_number = inner.line_number_at_start;
+	      remember_a_message (mlp, string_of_word (&inner), &pos);
+	    }
+	}
+      else
+	{
+	  if (arg_of_redirect)
+	    {
+	      /* Ignore arguments of redirection operators.  */
+	      arg_of_redirect = false;
+	    }
+	  else if (inner.type == t_redirect)
+	    {
+	      /* Ignore this word and the following one.  */
+	      arg_of_redirect = true;
+	    }
+	  else
+	    {
+	      if (argnum1 < 0 && argnum2 < 0)
+		{
+		  /* This is the function position.  */
+		  arg = 0;
+		  if (inner.type == t_string)
+		    {
+		      char *function_name = string_of_word (&inner);
+		      void *keyword_value;
+
+		      if (find_entry (&keywords,
+				      function_name, strlen (function_name),
+				      &keyword_value)
+			  == 0)
+			{
+			  argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);	/* low 10 bits */
+			  argnum2 = (int) (long) keyword_value >> 10;	/* remaining bits */
+			}
+
+		      free (function_name);
+		    }
+		}
+	      else
+		{
+		  /* These are the argument positions.
+		     Extract a string if we have reached the right
+		     argument position.  */
+		  if (arg == argnum1)
+		    {
+		      if (inner.type == t_string)
+			{
+			  lex_pos_ty pos;
+			  message_ty *mp;
+
+			  pos.file_name = logical_file_name;
+			  pos.line_number = inner.line_number_at_start;
+			  mp = remember_a_message (mlp, string_of_word (&inner), &pos);
+			  if (argnum2 > 0)
+			    plural_mp = mp;	/* the plural form is attached to it later */
+			}
+		    }
+		  else if (arg == argnum2)
+		    {
+		      if (inner.type == t_string && plural_mp != NULL)
+			{
+			  lex_pos_ty pos;
+
+			  pos.file_name = logical_file_name;
+			  pos.line_number = inner.line_number_at_start;
+			  remember_a_message_plural (plural_mp, string_of_word (&inner), &pos);
+			}
+		    }
+
+		  if (arg >= argnum1 && arg >= argnum2)
+		    {
+		      /* Stop looking for arguments of the last function_name.  */
+		      argnum1 = -1;
+		      argnum2 = -1;
+		      plural_mp = NULL;
+		    }
+		}
+
+	      arg++;
+	    }
+	}
+
+      free_word (&inner);
+    }
+}
+
+
+/* Read commands until one ends with something other than a separator.
+   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
+   or '\0'.
+   Returns the type of the word that terminated the command list.  */
+static enum word_type
+read_command_list (int looking_for)
+{
+  for (;;)
+    {
+      enum word_type terminator;
+
+      terminator = read_command (looking_for);
+      if (terminator != t_separator)	/* t_backquote, t_paren or t_eof */
+	return terminator;
+    }
+}
+
+/* Scan the shell script F; add its strings to the first message list of MDLP.  */
+void
+extract_sh (FILE *f,
+	    const char *real_filename, const char *logical_filename,
+	    msgdomain_list_ty *mdlp)
+{
+  mlp = mdlp->item[0]->messages;
+
+  fp = f;
+  real_file_name = real_filename;
+  logical_file_name = xstrdup (logical_filename);	/* NOTE(review): never freed here; presumably the recorded positions keep pointing at it - confirm */
+  line_number = 1;
+
+  last_comment_line = -1;
+  last_non_comment_line = -1;
+
+  nested_backquotes = 0;
+  open_doublequotes_mask = 0;
+  open_doublequote = false;
+  open_singlequote = false;
+
+  init_keywords ();
+
+  /* Eat tokens until eof is seen.  */
+  read_command_list ('\0');
+
+  /* Drop the per-file reader state.  */
+  fp = NULL;
+  real_file_name = NULL;
+  logical_file_name = NULL;
+  line_number = 0;
+}
diff --git a/gettext-tools/src/x-sh.h b/gettext-tools/src/x-sh.h
new file mode 100644
index 000000000..84b8b2793
--- /dev/null
+++ b/gettext-tools/src/x-sh.h
@@ -0,0 +1,33 @@
+/* xgettext sh backend.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+/* Entries presumably spliced into the extension and scanner tables of xgettext.  */
+#define EXTENSIONS_SH \
+  { "sh",    "Shell"   },						\
+  { "bash",  "Shell"   },						\
+
+#define SCANNERS_SH \
+  { "Shell",      extract_sh, &formatstring_sh, NULL },			\
+
+/* Scan a shell script file and add its translatable strings to mdlp.  */
+extern void extract_sh (FILE *fp, const char *real_filename,
+			const char *logical_filename,
+			msgdomain_list_ty *mdlp);
+
+extern void x_sh_keyword (const char *keyword);	/* NULL disables the default keywords */
+extern void x_sh_extract_all (void);	/* extract every constant string word */
-- 
2.47.3