#include "progname.h"
#include "xmalloc.h"
#include "exit.h"
+#include "po-charset.h"
#include "ucs4-utf8.h"
#include "uniname.h"
#include "gettext.h"
}
}
-/* Read a here document and return its contents. */
+/* Read a here document and return its contents.
+ The delimiter is a UTF-8 encoded string; the resulting string is UTF-8
+ encoded as well. */
static char *
get_here_document (const char *delimiter)
for (;;)
{
int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
+ char *my_line_utf8;
bool chomp;
if (read_bytes < 0)
++here_eaten;
+ /* Convert to UTF-8. */
+ my_line_utf8 =
+ from_current_source_encoding (my_linebuf, logical_file_name,
+ line_number + here_eaten);
+ if (my_line_utf8 != my_linebuf)
+ {
+ if (strlen (my_line_utf8) >= my_linebuf_size)
+ {
+ my_linebuf_size = strlen (my_line_utf8) + 1;
+ my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
+ }
+ strcpy (my_linebuf, my_line_utf8);
+ free (my_line_utf8);
+ }
+
/* Undosify. This is important for catching the end of <<EOF and
<<'EOF'. We could rely on stdio doing this for us, but
it is not uncommon to come across Perl scripts with CRLF
size_t buflen;
int lineno;
int c;
+ char *utf8_string;
c = phase1_getc ();
if (c == '#')
break;
}
}
+ /* Accumulate the comment. */
for (;;)
{
c = phase1_getc ();
buffer = xrealloc (buffer, bufmax);
}
buffer[buflen] = '\0';
- xgettext_comment_add (buffer);
+ /* Convert it to UTF-8. */
+ utf8_string =
+ from_current_source_encoding (buffer, logical_file_name, lineno);
+ /* Save it until we encounter the corresponding string. */
+ xgettext_current_source_encoding = po_charset_utf8;
+ xgettext_comment_add (utf8_string);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
last_comment_line = lineno;
}
return c;
token_type_named_op, /* if, unless, while, ... */
token_type_variable, /* $... */
token_type_symbol, /* symbol, number */
- token_type_keyword_symbol, /* keyword symbol (used by parser) */
token_type_regex_op, /* s, tr, y, m. */
token_type_dot, /* . */
- token_type_other /* regexp, misc. operator */
+ token_type_other, /* regexp, misc. operator */
+ /* The following are not really token types, but variants used by
+ the parser. */
+ token_type_keyword_symbol /* keyword symbol */
};
typedef enum token_type_ty token_type_ty;
{
token_type_ty type;
string_type_ty string_type; /* for token_type_string */
- char *string; /* for token_type_named_op, token_type_string,
- token_type_symbol, token_type_keyword_symbol,
- token_type_variable */
+ char *string; /* for: in encoding:
+ token_type_named_op ASCII
+ token_type_string UTF-8
+ token_type_symbol ASCII
+ token_type_variable global_source_encoding
+ */
int line_number;
};
return "token_type_variable";
case token_type_symbol:
return "token_type_symbol";
- case token_type_keyword_symbol:
- return "token_type_keyword_symbol";
case token_type_regex_op:
return "token_type_regex_op";
case token_type_dot:
case token_type_named_op:
case token_type_string:
case token_type_symbol:
- case token_type_keyword_symbol:
case token_type_variable:
free (tp->string);
break;
}
}
+/* Variant of extract_quotelike_pass1 whose result is UTF-8 encoded.
+   The raw string is run through from_current_source_encoding, with
+   logical_file_name and line_number passed along for that call.  When the
+   conversion hands back a different pointer, the raw string is released
+   here, so the only string left alive is the one that is returned.  */
+static char *
+extract_quotelike_pass1_utf8 (int delim)
+{
+  char *raw = extract_quotelike_pass1 (delim);
+  char *converted =
+    from_current_source_encoding (raw, logical_file_name, line_number);
+
+  /* No conversion took place: the raw string already is the result.  */
+  if (converted == raw)
+    return raw;
+
+  free (raw);
+  return converted;
+}
+
/* ========= Reading of tokens and commands. Extracting strings. ========= */
static void
extract_quotelike (token_ty *tp, int delim)
{
- char *string = extract_quotelike_pass1 (delim);
+ char *string = extract_quotelike_pass1_utf8 (delim);
size_t len = strlen (string);
tp->type = token_type_string;
tp->type = token_type_regex_op;
- string = extract_quotelike_pass1 (delim);
+ string = extract_quotelike_pass1_utf8 (delim);
if (interpolate)
interpolate_keywords (mlp, string, line_number);
free (string);
delim = phase2_getc ();
}
}
- string = extract_quotelike_pass1 (delim);
+ string = extract_quotelike_pass1_utf8 (delim);
if (interpolate)
interpolate_keywords (mlp, string, line_number);
free (string);
lex_pos_ty pos;
pos.line_number = line_number;
pos.file_name = logical_file_name;
+ xgettext_current_source_encoding = po_charset_utf8;
remember_a_message (mlp, xstrdup (t1->string), &pos);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
free_token (t2);
free_token (t1);
}
/* Actually a simplified version of extract_variable(). It searches for
variables inside a double-quoted string that may interpolate to
- some keyword hash (reference). */
+ some keyword hash (reference). The string is UTF-8 encoded. */
static void
interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
{
extract_quotelike_pass3 (&token, EXIT_FAILURE);
/* The string can only shrink with interpolation (because
we ignore \Q). */
+ if (!(strlen (token.string) <= bufpos))
+ abort ();
strcpy (buffer, token.string);
free (token.string);
state = wait_rbrace;
buffer[bufpos] = '\0';
token.string = xstrdup (buffer);
extract_quotelike_pass3 (&token, EXIT_FAILURE);
+ xgettext_current_source_encoding = po_charset_utf8;
remember_a_message (mlp, token.string, &pos);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
/* FALLTHROUGH */
default:
state = initial;
case token_type_named_op:
case token_type_string:
case token_type_symbol:
- case token_type_keyword_symbol:
case token_type_variable:
fprintf (stderr, " string: %s\n", token->string);
break;
if (extract_all)
{
lex_pos_ty pos;
+ char *string;
pos.file_name = logical_file_name;
pos.line_number = tp->line_number;
- remember_a_message (mlp, collect_message (mlp, tp,
- EXIT_SUCCESS),
- &pos);
+ string = collect_message (mlp, tp, EXIT_SUCCESS);
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, string, &pos);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
}
else if (state)
{
lex_pos_ty pos;
+ char *string;
pos.file_name = logical_file_name;
pos.line_number = tp->line_number;
if (arg_count == arg_sg)
{
- plural_mp =
- remember_a_message (mlp, collect_message (mlp, tp,
- EXIT_FAILURE),
- &pos);
+ string = collect_message (mlp, tp, EXIT_FAILURE);
+ xgettext_current_source_encoding = po_charset_utf8;
+ plural_mp = remember_a_message (mlp, string, &pos);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
arg_sg = -1;
}
else if (arg_count == arg_pl && plural_mp == NULL)
}
else if (arg_count == arg_pl)
{
- remember_a_message_plural (plural_mp,
- collect_message (mlp, tp,
- EXIT_FAILURE),
- &pos);
+ string = collect_message (mlp, tp, EXIT_FAILURE);
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message_plural (plural_mp, string, &pos);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
arg_pl = -1;
}
}
static struct formatstring_parser *current_formatstring_parser2;
-/* Convert the given string from xgettext_current_source_encoding to
- the output file encoding (i.e. ASCII or UTF-8). */
-#define CONVERT_STRING(string) \
- if (xgettext_current_source_encoding == po_charset_ascii) \
- { \
- if (!is_ascii_string (string)) \
- { \
- char buffer[21]; \
- if (pos->line_number == (size_t)(-1)) \
- buffer[0] = '\0'; \
- else \
- sprintf (buffer, ":%ld", (long) pos->line_number); \
- error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \
- pos->file_name, buffer); \
- } \
- } \
- else if (xgettext_current_source_encoding != po_charset_utf8) \
- { \
- string = convert_string (xgettext_current_source_iconv, string); \
- }
-
#if !HAVE_ICONV
/* If we don't have iconv(), the only supported values for
xgettext_global_source_encoding and thus also for
#define convert_string(cd,string) (abort (), (string))
#endif
+/* Convert the given string from xgettext_current_source_encoding to
+   the output file encoding (i.e. ASCII or UTF-8).
+   The resulting string is either the argument string, or freshly allocated.
+   The file_name and line_number are only used for error message purposes.
+   A line_number of (size_t)(-1) makes the error message omit the line
+   number.  When the current source encoding is ASCII and the string
+   contains non-ASCII bytes, the error is reported with EXIT_FAILURE as
+   its status.  */
+char *
+from_current_source_encoding (const char *string,
+                              const char *file_name, size_t line_number)
+{
+  if (xgettext_current_source_encoding == po_charset_ascii)
+    {
+      if (!is_ascii_string (string))
+        {
+          /* Room for ':', at most 3 decimal digits per (8-bit) byte of an
+             unsigned long, and the terminating NUL.  */
+          char buffer[1 + 3 * sizeof (unsigned long) + 1];
+
+          if (line_number == (size_t)(-1))
+            buffer[0] = '\0';
+          else
+            /* Print as unsigned: line_number is a size_t, and formatting
+               it as a signed long could emit a '-' sign that does not fit
+               into a buffer sized for the digits alone.  */
+            sprintf (buffer, ":%lu", (unsigned long) line_number);
+          error (EXIT_FAILURE, 0, _("\
+Non-ASCII string at %s%s.\n\
+Please specify the source encoding through --from-code."),
+                 file_name, buffer);
+        }
+    }
+  else if (xgettext_current_source_encoding != po_charset_utf8)
+    /* Neither ASCII nor UTF-8: convert through the current source iconv
+       descriptor.  */
+    string = convert_string (xgettext_current_source_iconv, string);
+
+  /* The const is dropped on purpose: the result may be the argument
+     itself, as stated in the contract above.  */
+  return (char *) string;
+}
+
+#define CONVERT_STRING(string) /* Reassigns STRING to its converted form; uses the 'pos' visible at the expansion site and already ends in ';'. */ \
+ string = from_current_source_encoding (string, pos->file_name, \
+ pos->line_number);
+
message_ty *
remember_a_message (message_list_ty *mlp, char *string, lex_pos_ty *pos)
CONVERT_STRING (s);
- /* To reduce the possibility of unwanted matches be do a two
+ /* To reduce the possibility of unwanted matches we do a two
step match: the line must contain `xgettext:' and one of
the possible format description strings. */
if ((t = strstr (s, "xgettext:")) != NULL)