if (errno == EILSEQ)
{
/* An invalid multibyte sequence was encountered. */
- multiline_error (xstrdup (""),
- xasprintf (_("\
-%s:%d: Invalid multibyte sequence.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
- real_file_name, line_number));
- exit (EXIT_FAILURE);
+ goto invalid;
}
else if (errno == EINVAL)
{
/* Read one more byte and retry iconv. */
c = phase1_getc ();
if (c == EOF)
- {
- multiline_error (xstrdup (""),
- xasprintf (_("\
-%s:%d: Incomplete multibyte sequence at end of file.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
- real_file_name, line_number));
- exit (EXIT_FAILURE);
- }
+ goto incomplete_at_eof;
if (c == '\n')
- {
- multiline_error (xstrdup (""),
- xasprintf (_("\
-%s:%d: Incomplete multibyte sequence at end of line.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
- real_file_name, line_number - 1));
- exit (EXIT_FAILURE);
- }
+ goto incomplete_at_eol;
buf[bufcount++] = (unsigned char) c;
}
else
{
/* scratchbuf contains an out-of-range Unicode character
(> 0x10ffff). */
- multiline_error (xstrdup (""),
- xasprintf (_("\
-%s:%d: Invalid multibyte sequence.\n\
-Please specify the source encoding through --from-code or through a comment\n\
-as specified in https://www.python.org/peps/pep-0263.html.\n"),
- real_file_name, line_number));
- exit (EXIT_FAILURE);
+ goto invalid;
}
return uc;
}
}
else
{
- /* Read an UTF-8 encoded character. */
- unsigned char buf[6];
- unsigned int count;
+ /* Read a UTF-8 encoded character.
+ Reject invalid input, like u8_mbtouc does. */
int c;
ucs4_t uc;
c = phase1_getc ();
if (c == EOF)
return UEOF;
- buf[0] = c;
- count = 1;
-
- if (buf[0] >= 0xc0)
- {
- c = phase1_getc ();
- if (c == EOF)
- return UEOF;
- buf[1] = c;
- count = 2;
- }
-
- if (buf[0] >= 0xe0
- && ((buf[1] ^ 0x80) < 0x40))
+ if (c < 0x80)
{
- c = phase1_getc ();
- if (c == EOF)
- return UEOF;
- buf[2] = c;
- count = 3;
+ uc = c;
}
-
- if (buf[0] >= 0xf0
- && ((buf[1] ^ 0x80) < 0x40)
- && ((buf[2] ^ 0x80) < 0x40))
+ else if (c < 0xc2)
+ goto invalid;
+ else if (c < 0xe0)
{
- c = phase1_getc ();
- if (c == EOF)
- return UEOF;
- buf[3] = c;
- count = 4;
+ int c1 = phase1_getc ();
+ if (c1 == EOF)
+ goto incomplete_at_eof;
+ if (c1 == '\n')
+ goto incomplete_at_eol;
+ if ((c1 ^ 0x80) < 0x40)
+ uc = ((unsigned int) (c & 0x1f) << 6)
+ | (unsigned int) (c1 ^ 0x80);
+ else
+ goto invalid;
}
-
- if (buf[0] >= 0xf8
- && ((buf[1] ^ 0x80) < 0x40)
- && ((buf[2] ^ 0x80) < 0x40)
- && ((buf[3] ^ 0x80) < 0x40))
+ else if (c < 0xf0)
{
- c = phase1_getc ();
- if (c == EOF)
- return UEOF;
- buf[4] = c;
- count = 5;
+ int c1 = phase1_getc ();
+ if (c1 == EOF)
+ goto incomplete_at_eof;
+ if (c1 == '\n')
+ goto incomplete_at_eol;
+ if ((c1 ^ 0x80) < 0x40
+ && (c >= 0xe1 || c1 >= 0xa0)
+ && (c != 0xed || c1 < 0xa0))
+ {
+ int c2 = phase1_getc ();
+ if (c2 == EOF)
+ goto incomplete_at_eof;
+ if (c2 == '\n')
+ goto incomplete_at_eol;
+ if ((c2 ^ 0x80) < 0x40)
+ uc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (c1 ^ 0x80) << 6)
+ | (unsigned int) (c2 ^ 0x80);
+ else
+ goto invalid;
+ }
+ else
+ goto invalid;
}
-
- if (buf[0] >= 0xfc
- && ((buf[1] ^ 0x80) < 0x40)
- && ((buf[2] ^ 0x80) < 0x40)
- && ((buf[3] ^ 0x80) < 0x40)
- && ((buf[4] ^ 0x80) < 0x40))
+ else if (c < 0xf8)
{
- c = phase1_getc ();
- if (c == EOF)
- return UEOF;
- buf[5] = c;
- count = 6;
+ int c1 = phase1_getc ();
+ if (c1 == EOF)
+ goto incomplete_at_eof;
+ if (c1 == '\n')
+ goto incomplete_at_eol;
+ if ((c1 ^ 0x80) < 0x40
+ && (c >= 0xf1 || c1 >= 0x90)
+ && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
+ {
+ int c2 = phase1_getc ();
+ if (c2 == EOF)
+ goto incomplete_at_eof;
+ if (c2 == '\n')
+ goto incomplete_at_eol;
+ if ((c2 ^ 0x80) < 0x40)
+ {
+ int c3 = phase1_getc ();
+ if (c3 == EOF)
+ goto incomplete_at_eof;
+ if (c3 == '\n')
+ goto incomplete_at_eol;
+ if ((c3 ^ 0x80) < 0x40)
+ uc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (c1 ^ 0x80) << 12)
+ | ((unsigned int) (c2 ^ 0x80) << 6)
+ | (unsigned int) (c3 ^ 0x80);
+ else
+ goto invalid;
+ }
+ else
+ goto invalid;
+ }
+ else
+ goto invalid;
}
+ else
+ goto invalid;
- u8_mbtouc (&uc, buf, count);
return uc;
}
+
+ invalid:
+ /* An invalid multibyte sequence was encountered. */
+ multiline_error (xstrdup (""),
+ xasprintf (_("\
+%s:%d: Invalid multibyte sequence.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+ real_file_name, line_number));
+ exit (EXIT_FAILURE);
+
+ incomplete_at_eof:
+ multiline_error (xstrdup (""),
+ xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of file.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+ real_file_name, line_number));
+ exit (EXIT_FAILURE);
+
+ incomplete_at_eol:
+ multiline_error (xstrdup (""),
+ xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of line.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+ real_file_name, line_number - 1));
+ exit (EXIT_FAILURE);
}
/* Supports max (9, UNINAME_MAX + 3) pushback characters. */
#else
error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
- xgettext_global_source_encoding, po_charset_utf8,
+ xgettext_current_file_source_encoding, po_charset_utf8,
basename (program_name));
#endif
}
last_comment_line = -1;
last_non_comment_line = -1;
- xgettext_current_file_source_encoding = xgettext_global_source_encoding;
+ /* For Python, the default source file encoding is UTF-8. This is specified
+ in PEP 3120. */
+ xgettext_current_file_source_encoding =
+ (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
+ po_charset_utf8);
#if HAVE_ICONV
xgettext_current_file_source_iconv = xgettext_global_source_iconv;
#endif
/* Set initial value of variables. */
default_domain = MESSAGE_DOMAIN_DEFAULT;
- xgettext_global_source_encoding = po_charset_ascii;
+ xgettext_global_source_encoding = NULL;
init_flag_table_c ();
init_flag_table_objc ();
init_flag_table_gcc_internal ();
/* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
from ASCII or UTF-8, when this conversion is a no-op). */
- if (xgettext_global_source_encoding != po_charset_ascii
+ if (xgettext_global_source_encoding != NULL
+ && xgettext_global_source_encoding != po_charset_ascii
&& xgettext_global_source_encoding != po_charset_utf8)
{
#if HAVE_ICONV
/* Free the allocated converter. */
#if HAVE_ICONV
- if (xgettext_global_source_encoding != po_charset_ascii
+ if (xgettext_global_source_encoding != NULL
+ && xgettext_global_source_encoding != po_charset_ascii
&& xgettext_global_source_encoding != po_charset_utf8)
iconv_close (xgettext_global_source_iconv);
#endif
/* Set the default for the source file encoding. May be overridden by
the extractor function. */
- xgettext_current_source_encoding = xgettext_global_source_encoding;
+ xgettext_current_source_encoding =
+ (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
+ po_charset_ascii);
#if HAVE_ICONV
xgettext_current_source_iconv = xgettext_global_source_iconv;
#endif