/* xgettext Java backend.
- Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2003.
This program is free software; you can redistribute it and/or modify
{
if (bp->utf16_surr != 0)
{
- /* A half surrogate is invalid, therefore use U+FFFD instead. */
+ /* A half surrogate is invalid, therefore use U+FFFD instead.
+ It appears to be valid Java: The Java Language Specification,
+ 3rd ed., says "The Java programming language represents text
+ in sequences of 16-bit code units, using the UTF-16 encoding."
+ but does not impose constraints on the use of \uxxxx escape
+ sequences for surrogates. And the JDK's javac happily groks
+ half surrogates.
+ But a half surrogate is invalid in UTF-8:
+ - RFC 3629 says
+ "The definition of UTF-8 prohibits encoding character
+ numbers between U+D800 and U+DFFF".
+ - Unicode 4.0 chapter 3
+ <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+ section 3.9, p.77, says
+ "Because surrogate code points are not Unicode scalar
+ values, any UTF-8 byte sequence that would otherwise
+ map to code points D800..DFFF is ill-formed."
+ and in table 3-6, p. 78, does not mention D800..DFFF.
+ - The unicode.org FAQ question "How do I convert an unpaired
+ UTF-16 surrogate to UTF-8?" has the answer
+ "By representing such an unpaired surrogate on its own
+ as a 3-byte sequence, the resulting UTF-8 data stream
+ would become ill-formed."
+ So use U+FFFD instead. */
+ error_with_progname = false;
+ error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+ logical_file_name, line_number, bp->utf16_surr);
+ error_with_progname = true;
string_buffer_append_unicode (bp, 0xfffd);
bp->utf16_surr = 0;
}
if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
bp->utf16_surr = UTF16_VALUE (c);
+ else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
+ {
+ /* A half surrogate is invalid, therefore use U+FFFD instead.
+ It appears to be valid Java: The Java Language Specification,
+ 3rd ed., says "The Java programming language represents text
+ in sequences of 16-bit code units, using the UTF-16 encoding."
+ but does not impose constraints on the use of \uxxxx escape
+ sequences for surrogates. And the JDK's javac happily groks
+ half surrogates.
+ But a half surrogate is invalid in UTF-8:
+ - RFC 3629 says
+ "The definition of UTF-8 prohibits encoding character
+ numbers between U+D800 and U+DFFF".
+ - Unicode 4.0 chapter 3
+ <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
+ section 3.9, p.77, says
+ "Because surrogate code points are not Unicode scalar
+ values, any UTF-8 byte sequence that would otherwise
+ map to code points D800..DFFF is ill-formed."
+ and in table 3-6, p. 78, does not mention D800..DFFF.
+ - The unicode.org FAQ question "How do I convert an unpaired
+ UTF-16 surrogate to UTF-8?" has the answer
+ "By representing such an unpaired surrogate on its own
+ as a 3-byte sequence, the resulting UTF-8 data stream
+ would become ill-formed."
+ So use U+FFFD instead. */
+ error_with_progname = false;
+ error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
+ logical_file_name, line_number, UTF16_VALUE (c));
+ error_with_progname = true;
+ string_buffer_append_unicode (bp, 0xfffd);
+ }
else
string_buffer_append_unicode (bp, UTF16_VALUE (c));
}