From: Bruno Haible Date: Tue, 31 Dec 2024 01:43:41 +0000 (+0100) Subject: When reading PO files, treat CRLF line terminators like LF. X-Git-Tag: v0.24~77 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2f8309bf23eb35a51754e32acb7c52fc56885b57;p=thirdparty%2Fgettext.git When reading PO files, treat CRLF line terminators like LF. Reported by Hendy Irawan. * gettext-tools/src/read-po-internal.h (MBFILE_MAX_PUSHBACK): Renamed from NPUSHBACK. Increase by 1. (struct mbfile): Rename field 'have_pushback' to 'pushback_count'. * gettext-tools/src/read-po-lex.c (mbfile_init): Update. (mbfile_getc): Read pushed-back character before testing for sticky EOF. (mbfile_ungetc): Update. (mbfile_getc_normalized): New function. (lex_getc): Invoke mbfile_getc_normalized instead of mbfile_getc. * gettext-tools/tests/testdata/crlf.pot: New file. * gettext-tools/tests/msgcat-23: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it. --- diff --git a/gettext-tools/src/read-po-internal.h b/gettext-tools/src/read-po-internal.h index a04b9f33e..871290c1f 100644 --- a/gettext-tools/src/read-po-internal.h +++ b/gettext-tools/src/read-po-internal.h @@ -56,18 +56,19 @@ typedef struct mbchar mbchar_t[1]; /* Number of characters that can be pushed back. - We need 1 for lex_getc, plus 1 for lex_ungetc. */ -#define NPUSHBACK 2 + We need 1 for mbfile_getc_normalized, plus 1 for lex_getc, + plus 1 for lex_ungetc. */ +#define MBFILE_MAX_PUSHBACK 3 /* Data type of a multibyte character input stream. 
*/ struct mbfile { FILE *fp; bool eof_seen; - int have_pushback; + unsigned int pushback_count; /* <= MBFILE_MAX_PUSHBACK */ unsigned int bufcount; char buf[MBCHAR_BUF_SIZE]; - struct mbchar pushback[NPUSHBACK]; + struct mbchar pushback[MBFILE_MAX_PUSHBACK]; }; /* We want to pass multibyte streams by reference automatically, diff --git a/gettext-tools/src/read-po-lex.c b/gettext-tools/src/read-po-lex.c index b1ed37923..893ef0288 100644 --- a/gettext-tools/src/read-po-lex.c +++ b/gettext-tools/src/read-po-lex.c @@ -571,7 +571,7 @@ mbfile_init (mbfile_t mbf, FILE *stream) { mbf->fp = stream; mbf->eof_seen = false; - mbf->have_pushback = 0; + mbf->pushback_count = 0; mbf->bufcount = 0; } @@ -582,19 +582,19 @@ mbfile_getc (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf) { size_t bytes; - /* If EOF has already been seen, don't use getc. This matters if - mbf->fp is connected to an interactive tty. */ - if (mbf->eof_seen) - goto eof; - /* Return character pushed back, if there is one. */ - if (mbf->have_pushback > 0) + if (mbf->pushback_count > 0) { - mbf->have_pushback--; - mb_copy (mbc, &mbf->pushback[mbf->have_pushback]); + mbf->pushback_count--; + mb_copy (mbc, &mbf->pushback[mbf->pushback_count]); return; } + /* If EOF has already been seen, don't use getc. This matters if + mbf->fp is connected to an interactive tty. */ + if (mbf->eof_seen) + goto eof; + /* Before using iconv, we need at least one byte. */ if (mbf->bufcount == 0) { @@ -791,10 +791,10 @@ eof: static void mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf) { - if (mbf->have_pushback >= NPUSHBACK) + if (mbf->pushback_count >= MBFILE_MAX_PUSHBACK) abort (); - mb_copy (&mbf->pushback[mbf->have_pushback], mbc); - mbf->have_pushback++; + mb_copy (&mbf->pushback[mbf->pushback_count], mbc); + mbf->pushback_count++; } @@ -830,6 +830,35 @@ lex_end (struct po_parser_state *ps) } +/* Read a single character, collapsing the Windows CRLF line terminator + to a single LF. 
+ Supports 1 character of pushback (via mbfile_ungetc). */ +static void +mbfile_getc_normalized (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf) +{ + mbfile_getc (ps, mbc, ps->mbf); + if (!mb_iseof (mbc) && mb_iseq (mbc, '\r')) + { + mbchar_t mbc2; + + mbfile_getc (ps, mbc2, ps->mbf); + if (!mb_iseof (mbc2)) + { + if (mb_iseq (mbc2, '\n')) + /* Eliminate the CR. */ + mb_copy (mbc, mbc2); + else + { + mbfile_ungetc (mbc2, ps->mbf); + /* If we get here, the caller can still do + mbfile_ungetc (mbc, ps->mbf); + since mbfile_getc supports 2 characters of pushback. */ + } + } + } +} + + /* Read a single character, dealing with backslash-newline. Also keep track of the current line number and column number. */ static void @@ -837,7 +866,7 @@ lex_getc (struct po_parser_state *ps, mbchar_t mbc) { for (;;) { - mbfile_getc (ps, mbc, ps->mbf); + mbfile_getc_normalized (ps, mbc, ps->mbf); if (mb_iseof (mbc)) { @@ -867,7 +896,7 @@ lex_getc (struct po_parser_state *ps, mbchar_t mbc) { mbchar_t mbc2; - mbfile_getc (ps, mbc2, ps->mbf); + mbfile_getc_normalized (ps, mbc2, ps->mbf); if (mb_iseof (mbc2)) { diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 6437b14e1..d4a880fb0 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -34,7 +34,7 @@ TESTS = gettext-1 gettext-2 \ msgcat-1 msgcat-2 msgcat-3 msgcat-4 msgcat-5 msgcat-6 msgcat-7 \ msgcat-8 msgcat-9 msgcat-10 msgcat-11 msgcat-12 msgcat-13 msgcat-14 \ msgcat-15 msgcat-16 msgcat-17 msgcat-18 msgcat-19 msgcat-20 msgcat-21 \ - msgcat-22 \ + msgcat-22 msgcat-23 \ msgcat-properties-1 msgcat-properties-2 \ msgcat-stringtable-1 \ msgcmp-1 msgcmp-2 msgcmp-3 msgcmp-4 \ diff --git a/gettext-tools/tests/msgcat-23 b/gettext-tools/tests/msgcat-23 new file mode 100755 index 000000000..b1b5e0858 --- /dev/null +++ b/gettext-tools/tests/msgcat-23 @@ -0,0 +1,19 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . 
../src + +# Test processing of POT files that have CRLF as line terminators. + +: ${MSGCAT=msgcat} +${MSGCAT} -o mcat-test23.out "$wabs_srcdir"/testdata/crlf.pot || Exit 1 + +cat <<\EOF > mcat-test23.ok +#: .\edu\templates\edu\home.html:69 .\edu\templates\edu\music_country.html:34 +#, python-format +msgid "Hi" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} mcat-test23.ok mcat-test23.out || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/testdata/crlf.pot b/gettext-tools/tests/testdata/crlf.pot new file mode 100644 index 000000000..a413e9c72 --- /dev/null +++ b/gettext-tools/tests/testdata/crlf.pot @@ -0,0 +1,5 @@ +#: .\edu\templates\edu\home.html:69 +#: .\edu\templates\edu\music_country.html:34 +#, python-format +msgid "Hi" +msgstr ""