From 9dae67bcc8dbc389e0ceef12f1fa04352ef75b8e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mat=C4=9Bj=20Volf?= Date: Tue, 9 Mar 2021 14:19:58 +0100 Subject: [PATCH] Split lines in lexer only by \r\n, \r and \n Python str.splitlines() splits by more characters[1], which, however, causes problems when keeping these special characters in processed templates is desirable, as seen in bug reports #769, #952, #1313. The keep_trailing_newline logic is reworked because splitlines() removes them already (so they had to be added), while re.split doesn't, so they have to be removed. [1] https://docs.python.org/3/library/stdtypes.html#str.splitlines --- CHANGES.rst | 2 ++ src/jinja2/lexer.py | 15 ++++++++++----- tests/test_regression.py | 7 +++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index dad0d0ef..dd3739fb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -56,6 +56,8 @@ Unreleased instead of a ``TypeError``. :issue:`1198` - ``Undefined`` is iterable in an async environment. :issue:`1294` - ``NativeEnvironment`` supports async mode. :issue:`1362` +- Template rendering only treats ``\n``, ``\r\n`` and ``\r`` as line + breaks. Other characters are left unchanged. :issue:`769, 952, 1313` Version 2.11.3 diff --git a/src/jinja2/lexer.py b/src/jinja2/lexer.py index d992f0d1..0cade7a3 100644 --- a/src/jinja2/lexer.py +++ b/src/jinja2/lexer.py @@ -638,12 +638,17 @@ class Lexer: def tokeniter(self, source, name, filename=None, state=None): """This method tokenizes the text and returns the tokens in a - generator. Use this method if you just want to tokenize a template. + generator. Use this method if you just want to tokenize a template. + + .. versionchanged:: 3.0 + Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line + breaks. 
""" - lines = source.splitlines() - if self.keep_trailing_newline and source: - if source.endswith(("\r\n", "\r", "\n")): - lines.append("") + lines = newline_re.split(source)[::2] + + if not self.keep_trailing_newline and lines[-1] == "": + del lines[-1] + source = "\n".join(lines) pos = 0 lineno = 1 diff --git a/tests/test_regression.py b/tests/test_regression.py index a49356b3..29caee52 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -745,3 +745,10 @@ End""" tmpl = env.get_template("base") assert tmpl.render() == "42 y" + + +@pytest.mark.parametrize("unicode_char", ["\N{FORM FEED}", "\x85"]) +def test_unicode_whitespace(env, unicode_char): + content = "Lorem ipsum\n" + unicode_char + "\nMore text" + tmpl = env.from_string(content) + assert tmpl.render() == content -- 2.47.2