From: Peter Dolak
Date: Sun, 27 May 2018 09:41:20 +0000 (+0200)
Subject: Fix lexer being extremely slow on large amounts of whitespace. Fixes #857
X-Git-Tag: 2.11.0~53^2~2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7d00a40465c89bee141ab5a3db545a20e7d30509;p=thirdparty%2Fjinja.git

Fix lexer being extremely slow on large amounts of whitespace. Fixes #857
---

diff --git a/jinja2/lexer.py b/jinja2/lexer.py
index 6fd135dd..1ef63946 100644
--- a/jinja2/lexer.py
+++ b/jinja2/lexer.py
@@ -409,6 +409,11 @@ def get_lexer(environment):
     return lexer
 
 
+class OptionalLStrip(tuple):
+    def __new__(cls, *members, **kwargs):
+        return super(OptionalLStrip, cls).__new__(cls, members)
+
+
 class Lexer(object):
     """Class that implements a lexer for a given environment.  Automatically
     created by the environment class, usually you don't have to do that.
@@ -443,41 +448,9 @@ class Lexer(object):
         # block suffix if trimming is enabled
         block_suffix_re = environment.trim_blocks and '\\n?' or ''
 
-        # strip leading spaces if lstrip_blocks is enabled
-        prefix_re = {}
-        if environment.lstrip_blocks:
-            # use '{%+' to manually disable lstrip_blocks behavior
-            no_lstrip_re = e('+')
-            # detect overlap between block and variable or comment strings
-            block_diff = c(r'^%s(.*)' % e(environment.block_start_string))
-            # make sure we don't mistake a block for a variable or a comment
-            m = block_diff.match(environment.comment_start_string)
-            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''
-            m = block_diff.match(environment.variable_start_string)
-            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''
-
-            # detect overlap between comment and variable strings
-            comment_diff = c(r'^%s(.*)' % e(environment.comment_start_string))
-            m = comment_diff.match(environment.variable_start_string)
-            no_variable_re = m and r'(?!%s)' % e(m.group(1)) or ''
-
-            lstrip_re = r'^[ \t]*'
-            block_prefix_re = r'%s%s(?!%s)|%s\+?' % (
-                    lstrip_re,
-                    e(environment.block_start_string),
-                    no_lstrip_re,
-                    e(environment.block_start_string),
-                    )
-            comment_prefix_re = r'%s%s%s|%s\+?' % (
-                    lstrip_re,
-                    e(environment.comment_start_string),
-                    no_variable_re,
-                    e(environment.comment_start_string),
-                    )
-            prefix_re['block'] = block_prefix_re
-            prefix_re['comment'] = comment_prefix_re
-        else:
-            block_prefix_re = '%s' % e(environment.block_start_string)
+        # strip leading spaces in blocks/comments if lstrip_blocks is enabled
+        # use for example '{%+' to disable lstrip_blocks behavior
+        self.lstrip_blocks_unless_re = c(r'[^ \t]') if environment.lstrip_blocks else None
 
         self.newline_sequence = environment.newline_sequence
         self.keep_trailing_newline = environment.keep_trailing_newline
@@ -487,15 +460,14 @@ class Lexer(object):
             'root': [
                 # directives
                 (c('(.*?)(?:%s)' % '|'.join(
-                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
+                    [r'(?P<raw_begin>%s(\-|\+|)\s*raw\s*(?:\-%s\s*|%s))' % (
                         e(environment.block_start_string),
-                        block_prefix_re,
                         e(environment.block_end_string),
                         e(environment.block_end_string)
                     )] + [
-                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, prefix_re.get(n,r))
+                        r'(?P<%s_begin>%s(\-|\+|))' % (n, r)
                         for n, r in root_tag_rules
-                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
+                    ])), OptionalLStrip(TOKEN_DATA, '#bygroup'), '#bygroup'),
                 # data
                 (c('.+'), TOKEN_DATA, None)
             ],
@@ -525,13 +497,12 @@ class Lexer(object):
             ] + tag_rules,
             # raw block
             TOKEN_RAW_BEGIN: [
-                (c(r'(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
+                (c(r'(.*?)((?:%s(\-|\+|))\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                     e(environment.block_start_string),
-                    block_prefix_re,
                     e(environment.block_end_string),
                     e(environment.block_end_string),
                     block_suffix_re
-                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
+                )), OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                 (c('(.)'), (Failure('Missing end of raw directive'),), None)
             ],
             # line statements
@@ -600,6 +571,7 @@ class Lexer(object):
         """This method tokenizes the text and returns the tokens in a
         generator.  Use this method if you just want to tokenize a template.
         """
+        neg_lstrip_re = self.lstrip_blocks_unless_re
         source = text_type(source)
         lines = source.splitlines()
         if self.keep_trailing_newline and source:
@@ -640,6 +612,21 @@ class Lexer(object):
 
                 # tuples support more options
                 if isinstance(tokens, tuple):
+                    groups = m.groups()
+
+                    if isinstance(tokens, OptionalLStrip):
+                        # state supports lstrip, determine override (-/+)
+                        strip_sign = next(group for group in groups[2::2]
+                                          if group is not None)
+                        if strip_sign == '-':
+                            groups = (groups[0].rstrip(),) + groups[1:]
+                        elif strip_sign != '+' and neg_lstrip_re is not None \
+                                and not m.groupdict().get('variable_begin'):
+                            # no override, but lstrip_blocks enabled
+                            l_pos = groups[0].rfind('\n') + 1
+                            if not neg_lstrip_re.search(groups[0], l_pos):
+                                groups = (groups[0][:l_pos],) + groups[1:]
+
                     for idx, token in enumerate(tokens):
                         # failure group
                         if token.__class__ is Failure:
@@ -660,7 +647,7 @@ class Lexer(object):
                                                    % regex)
                         # normal group
                         else:
-                            data = m.group(idx + 1)
+                            data = groups[idx]
                             if data or token not in ignore_if_empty:
                                 yield lineno, token, data
                             lineno += data.count('\n')
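
Why this fixes the slowdown: the old root rules reached block tags through
whitespace-consuming prefixes such as '\s*{%-' and '^[ \t]*{%' (after the
'%s' placeholders are filled in), so for every position the lazy '(.*?)'
group tried, the regex engine re-scanned the entire preceding whitespace
run, which is quadratic in the length of that run. The new rules match the
tag delimiter directly and move the stripping into plain string operations
performed once per successful match.

A minimal stand-alone sketch of that post-match logic (hypothetical helper
name, not part of Jinja2's API; assumes the lstrip_blocks semantics shown
in the diff above):

    import re

    # mirrors self.lstrip_blocks_unless_re from the patch: matches any
    # character that is not a plain space or tab
    LSTRIP_UNLESS_RE = re.compile(r'[^ \t]')

    def lstrip_preceding_data(data, strip_sign, lstrip_blocks=True):
        # data is the text captured before a block/comment tag;
        # strip_sign is what the new (\-|\+|) group matched:
        # '-' forces stripping, '+' disables it, '' falls back to
        # the lstrip_blocks setting
        if strip_sign == '-':
            # explicit whitespace control: drop all trailing whitespace
            return data.rstrip()
        if strip_sign != '+' and lstrip_blocks:
            # position just after the last newline, i.e. the start of
            # the line the tag sits on
            l_pos = data.rfind('\n') + 1
            # strip that line only if it holds nothing but spaces/tabs
            if not LSTRIP_UNLESS_RE.search(data, l_pos):
                return data[:l_pos]
        return data

For example, lstrip_preceding_data('foo\n    ', '') returns 'foo\n', while
lstrip_preceding_data('foo\n  x ', '') leaves the data untouched because
the tag's line contains non-whitespace. Each matched tag now costs at most
one linear scan, instead of one scan per candidate position.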