From: David Lord Date: Sat, 30 Jan 2021 21:23:04 +0000 (-0800) Subject: update urlize docs, clean up code X-Git-Tag: 3.0.0rc1~45^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=be83e7e06cc3b1b72771be96502869c77970d39e;p=thirdparty%2Fjinja.git update urlize docs, clean up code move regexes near implementation commented verbose regex for http pattern renamed extra_uri_schemes to extra_schemes --- diff --git a/CHANGES.rst b/CHANGES.rst index b5c4df29..2f5fdeff 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -23,6 +23,12 @@ Unreleased some point, but not necessarily by the direct child. :issue:`1147` - Deprecate the ``autoescape`` and ``with`` extensions, they are built-in to the compiler. :issue:`1203` +- The ``urlize`` filter recognizes ``mailto:`` links and takes + ``extra_schemes`` (or ``env.policies["urlize.extra_schemes"]``) to + recognize other schemes such as ``ftp://`` or ``tel:``. It ignores + parentheses around URLs. The URL parsing in general has been + updated. URLs without a scheme are linked as ``https://`` instead of + ``http://``. :issue:`522, 827, 1172`, :pr:`1195` Version 2.11.2 diff --git a/docs/api.rst b/docs/api.rst index 91896425..f6a41b5d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -550,9 +550,9 @@ Example:: The default target that is issued for links from the `urlize` filter if no other target is defined by the call explicitly. -``urlize.additional_uri_schemes``: - Additional uri scheme prefixes that will generate links from the - `urlize` filter in addition to http://, https://, and mailto:. +``urlize.extra_schemes``: + Recognize URLs that start with these schemes in addition to the + default ``http://``, ``https://``, and ``mailto:``. 
``json.dumps_function``: If this is set to a value other than `None` then the `tojson` filter diff --git a/src/jinja2/defaults.py b/src/jinja2/defaults.py index d582836a..6e72b624 100644 --- a/src/jinja2/defaults.py +++ b/src/jinja2/defaults.py @@ -35,7 +35,7 @@ DEFAULT_POLICIES = { "compiler.ascii_str": True, "urlize.rel": "noopener", "urlize.target": None, - "urlize.extra_uri_schemes": None, + "urlize.extra_schemes": None, "truncate.leeway": 5, "json.dumps_function": None, "json.dumps_kwargs": {"sort_keys": True}, diff --git a/src/jinja2/filters.py b/src/jinja2/filters.py index 0d1639f6..6940c49c 100644 --- a/src/jinja2/filters.py +++ b/src/jinja2/filters.py @@ -20,7 +20,6 @@ from .utils import urlize _word_re = re.compile(r"\w+") _word_beginning_split_re = re.compile(r"([-\s({\[<]+)") -_uri_scheme_re = re.compile(r"^([\w\.\+-]{2,}:(/){0,2})$") def contextfilter(f): @@ -568,6 +567,9 @@ def do_pprint(value): return pformat(value) +_uri_scheme_re = re.compile(r"^([\w.+-]{2,}:(/){0,2})$") + + @evalcontextfilter def do_urlize( eval_ctx, @@ -576,66 +578,75 @@ def do_urlize( nofollow=False, target=None, rel=None, - extra_uri_schemes=None, + extra_schemes=None, ): - """Converts URLs in plain text into clickable links. - - If you pass the filter an additional integer it will shorten the urls - to that number. Also a third argument exists that makes the urls - "nofollow": - - .. sourcecode:: jinja - - {{ mytext|urlize(40, true) }} - links are shortened to 40 chars and defined with rel="nofollow" - - If *target* is specified, the ``target`` attribute will be added to the - ``<a>`` tag: - - .. sourcecode:: jinja - - {{ mytext|urlize(40, target='_blank') }} + """Convert URLs in text into clickable links. + + This may not recognize links in some situations. Usually, a more + comprehensive formatter, such as a Markdown library, is a better + choice. + + Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email + addresses.
Links with trailing punctuation (periods, commas, closing + parentheses) and leading punctuation (opening parentheses) are + recognized excluding the punctuation. Email addresses that include + header fields are not recognized (for example, + ``mailto:address@example.com?cc=copy@example.com``). + + :param value: Original text containing URLs to link. + :param trim_url_limit: Shorten displayed URL values to this length. + :param nofollow: Add the ``rel=nofollow`` attribute to links. + :param target: Add the ``target`` attribute to links. + :param rel: Add the ``rel`` attribute to links. + :param extra_schemes: Recognize URLs that start with these schemes + in addition to the default behavior. Defaults to + ``env.policies["urlize.extra_schemes"]``, which defaults to no + extra schemes. - If *extra_uri_schemes* are added then links will be generated for those - in addition to http(s): and mailto: schemes. + .. versionchanged:: 3.0 + The ``extra_schemes`` parameter was added. - .. sourcecode:: jinja + .. versionchanged:: 3.0 + Generate ``https://`` links for URLs without a scheme. - {{ mytext|urlize(extra_uri_schemes=['tel:', 'ftp://']) }} - links are generated for tel and ftp. + .. versionchanged:: 3.0 + The parsing rules were updated. Recognize email addresses with + or without the ``mailto:`` scheme. Validate IP addresses. Ignore + parentheses and brackets in more cases. .. versionchanged:: 2.8 The ``target`` parameter was added. - - .. versionchanged:: 3.0 - The ``extra_uri_schemes`` parameter was added. 
""" policies = eval_ctx.environment.policies + rel_parts = set((rel or "").split()) - rel = set((rel or "").split() or []) if nofollow: - rel.add("nofollow") - rel.update((policies["urlize.rel"] or "").split()) - rel = " ".join(sorted(rel)) or None + rel_parts.add("nofollow") + + rel_parts.update((policies["urlize.rel"] or "").split()) + rel = " ".join(sorted(rel_parts)) or None if target is None: target = policies["urlize.target"] - if extra_uri_schemes is None: - extra_uri_schemes = policies["urlize.extra_uri_schemes"] or [] - for uri_scheme in extra_uri_schemes: - if _uri_scheme_re.fullmatch(uri_scheme) is None: - raise FilterArgumentError(f"{uri_scheme} is not a valid URI scheme prefix.") + if extra_schemes is None: + extra_schemes = policies["urlize.extra_schemes"] or () + + for scheme in extra_schemes: + if _uri_scheme_re.fullmatch(scheme) is None: + raise FilterArgumentError(f"{scheme!r} is not a valid URI scheme prefix.") rv = urlize( value, - trim_url_limit, + trim_url_limit=trim_url_limit, rel=rel, target=target, - extra_uri_schemes=extra_uri_schemes, + extra_schemes=extra_schemes, ) + if eval_ctx.autoescape: rv = Markup(rv) + return rv diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py index 9ab5eb04..538518fb 100644 --- a/src/jinja2/utils.py +++ b/src/jinja2/utils.py @@ -11,21 +11,6 @@ from urllib.parse import quote_from_bytes from markupsafe import escape from markupsafe import Markup -_word_split_re = re.compile(r"(\s+)") -_lead_pattern = "|".join(map(re.escape, ("(", "<", "<"))) -_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))) -_punctuation_re = re.compile( - fr"^(?P(?:{_lead_pattern})*)(?P.*?)(?P(?:{_trail_pattern})*)$" -) -_simple_http_https_re = re.compile( - r"^((https?://|www\.)(([\w%-]+\.)+)?([a-z]{2,63}|xn--[\w%]{2,59})|" - r"([\w%-]{2,63}\.)+(com|net|int|edu|gov|org|info|mil)|" - r"(https?://)((([\d]{1,3})(\.[\d]{1,3}){3})|" - r"(\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}\])))" - r"(?::[\d]{1,5})?(?:[/?#]\S*)?$", 
- re.IGNORECASE, -) -_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$") _striptags_re = re.compile(r"(|<[^>]*>)") _entity_re = re.compile(r"&([^;]+);") _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -183,26 +168,74 @@ def pformat(obj): return pformat(obj) -def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=None): - """Converts any URLs in text into clickable links. Works on http://, - https://, www., mailto:, and email links. Links can have trailing - punctuation (periods, commas, close-parens) and leading punctuation - (opening parens) and it'll still do the right thing. +_word_split_re = re.compile(r"(\s+)") +_lead_pattern = "|".join(map(re.escape, ("(", "<", "<"))) +_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))) +_punctuation_re = re.compile( + fr"^(?P(?:{_lead_pattern})*)(?P.*?)(?P(?:{_trail_pattern})*)$" +) +_http_re = re.compile( + r""" + ^ + ( + (https?://|www\.) # scheme or www + (([\w%-]+\.)+)? # subdomain + ( + [a-z]{2,63} # basic tld + | + xn--[\w%]{2,59} # idna tld + ) + | + ([\w%-]{2,63}\.)+ # basic domain + (com|net|int|edu|gov|org|info|mil) # basic tld + | + (https?://) # scheme + ( + (([\d]{1,3})(\.[\d]{1,3}){3}) # IPv4 + | + (\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}]) # IPv6 + ) + ) + (?::[\d]{1,5})? # port + (?:[/?#]\S*)? # path, query, and fragment + $ + """, + re.IGNORECASE | re.VERBOSE, +) +_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$") + - If trim_url_limit is not None, the URLs in link text will be limited - to trim_url_limit characters. +def urlize(text, trim_url_limit=None, rel=None, target=None, extra_schemes=None): + """Convert URLs in text into clickable links. - If nofollow is True, the URLs in link text will get a rel="nofollow" - attribute. + This may not recognize links in some situations. Usually, a more + comprehensive formatter, such as a Markdown library, is a better + choice. 
- If target is not None, a target attribute will be added to the link. + Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email + addresses. Links with trailing punctuation (periods, commas, closing + parentheses) and leading punctuation (opening parentheses) are + recognized excluding the punctuation. Email addresses that include + header fields are not recognized (for example, + ``mailto:address@example.com?cc=copy@example.com``). - Known Limitations: - - Will not urlize emails or mailto: links if they include header fields - (for example, mailto:address@example.com?cc=copy@example.com). + :param text: Original text containing URLs to link. + :param trim_url_limit: Shorten displayed URL values to this length. + :param target: Add the ``target`` attribute to links. + :param rel: Add the ``rel`` attribute to links. + :param extra_schemes: Recognize URLs that start with these schemes + in addition to the default behavior. .. versionchanged:: 3.0 - Adds limited support for mailto: links + The ``extra_schemes`` parameter was added. + + .. versionchanged:: 3.0 + Generate ``https://`` links for URLs without a scheme. + + .. versionchanged:: 3.0 + The parsing rules were updated. Recognize email addresses with + or without the ``mailto:`` scheme. Validate IP addresses. Ignore + parentheses and brackets in more cases. 
""" def trim_url(x, limit=trim_url_limit): @@ -217,12 +250,15 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N for i, word in enumerate(words): match = _punctuation_re.match(word) + if match: lead, middle, trail = match.groups() # fix for mismatched opening and closing parentheses pairs = [("(", ")"), ("<", ">"), ("<", ">")] + for start_char in re.findall(_lead_pattern, middle): end_char = next(c for o, c in pairs if o == start_char) + while ( middle.count(start_char) > middle.count(end_char) and end_char in trail @@ -231,7 +267,7 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N middle = middle + trail[: end_char_index + len(end_char)] trail = trail[end_char_index + len(end_char) :] - if _simple_http_https_re.match(middle): + if _http_re.match(middle): if middle.startswith("https://") or middle.startswith("http://"): middle = ( f'' @@ -250,11 +286,13 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N and _simple_email_re.match(middle) ): middle = f'{middle}' + if middle.startswith("mailto:") and _simple_email_re.match(middle[7:]): middle = f'{middle[7:]}' - if extra_uri_schemes is not None: - schemes = {x for x in extra_uri_schemes if middle.startswith(x)} + if extra_schemes is not None: + schemes = {x for x in extra_schemes if middle.startswith(x)} + for uri_scheme in schemes: if len(middle) > len(uri_scheme): middle = ( diff --git a/tests/test_filters.py b/tests/test_filters.py index bf00f069..c3ff8f32 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -373,10 +373,10 @@ class TestFilter: "http://www.example.com/ bar" ) - def test_urlize_extra_uri_schemes_parameter(self, env): + def test_urlize_extra_schemes_parameter(self, env): tmpl = env.from_string( '{{ "foo tel:+1-514-555-1234 ftp://localhost bar"|' - 'urlize(extra_uri_schemes=["tel:", "ftp:"]) }}' + 'urlize(extra_schemes=["tel:", "ftp:"]) }}' ) assert tmpl.render() == ( 'foo '