From: David Lord <davidism@gmail.com>
Date: Sat, 30 Jan 2021 21:23:04 +0000 (-0800)
Subject: update urlize docs, clean up code
X-Git-Tag: 3.0.0rc1~45^2~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=be83e7e06cc3b1b72771be96502869c77970d39e;p=thirdparty%2Fjinja.git

update urlize docs, clean up code

move regexes near implementation
commented verbose regex for http pattern
renamed extra_uri_schemes to extra_schemes
---

diff --git a/CHANGES.rst b/CHANGES.rst
index b5c4df29..2f5fdeff 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -23,6 +23,12 @@ Unreleased
     some point, but not necessarily by the direct child. :issue:`1147`
 -   Deprecate the ``autoescape`` and ``with`` extensions, they are
     built-in to the compiler. :issue:`1203`
+-   The ``urlize`` filter recognizes ``mailto:`` links and takes
+    ``extra_schemes`` (or ``env.policies["urlize.extra_schemes"]``) to
+    recognize other schemes such as ``ftp://`` or ``tel:``. It ignores
+    parentheses around URLs. The URL parsing in general has been
+    updated. URLs without a scheme are linked as ``https://`` instead of
+    ``http://``. :issue:`522, 827, 1172`, :pr:`1195`
 
 
 Version 2.11.2
diff --git a/docs/api.rst b/docs/api.rst
index 91896425..f6a41b5d 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -550,9 +550,9 @@ Example::
     The default target that is issued for links from the `urlize` filter
     if no other target is defined by the call explicitly.
 
-``urlize.additional_uri_schemes``:
-    Additional uri scheme prefixes that will generate links from the
-    `urlize` filter in addition to http://, https://, and mailto:.
+``urlize.extra_schemes``:
+    Recognize URLs that start with these schemes in addition to the
+    default ``http://``, ``https://``, and ``mailto:``.
 
 ``json.dumps_function``:
     If this is set to a value other than `None` then the `tojson` filter
diff --git a/src/jinja2/defaults.py b/src/jinja2/defaults.py
index d582836a..6e72b624 100644
--- a/src/jinja2/defaults.py
+++ b/src/jinja2/defaults.py
@@ -35,7 +35,7 @@ DEFAULT_POLICIES = {
     "compiler.ascii_str": True,
     "urlize.rel": "noopener",
     "urlize.target": None,
-    "urlize.extra_uri_schemes": None,
+    "urlize.extra_schemes": None,
     "truncate.leeway": 5,
     "json.dumps_function": None,
     "json.dumps_kwargs": {"sort_keys": True},
diff --git a/src/jinja2/filters.py b/src/jinja2/filters.py
index 0d1639f6..6940c49c 100644
--- a/src/jinja2/filters.py
+++ b/src/jinja2/filters.py
@@ -20,7 +20,6 @@ from .utils import urlize
 
 _word_re = re.compile(r"\w+")
 _word_beginning_split_re = re.compile(r"([-\s({\[<]+)")
-_uri_scheme_re = re.compile(r"^([\w\.\+-]{2,}:(/){0,2})$")
 
 
 def contextfilter(f):
@@ -568,6 +567,9 @@ def do_pprint(value):
     return pformat(value)
 
 
+_uri_scheme_re = re.compile(r"^([\w.+-]{2,}:(/){0,2})$")
+
+
 @evalcontextfilter
 def do_urlize(
     eval_ctx,
@@ -576,66 +578,75 @@ def do_urlize(
     nofollow=False,
     target=None,
     rel=None,
-    extra_uri_schemes=None,
+    extra_schemes=None,
 ):
-    """Converts URLs in plain text into clickable links.
-
-    If you pass the filter an additional integer it will shorten the urls
-    to that number. Also a third argument exists that makes the urls
-    "nofollow":
-
-    .. sourcecode:: jinja
-
-        {{ mytext|urlize(40, true) }}
-            links are shortened to 40 chars and defined with rel="nofollow"
-
-    If *target* is specified, the ``target`` attribute will be added to the
-    ``<a>`` tag:
-
-    .. sourcecode:: jinja
-
-       {{ mytext|urlize(40, target='_blank') }}
+    """Convert URLs in text into clickable links.
+
+    This may not recognize links in some situations. Usually, a more
+    comprehensive formatter, such as a Markdown library, is a better
+    choice.
+
+    Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email
+    addresses. Links with trailing punctuation (periods, commas, closing
+    parentheses) and leading punctuation (opening parentheses) are
+    recognized excluding the punctuation. Email addresses that include
+    header fields are not recognized (for example,
+    ``mailto:address@example.com?cc=copy@example.com``).
+
+    :param value: Original text containing URLs to link.
+    :param trim_url_limit: Shorten displayed URL values to this length.
+    :param nofollow: Add the ``rel=nofollow`` attribute to links.
+    :param target: Add the ``target`` attribute to links.
+    :param rel: Add the ``rel`` attribute to links.
+    :param extra_schemes: Recognize URLs that start with these schemes
+        in addition to the default behavior. Defaults to
+        ``env.policies["urlize.extra_schemes"]``, which defaults to no
+        extra schemes.
 
-    If *extra_uri_schemes* are added then links will be generated for those
-    in addition to http(s): and mailto: schemes.
+    .. versionchanged:: 3.0
+        The ``extra_schemes`` parameter was added.
 
-    .. sourcecode:: jinja
+    .. versionchanged:: 3.0
+        Generate ``https://`` links for URLs without a scheme.
 
-        {{ mytext|urlize(extra_uri_schemes=['tel:', 'ftp://']) }}
-            links are generated for tel and ftp.
+    .. versionchanged:: 3.0
+        The parsing rules were updated. Recognize email addresses with
+        or without the ``mailto:`` scheme. Validate IP addresses. Ignore
+        parentheses and brackets in more cases.
 
     .. versionchanged:: 2.8
        The ``target`` parameter was added.
-
-    .. versionchanged:: 3.0
-       The ``extra_uri_schemes`` parameter was added.
     """
     policies = eval_ctx.environment.policies
+    rel_parts = set((rel or "").split())
 
-    rel = set((rel or "").split() or [])
     if nofollow:
-        rel.add("nofollow")
-    rel.update((policies["urlize.rel"] or "").split())
-    rel = " ".join(sorted(rel)) or None
+        rel_parts.add("nofollow")
+
+    rel_parts.update((policies["urlize.rel"] or "").split())
+    rel = " ".join(sorted(rel_parts)) or None
 
     if target is None:
         target = policies["urlize.target"]
 
-    if extra_uri_schemes is None:
-        extra_uri_schemes = policies["urlize.extra_uri_schemes"] or []
-    for uri_scheme in extra_uri_schemes:
-        if _uri_scheme_re.fullmatch(uri_scheme) is None:
-            raise FilterArgumentError(f"{uri_scheme} is not a valid URI scheme prefix.")
+    if extra_schemes is None:
+        extra_schemes = policies["urlize.extra_schemes"] or ()
+
+    for scheme in extra_schemes:
+        if _uri_scheme_re.fullmatch(scheme) is None:
+            raise FilterArgumentError(f"{scheme!r} is not a valid URI scheme prefix.")
 
     rv = urlize(
         value,
-        trim_url_limit,
+        trim_url_limit=trim_url_limit,
         rel=rel,
         target=target,
-        extra_uri_schemes=extra_uri_schemes,
+        extra_schemes=extra_schemes,
     )
+
     if eval_ctx.autoescape:
         rv = Markup(rv)
+
     return rv
 
 
diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py
index 9ab5eb04..538518fb 100644
--- a/src/jinja2/utils.py
+++ b/src/jinja2/utils.py
@@ -11,21 +11,6 @@ from urllib.parse import quote_from_bytes
 from markupsafe import escape
 from markupsafe import Markup
 
-_word_split_re = re.compile(r"(\s+)")
-_lead_pattern = "|".join(map(re.escape, ("(", "<", "&lt;")))
-_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
-_punctuation_re = re.compile(
-    fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
-)
-_simple_http_https_re = re.compile(
-    r"^((https?://|www\.)(([\w%-]+\.)+)?([a-z]{2,63}|xn--[\w%]{2,59})|"
-    r"([\w%-]{2,63}\.)+(com|net|int|edu|gov|org|info|mil)|"
-    r"(https?://)((([\d]{1,3})(\.[\d]{1,3}){3})|"
-    r"(\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}\])))"
-    r"(?::[\d]{1,5})?(?:[/?#]\S*)?$",
-    re.IGNORECASE,
-)
-_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
 _striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
 _entity_re = re.compile(r"&([^;]+);")
 _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -183,26 +168,74 @@ def pformat(obj):
     return pformat(obj)
 
 
-def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=None):
-    """Converts any URLs in text into clickable links. Works on http://,
-    https://, www., mailto:, and email links. Links can have trailing
-    punctuation (periods, commas, close-parens) and leading punctuation
-    (opening parens) and it'll still do the right thing.
+_word_split_re = re.compile(r"(\s+)")
+_lead_pattern = "|".join(map(re.escape, ("(", "<", "&lt;")))
+_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
+_punctuation_re = re.compile(
+    fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
+)
+_http_re = re.compile(
+    r"""
+    ^
+    (
+        (https?://|www\.)  # scheme or www
+        (([\w%-]+\.)+)?  # subdomain
+        (
+            [a-z]{2,63}  # basic tld
+        |
+            xn--[\w%]{2,59}  # idna tld
+        )
+    |
+        ([\w%-]{2,63}\.)+  # basic domain
+        (com|net|int|edu|gov|org|info|mil)  # basic tld
+    |
+        (https?://)  # scheme
+        (
+            (([\d]{1,3})(\.[\d]{1,3}){3})  # IPv4
+        |
+            (\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}])  # IPv6
+        )
+    )
+    (?::[\d]{1,5})?  # port
+    (?:[/?#]\S*)?  # path, query, and fragment
+    $
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
+
 
-    If trim_url_limit is not None, the URLs in link text will be limited
-    to trim_url_limit characters.
+def urlize(text, trim_url_limit=None, rel=None, target=None, extra_schemes=None):
+    """Convert URLs in text into clickable links.
 
-    If nofollow is True, the URLs in link text will get a rel="nofollow"
-    attribute.
+    This may not recognize links in some situations. Usually, a more
+    comprehensive formatter, such as a Markdown library, is a better
+    choice.
 
-    If target is not None, a target attribute will be added to the link.
+    Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email
+    addresses. Links with trailing punctuation (periods, commas, closing
+    parentheses) and leading punctuation (opening parentheses) are
+    recognized excluding the punctuation. Email addresses that include
+    header fields are not recognized (for example,
+    ``mailto:address@example.com?cc=copy@example.com``).
 
-    Known Limitations:
-    -   Will not urlize emails or mailto: links if they include header fields
-        (for example, mailto:address@example.com?cc=copy@example.com).
+    :param text: Original text containing URLs to link.
+    :param trim_url_limit: Shorten displayed URL values to this length.
+    :param rel: Add the ``rel`` attribute to links.
+    :param target: Add the ``target`` attribute to links.
+    :param extra_schemes: Recognize URLs that start with these schemes
+        in addition to the default behavior.
 
     .. versionchanged:: 3.0
-        Adds limited support for mailto: links
+        The ``extra_schemes`` parameter was added.
+
+    .. versionchanged:: 3.0
+        Generate ``https://`` links for URLs without a scheme.
+
+    .. versionchanged:: 3.0
+        The parsing rules were updated. Recognize email addresses with
+        or without the ``mailto:`` scheme. Validate IP addresses. Ignore
+        parentheses and brackets in more cases.
     """
 
     def trim_url(x, limit=trim_url_limit):
@@ -217,12 +250,15 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
 
     for i, word in enumerate(words):
         match = _punctuation_re.match(word)
+
         if match:
             lead, middle, trail = match.groups()
             # fix for mismatched opening and closing parentheses
             pairs = [("(", ")"), ("<", ">"), ("&lt;", "&gt;")]
+
             for start_char in re.findall(_lead_pattern, middle):
                 end_char = next(c for o, c in pairs if o == start_char)
+
                 while (
                     middle.count(start_char) > middle.count(end_char)
                     and end_char in trail
@@ -231,7 +267,7 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
                     middle = middle + trail[: end_char_index + len(end_char)]
                     trail = trail[end_char_index + len(end_char) :]
 
-            if _simple_http_https_re.match(middle):
+            if _http_re.match(middle):
                 if middle.startswith("https://") or middle.startswith("http://"):
                     middle = (
                         f'<a href="{middle}"{rel_attr}{target_attr}>'
@@ -250,11 +286,13 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
                 and _simple_email_re.match(middle)
             ):
                 middle = f'<a href="mailto:{middle}">{middle}</a>'
+
             if middle.startswith("mailto:") and _simple_email_re.match(middle[7:]):
                 middle = f'<a href="{middle}">{middle[7:]}</a>'
 
-            if extra_uri_schemes is not None:
-                schemes = {x for x in extra_uri_schemes if middle.startswith(x)}
+            if extra_schemes is not None:
+                schemes = {x for x in extra_schemes if middle.startswith(x)}
+
                 for uri_scheme in schemes:
                     if len(middle) > len(uri_scheme):
                         middle = (
diff --git a/tests/test_filters.py b/tests/test_filters.py
index bf00f069..c3ff8f32 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -373,10 +373,10 @@ class TestFilter:
             "http://www.example.com/</a> bar"
         )
 
-    def test_urlize_extra_uri_schemes_parameter(self, env):
+    def test_urlize_extra_schemes_parameter(self, env):
         tmpl = env.from_string(
             '{{ "foo tel:+1-514-555-1234 ftp://localhost bar"|'
-            'urlize(extra_uri_schemes=["tel:", "ftp:"]) }}'
+            'urlize(extra_schemes=["tel:", "ftp:"]) }}'
         )
         assert tmpl.render() == (
             'foo <a href="tel:+1-514-555-1234" rel="noopener">'