update urlize docs, clean up code

author David Lord <davidism@gmail.com>

Sat, 30 Jan 2021 21:23:04 +0000 (13:23 -0800)

committer David Lord <davidism@gmail.com>

Sat, 30 Jan 2021 21:38:56 +0000 (13:38 -0800)
author David Lord <davidism@gmail.com>
Sat, 30 Jan 2021 21:23:04 +0000 (13:23 -0800)
committer David Lord <davidism@gmail.com>
Sat, 30 Jan 2021 21:38:56 +0000 (13:38 -0800)
diff --git a/CHANGES.rst b/CHANGES.rst

index b5c4df29f0af9c21473c8de85dbaa2bdce067050..2f5fdeff7ccaf708982c27e1b58c11a863cc42c2 100644 (file)
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -23,6 +23,12 @@ Unreleased
      some point, but not necessarily by the direct child. :issue:`1147`
  -   Deprecate the ``autoescape`` and ``with`` extensions, they are
      built-in to the compiler. :issue:`1203`
+-   The ``urlize`` filter recognizes ``mailto:`` links and takes
+    ``extra_schemes`` (or ``env.policies["urlize.extra_schemes"]``) to
+    recognize other schemes such as ``ftp://`` or ``tel:``. It ignores
+    parentheses around URLs. The URL parsing in general has been
+    updated. URLs without a scheme are linked as ``https://`` instead of
+    ``http://``. :issue:`522, 827, 1172`, :pr:`1195`
  
  
  Version 2.11.2
diff --git a/docs/api.rst b/docs/api.rst

index 91896425413492bf6ba6c251d5ce33f9d4e03f23..f6a41b5d77628487721be46c211dc5980b639613 100644 (file)
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -550,9 +550,9 @@ Example::
      The default target that is issued for links from the `urlize` filter
      if no other target is defined by the call explicitly.
  
-``urlize.additional_uri_schemes``:
-    Additional uri scheme prefixes that will generate links from the
-    `urlize` filter in addition to http://, https://, and mailto:.
+``urlize.extra_schemes``:
+    Recognize URLs that start with these schemes in addition to the
+    default ``http://``, ``https://``, and ``mailto:``.
  
  ``json.dumps_function``:
      If this is set to a value other than `None` then the `tojson` filter
diff --git a/src/jinja2/defaults.py b/src/jinja2/defaults.py

index d582836a01a6235f3eae8104b57444e4e25a8c17..6e72b624599f96f607c78959fd083da46a8a4f8d 100644 (file)
--- a/src/jinja2/defaults.py
+++ b/src/jinja2/defaults.py
@@ -35,7 +35,7 @@ DEFAULT_POLICIES = {
      "compiler.ascii_str": True,
      "urlize.rel": "noopener",
      "urlize.target": None,
-    "urlize.extra_uri_schemes": None,
+    "urlize.extra_schemes": None,
      "truncate.leeway": 5,
      "json.dumps_function": None,
      "json.dumps_kwargs": {"sort_keys": True},
diff --git a/src/jinja2/filters.py b/src/jinja2/filters.py

index 0d1639f6b36e52f9a58b6f13310df1fcda5da314..6940c49c0ac2c051936818840a419d415fa14133 100644 (file)
--- a/src/jinja2/filters.py
+++ b/src/jinja2/filters.py
@@ -20,7 +20,6 @@ from .utils import urlize
  
  _word_re = re.compile(r"\w+")
  _word_beginning_split_re = re.compile(r"([-\s({\[<]+)")
-_uri_scheme_re = re.compile(r"^([\w\.\+-]{2,}:(/){0,2})$")
  
  
  def contextfilter(f):
@@ -568,6 +567,9 @@ def do_pprint(value):
      return pformat(value)
  
  
+_uri_scheme_re = re.compile(r"^([\w.+-]{2,}:(/){0,2})$")
+
+
  @evalcontextfilter
  def do_urlize(
      eval_ctx,
@@ -576,66 +578,75 @@ def do_urlize(
      nofollow=False,
      target=None,
      rel=None,
-    extra_uri_schemes=None,
+    extra_schemes=None,
  ):
-    """Converts URLs in plain text into clickable links.
-
-    If you pass the filter an additional integer it will shorten the urls
-    to that number. Also a third argument exists that makes the urls
-    "nofollow":
-
-    .. sourcecode:: jinja
-
-        {{ mytext|urlize(40, true) }}
-            links are shortened to 40 chars and defined with rel="nofollow"
-
-    If *target* is specified, the ``target`` attribute will be added to the
-    ``<a>`` tag:
-
-    .. sourcecode:: jinja
-
-       {{ mytext|urlize(40, target='_blank') }}
+    """Convert URLs in text into clickable links.
+
+    This may not recognize links in some situations. Usually, a more
+    comprehensive formatter, such as a Markdown library, is a better
+    choice.
+
+    Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email
+    addresses. Links with trailing punctuation (periods, commas, closing
+    parentheses) and leading punctuation (opening parentheses) are
+    recognized excluding the punctuation. Email addresses that include
+    header fields are not recognized (for example,
+    ``mailto:address@example.com?cc=copy@example.com``).
+
+    :param value: Original text containing URLs to link.
+    :param trim_url_limit: Shorten displayed URL values to this length.
+    :param nofollow: Add the ``rel=nofollow`` attribute to links.
+    :param target: Add the ``target`` attribute to links.
+    :param rel: Add the ``rel`` attribute to links.
+    :param extra_schemes: Recognize URLs that start with these schemes
+        in addition to the default behavior. Defaults to
+        ``env.policies["urlize.extra_schemes"]``, which defaults to no
+        extra schemes.
  
-    If *extra_uri_schemes* are added then links will be generated for those
-    in addition to http(s): and mailto: schemes.
+    .. versionchanged:: 3.0
+        The ``extra_schemes`` parameter was added.
  
-    .. sourcecode:: jinja
+    .. versionchanged:: 3.0
+        Generate ``https://`` links for URLs without a scheme.
  
-        {{ mytext|urlize(extra_uri_schemes=['tel:', 'ftp://']) }}
-            links are generated for tel and ftp.
+    .. versionchanged:: 3.0
+        The parsing rules were updated. Recognize email addresses with
+        or without the ``mailto:`` scheme. Validate IP addresses. Ignore
+        parentheses and brackets in more cases.
  
      .. versionchanged:: 2.8
         The ``target`` parameter was added.
-
-    .. versionchanged:: 3.0
-       The ``extra_uri_schemes`` parameter was added.
      """
      policies = eval_ctx.environment.policies
+    rel_parts = set((rel or "").split())
  
-    rel = set((rel or "").split() or [])
      if nofollow:
-        rel.add("nofollow")
-    rel.update((policies["urlize.rel"] or "").split())
-    rel = " ".join(sorted(rel)) or None
+        rel_parts.add("nofollow")
+
+    rel_parts.update((policies["urlize.rel"] or "").split())
+    rel = " ".join(sorted(rel_parts)) or None
  
      if target is None:
          target = policies["urlize.target"]
  
-    if extra_uri_schemes is None:
-        extra_uri_schemes = policies["urlize.extra_uri_schemes"] or []
-    for uri_scheme in extra_uri_schemes:
-        if _uri_scheme_re.fullmatch(uri_scheme) is None:
-            raise FilterArgumentError(f"{uri_scheme} is not a valid URI scheme prefix.")
+    if extra_schemes is None:
+        extra_schemes = policies["urlize.extra_schemes"] or ()
+
+    for scheme in extra_schemes:
+        if _uri_scheme_re.fullmatch(scheme) is None:
+            raise FilterArgumentError(f"{scheme!r} is not a valid URI scheme prefix.")
  
      rv = urlize(
          value,
-        trim_url_limit,
+        trim_url_limit=trim_url_limit,
          rel=rel,
          target=target,
-        extra_uri_schemes=extra_uri_schemes,
+        extra_schemes=extra_schemes,
      )
+
      if eval_ctx.autoescape:
          rv = Markup(rv)
+
      return rv
  
  
diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py

index 9ab5eb048a036292b9950a5285aaf904c4b132ab..538518fb8b5380f1997fa3f531c65e204a2a9eea 100644 (file)
--- a/src/jinja2/utils.py
+++ b/src/jinja2/utils.py
@@ -11,21 +11,6 @@ from urllib.parse import quote_from_bytes
  from markupsafe import escape
  from markupsafe import Markup
  
-_word_split_re = re.compile(r"(\s+)")
-_lead_pattern = "|".join(map(re.escape, ("(", "<", "&lt;")))
-_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
-_punctuation_re = re.compile(
-    fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
-)
-_simple_http_https_re = re.compile(
-    r"^((https?://|www\.)(([\w%-]+\.)+)?([a-z]{2,63}|xn--[\w%]{2,59})|"
-    r"([\w%-]{2,63}\.)+(com|net|int|edu|gov|org|info|mil)|"
-    r"(https?://)((([\d]{1,3})(\.[\d]{1,3}){3})|"
-    r"(\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}\])))"
-    r"(?::[\d]{1,5})?(?:[/?#]\S*)?$",
-    re.IGNORECASE,
-)
-_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
  _striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
  _entity_re = re.compile(r"&([^;]+);")
  _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -183,26 +168,74 @@ def pformat(obj):
      return pformat(obj)
  
  
-def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=None):
-    """Converts any URLs in text into clickable links. Works on http://,
-    https://, www., mailto:, and email links. Links can have trailing
-    punctuation (periods, commas, close-parens) and leading punctuation
-    (opening parens) and it'll still do the right thing.
+_word_split_re = re.compile(r"(\s+)")
+_lead_pattern = "|".join(map(re.escape, ("(", "<", "&lt;")))
+_trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
+_punctuation_re = re.compile(
+    fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
+)
+_http_re = re.compile(
+    r"""
+    ^
+    (
+        (https?://|www\.)  # scheme or www
+        (([\w%-]+\.)+)?  # subdomain
+        (
+            [a-z]{2,63}  # basic tld
+        |
+            xn--[\w%]{2,59}  # idna tld
+        )
+    |
+        ([\w%-]{2,63}\.)+  # basic domain
+        (com|net|int|edu|gov|org|info|mil)  # basic tld
+    |
+        (https?://)  # scheme
+        (
+            (([\d]{1,3})(\.[\d]{1,3}){3})  # IPv4
+        |
+            (\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}])  # IPv6
+        )
+    )
+    (?::[\d]{1,5})?  # port
+    (?:[/?#]\S*)?  # path, query, and fragment
+    $
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
+
  
-    If trim_url_limit is not None, the URLs in link text will be limited
-    to trim_url_limit characters.
+def urlize(text, trim_url_limit=None, rel=None, target=None, extra_schemes=None):
+    """Convert URLs in text into clickable links.
  
-    If nofollow is True, the URLs in link text will get a rel="nofollow"
-    attribute.
+    This may not recognize links in some situations. Usually, a more
+    comprehensive formatter, such as a Markdown library, is a better
+    choice.
  
-    If target is not None, a target attribute will be added to the link.
+    Works on ``http://``, ``https://``, ``www.``, ``mailto:``, and email
+    addresses. Links with trailing punctuation (periods, commas, closing
+    parentheses) and leading punctuation (opening parentheses) are
+    recognized excluding the punctuation. Email addresses that include
+    header fields are not recognized (for example,
+    ``mailto:address@example.com?cc=copy@example.com``).
  
-    Known Limitations:
-    -   Will not urlize emails or mailto: links if they include header fields
-        (for example, mailto:address@example.com?cc=copy@example.com).
+    :param text: Original text containing URLs to link.
+    :param trim_url_limit: Shorten displayed URL values to this length.
+    :param target: Add the ``target`` attribute to links.
+    :param rel: Add the ``rel`` attribute to links.
+    :param extra_schemes: Recognize URLs that start with these schemes
+        in addition to the default behavior.
  
      .. versionchanged:: 3.0
-        Adds limited support for mailto: links
+        The ``extra_schemes`` parameter was added.
+
+    .. versionchanged:: 3.0
+        Generate ``https://`` links for URLs without a scheme.
+
+    .. versionchanged:: 3.0
+        The parsing rules were updated. Recognize email addresses with
+        or without the ``mailto:`` scheme. Validate IP addresses. Ignore
+        parentheses and brackets in more cases.
      """
  
      def trim_url(x, limit=trim_url_limit):
@@ -217,12 +250,15 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
  
      for i, word in enumerate(words):
          match = _punctuation_re.match(word)
+
          if match:
              lead, middle, trail = match.groups()
              # fix for mismatched opening and closing parentheses
              pairs = [("(", ")"), ("<", ">"), ("&lt;", "&gt;")]
+
              for start_char in re.findall(_lead_pattern, middle):
                  end_char = next(c for o, c in pairs if o == start_char)
+
                  while (
                      middle.count(start_char) > middle.count(end_char)
                      and end_char in trail
@@ -231,7 +267,7 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
                      middle = middle + trail[: end_char_index + len(end_char)]
                      trail = trail[end_char_index + len(end_char) :]
  
-            if _simple_http_https_re.match(middle):
+            if _http_re.match(middle):
                  if middle.startswith("https://") or middle.startswith("http://"):
                      middle = (
                          f'<a href="{middle}"{rel_attr}{target_attr}>'
@@ -250,11 +286,13 @@ def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=N
                  and _simple_email_re.match(middle)
              ):
                  middle = f'<a href="mailto:{middle}">{middle}</a>'
+
              if middle.startswith("mailto:") and _simple_email_re.match(middle[7:]):
                  middle = f'<a href="{middle}">{middle[7:]}</a>'
  
-            if extra_uri_schemes is not None:
-                schemes = {x for x in extra_uri_schemes if middle.startswith(x)}
+            if extra_schemes is not None:
+                schemes = {x for x in extra_schemes if middle.startswith(x)}
+
                  for uri_scheme in schemes:
                      if len(middle) > len(uri_scheme):
                          middle = (
diff --git a/tests/test_filters.py b/tests/test_filters.py

index bf00f069237c866a52cb9feff57de002d219fead..c3ff8f3288cfe16d6892469bc1fa5b335538a575 100644 (file)
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -373,10 +373,10 @@ class TestFilter:
              "http://www.example.com/</a> bar"
          )
  
-    def test_urlize_extra_uri_schemes_parameter(self, env):
+    def test_urlize_extra_schemes_parameter(self, env):
          tmpl = env.from_string(
              '{{ "foo tel:+1-514-555-1234 ftp://localhost bar"|'
-            'urlize(extra_uri_schemes=["tel:", "ftp:"]) }}'
+            'urlize(extra_schemes=["tel:", "ftp:"]) }}'
          )
          assert tmpl.render() == (
              'foo <a href="tel:+1-514-555-1234" rel="noopener">'
author	David Lord <davidism@gmail.com>
	Sat, 30 Jan 2021 21:23:04 +0000 (13:23 -0800)
committer	David Lord <davidism@gmail.com>
	Sat, 30 Jan 2021 21:38:56 +0000 (13:38 -0800)
CHANGES.rst		patch \| blob \| blame \| history
docs/api.rst		patch \| blob \| blame \| history
src/jinja2/defaults.py		patch \| blob \| blame \| history
src/jinja2/filters.py		patch \| blob \| blame \| history
src/jinja2/utils.py		patch \| blob \| blame \| history
tests/test_filters.py		patch \| blob \| blame \| history