Improve and extend urlize

author Bebleo <james.warne@outlook.com>

Sun, 19 Apr 2020 09:42:12 +0000 (05:42 -0400)

committer David Lord <davidism@gmail.com>

Sat, 30 Jan 2021 14:25:03 +0000 (06:25 -0800)
author Bebleo <james.warne@outlook.com>
Sun, 19 Apr 2020 09:42:12 +0000 (05:42 -0400)
committer David Lord <davidism@gmail.com>
Sat, 30 Jan 2021 14:25:03 +0000 (06:25 -0800)
diff --git a/docs/api.rst b/docs/api.rst

index ec083a8a6f33ae1198d059cd5ca7d4a518091091..91896425413492bf6ba6c251d5ce33f9d4e03f23 100644 (file)
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -550,6 +550,10 @@ Example::
      The default target that is issued for links from the `urlize` filter
      if no other target is defined by the call explicitly.
  
+``urlize.extra_uri_schemes``:
+    Additional URI scheme prefixes that will generate links from the
+    `urlize` filter in addition to http://, https://, and mailto:.
+
  ``json.dumps_function``:
      If this is set to a value other than `None` then the `tojson` filter
      will dump with this function instead of the default one.  Note that
diff --git a/src/jinja2/defaults.py b/src/jinja2/defaults.py

index 1f0b0ab00db25f3822276c7304035de082fd9772..d582836a01a6235f3eae8104b57444e4e25a8c17 100644 (file)
--- a/src/jinja2/defaults.py
+++ b/src/jinja2/defaults.py
@@ -35,6 +35,7 @@ DEFAULT_POLICIES = {
      "compiler.ascii_str": True,
      "urlize.rel": "noopener",
      "urlize.target": None,
+    "urlize.extra_uri_schemes": None,
      "truncate.leeway": 5,
      "json.dumps_function": None,
      "json.dumps_kwargs": {"sort_keys": True},
diff --git a/src/jinja2/filters.py b/src/jinja2/filters.py

index 7a554a0e94f234bc99f7b287a220b031ff0bc168..0d1639f6b36e52f9a58b6f13310df1fcda5da314 100644 (file)
--- a/src/jinja2/filters.py
+++ b/src/jinja2/filters.py
@@ -20,6 +20,7 @@ from .utils import urlize
  
  _word_re = re.compile(r"\w+")
  _word_beginning_split_re = re.compile(r"([-\s({\[<]+)")
+_uri_scheme_re = re.compile(r"^([\w\.\+-]{2,}:(/){0,2})$")
  
  
  def contextfilter(f):
@@ -569,7 +570,13 @@ def do_pprint(value):
  
  @evalcontextfilter
  def do_urlize(
-    eval_ctx, value, trim_url_limit=None, nofollow=False, target=None, rel=None
+    eval_ctx,
+    value,
+    trim_url_limit=None,
+    nofollow=False,
+    target=None,
+    rel=None,
+    extra_uri_schemes=None,
  ):
      """Converts URLs in plain text into clickable links.
  
@@ -589,18 +596,44 @@ def do_urlize(
  
         {{ mytext|urlize(40, target='_blank') }}
  
+    If *extra_uri_schemes* is given, links are also generated for URLs with
+    those scheme prefixes, in addition to http://, https://, and mailto:.
+
+    .. sourcecode:: jinja
+
+        {{ mytext|urlize(extra_uri_schemes=['tel:', 'ftp://']) }}
+            links are generated for tel and ftp.
+
      .. versionchanged:: 2.8
         The ``target`` parameter was added.
+
+    .. versionchanged:: 3.0
+       The ``extra_uri_schemes`` parameter was added.
      """
      policies = eval_ctx.environment.policies
+
      rel = set((rel or "").split() or [])
      if nofollow:
          rel.add("nofollow")
      rel.update((policies["urlize.rel"] or "").split())
+    rel = " ".join(sorted(rel)) or None
+
      if target is None:
          target = policies["urlize.target"]
-    rel = " ".join(sorted(rel)) or None
-    rv = urlize(value, trim_url_limit, rel=rel, target=target)
+
+    if extra_uri_schemes is None:
+        extra_uri_schemes = policies["urlize.extra_uri_schemes"] or []
+    for uri_scheme in extra_uri_schemes:
+        if _uri_scheme_re.fullmatch(uri_scheme) is None:
+            raise FilterArgumentError(f"{uri_scheme} is not a valid URI scheme prefix.")
+
+    rv = urlize(
+        value,
+        trim_url_limit,
+        rel=rel,
+        target=target,
+        extra_uri_schemes=extra_uri_schemes,
+    )
      if eval_ctx.autoescape:
          rv = Markup(rv)
      return rv
diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py

index 8ee029580ff120e28b6203513d2fb1dd4105f1d8..9ab5eb048a036292b9950a5285aaf904c4b132ab 100644 (file)
--- a/src/jinja2/utils.py
+++ b/src/jinja2/utils.py
@@ -17,6 +17,14 @@ _trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
  _punctuation_re = re.compile(
      fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
  )
+_simple_http_https_re = re.compile(
+    r"^((https?://|www\.)(([\w%-]+\.)+)?([a-z]{2,63}|xn--[\w%]{2,59})|"
+    r"([\w%-]{2,63}\.)+(com|net|int|edu|gov|org|info|mil)|"
+    r"(https?://)((([\d]{1,3})(\.[\d]{1,3}){3})|"
+    r"(\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}\])))"
+    r"(?::[\d]{1,5})?(?:[/?#]\S*)?$",
+    re.IGNORECASE,
+)
  _simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
  _striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
  _entity_re = re.compile(r"&([^;]+);")
@@ -175,11 +183,11 @@ def pformat(obj):
      return pformat(obj)
  
  
-def urlize(text, trim_url_limit=None, rel=None, target=None):
+def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=None):
      """Converts any URLs in text into clickable links. Works on http://,
-    https:// and www. links. Links can have trailing punctuation (periods,
-    commas, close-parens) and leading punctuation (opening parens) and
-    it'll still do the right thing.
+    https://, www., mailto:, and email links. Links can have trailing
+    punctuation (periods, commas, close-parens) and leading punctuation
+    (opening parens) and it'll still do the right thing.
  
      If trim_url_limit is not None, the URLs in link text will be limited
      to trim_url_limit characters.
@@ -188,6 +196,13 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
      attribute.
  
      If target is not None, a target attribute will be added to the link.
+
+    Known Limitations:
+    -   Will not urlize emails or mailto: links if they include header fields
+        (for example, mailto:address@example.com?cc=copy@example.com).
+
+    .. versionchanged:: 3.0
+        Adds limited support for mailto: links
      """
  
      def trim_url(x, limit=trim_url_limit):
@@ -204,26 +219,30 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
          match = _punctuation_re.match(word)
          if match:
              lead, middle, trail = match.groups()
-            if middle.startswith("www.") or (
-                "@" not in middle
-                and not middle.startswith("http://")
-                and not middle.startswith("https://")
-                and len(middle) > 0
-                and middle[0] in _letters + _digits
-                and (
-                    middle.endswith(".org")
-                    or middle.endswith(".net")
-                    or middle.endswith(".com")
-                )
-            ):
-                middle = (
-                    f'<a href="http://{middle}"{rel_attr}{target_attr}>'
-                    f"{trim_url(middle)}</a>"
-                )
-            if middle.startswith("http://") or middle.startswith("https://"):
-                middle = (
-                    f'<a href="{middle}"{rel_attr}{target_attr}>{trim_url(middle)}</a>'
-                )
+            # fix for mismatched opening and closing parentheses
+            pairs = [("(", ")"), ("<", ">"), ("&lt;", "&gt;")]
+            for start_char in re.findall(_lead_pattern, middle):
+                end_char = next(c for o, c in pairs if o == start_char)
+                while (
+                    middle.count(start_char) > middle.count(end_char)
+                    and end_char in trail
+                ):
+                    end_char_index = trail.index(end_char)
+                    middle = middle + trail[: end_char_index + len(end_char)]
+                    trail = trail[end_char_index + len(end_char) :]
+
+            if _simple_http_https_re.match(middle):
+                if middle.startswith("https://") or middle.startswith("http://"):
+                    middle = (
+                        f'<a href="{middle}"{rel_attr}{target_attr}>'
+                        f"{trim_url(middle)}</a>"
+                    )
+                else:
+                    middle = (
+                        f'<a href="https://{middle}"{rel_attr}{target_attr}>'
+                        f"{trim_url(middle)}</a>"
+                    )
+
              if (
                  "@" in middle
                  and not middle.startswith("www.")
@@ -231,8 +250,21 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
                  and _simple_email_re.match(middle)
              ):
                  middle = f'<a href="mailto:{middle}">{middle}</a>'
+            if middle.startswith("mailto:") and _simple_email_re.match(middle[7:]):
+                middle = f'<a href="{middle}">{middle[7:]}</a>'
+
+            if extra_uri_schemes is not None:
+                schemes = {x for x in extra_uri_schemes if middle.startswith(x)}
+                for uri_scheme in schemes:
+                    if len(middle) > len(uri_scheme):
+                        middle = (
+                            f'<a href="{middle}"{rel_attr}{target_attr}>'
+                            f"{middle}</a>"
+                        )
+
              if lead + middle + trail != word:
                  words[i] = lead + middle + trail
+
      return "".join(words)
  
  
diff --git a/tests/test_filters.py b/tests/test_filters.py

index 8087a248148a8c25c7bdcb3e36491c584ad7813d..bf00f069237c866a52cb9feff57de002d219fead 100644 (file)
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -337,11 +337,23 @@ class TestFilter:
          assert tmpl.render() == "FOO"
  
      def test_urlize(self, env):
+        tmpl = env.from_string('{{ "foo example.org bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="https://example.org" rel="noopener">' "example.org</a> bar"
+        )
          tmpl = env.from_string('{{ "foo http://www.example.com/ bar"|urlize }}')
          assert tmpl.render() == (
              'foo <a href="http://www.example.com/" rel="noopener">'
              "http://www.example.com/</a> bar"
          )
+        tmpl = env.from_string('{{ "foo mailto:email@example.com bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="mailto:email@example.com">email@example.com</a> bar'
+        )
+        tmpl = env.from_string('{{ "foo email@example.com bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="mailto:email@example.com">email@example.com</a> bar'
+        )
  
      def test_urlize_rel_policy(self):
          env = Environment()
@@ -361,6 +373,17 @@ class TestFilter:
              "http://www.example.com/</a> bar"
          )
  
+    def test_urlize_extra_uri_schemes_parameter(self, env):
+        tmpl = env.from_string(
+            '{{ "foo tel:+1-514-555-1234 ftp://localhost bar"|'
+            'urlize(extra_uri_schemes=["tel:", "ftp:"]) }}'
+        )
+        assert tmpl.render() == (
+            'foo <a href="tel:+1-514-555-1234" rel="noopener">'
+            'tel:+1-514-555-1234</a> <a href="ftp://localhost" rel="noopener">'
+            "ftp://localhost</a> bar"
+        )
+
      def test_wordcount(self, env):
          tmpl = env.from_string('{{ "foo bar baz"|wordcount }}')
          assert tmpl.render() == "3"
diff --git a/tests/test_regression.py b/tests/test_regression.py

index d052f43ecced1a4c5887aca6a7fc309b1bf4b321..21a6d922d4e1d902d4fb64de53151e9645184553 100644 (file)
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -109,6 +109,15 @@ class TestBug:
              "http://www.example.org/&lt;foo</a>"
          )
  
+    def test_urlize_filter_closing_punctuation(self, env):
+        tmpl = env.from_string(
+            '{{ "(see http://www.example.org/?page=subj_<desc.h>)"|urlize }}'
+        )
+        assert tmpl.render() == (
+            '(see <a href="http://www.example.org/?page=subj_&lt;desc.h&gt;" '
+            'rel="noopener">http://www.example.org/?page=subj_&lt;desc.h&gt;</a>)'
+        )
+
      def test_loop_call_loop(self, env):
          tmpl = env.from_string(
              """
author	Bebleo <james.warne@outlook.com>
	Sun, 19 Apr 2020 09:42:12 +0000 (05:42 -0400)
committer	David Lord <davidism@gmail.com>
	Sat, 30 Jan 2021 14:25:03 +0000 (06:25 -0800)
docs/api.rst		patch \| blob \| blame \| history
src/jinja2/defaults.py		patch \| blob \| blame \| history
src/jinja2/filters.py		patch \| blob \| blame \| history
src/jinja2/utils.py		patch \| blob \| blame \| history
tests/test_filters.py		patch \| blob \| blame \| history
tests/test_regression.py		patch \| blob \| blame \| history