From: Paul Buchheit
Date: Thu, 30 Sep 2010 21:41:13 +0000 (-0700)
Subject: add a linkify() function for converting text to html with link detection. Also change...
X-Git-Tag: v1.2.0~113
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f732f98063f8a0bf9f7e331876964bedbbdc8462;p=thirdparty%2Ftornado.git

add a linkify() function for converting text to html with link detection. Also changed
xhtml_escape to not convert its return value to utf8, since pages should be assembled in
unicode and not utf8 encoded until they are ready to write().
---

diff --git a/demos/chat/templates/message.html b/demos/chat/templates/message.html
index 4445cbdfa..20edbe7a6 100644
--- a/demos/chat/templates/message.html
+++ b/demos/chat/templates/message.html
@@ -1 +1,2 @@
{{ escape(message["from"]) }}: {{ escape(message["body"]) }}
+{% import tornado.escape %} +
{{ escape(message["from"]) }}: {{ tornado.escape.linkify(message["body"]) }}
diff --git a/tornado/escape.py b/tornado/escape.py
index 5d6d9ea78..174c71cea 100644
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -49,7 +49,7 @@ except:
 
 def xhtml_escape(value):
     """Escapes a string so it is valid within XML or XHTML."""
-    return utf8(xml.sax.saxutils.escape(value, {'"': "&quot;"}))
+    return xml.sax.saxutils.escape(value, {'"': "&quot;"})
 
 
 def xhtml_unescape(value):
@@ -95,6 +95,90 @@ def utf8(value):
     return value
 
 
+# Regex from http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+# Modified to capture protocol and to avoid HTML character entities other than &amp;
+_URL_RE = re.compile(ur"""(?i)\b((?:([a-z][\w-]+):(?:(/{1,3})|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>&]+|&amp;|\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\))+(?:\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\)|[^\s`!()\[\]{};:'".,<>?\xab\xbb\u201c\u201d\u2018\u2019&]))""")
+
+
+def linkify(text, shorten=False, extra_params="",
+            require_protocol=False, permitted_protocols=["http", "https"]):
+    """Converts plain text into HTML with links.
+
+    For example: linkify("Hello http://tornadoweb.org!") would return
+    Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!
+
+    Parameters:
+    shorten: Long urls will be shortened for display.
+    extra_params: Extra text to include in the link tag,
+        e.g. linkify(text, extra_params='rel="nofollow" class="external"')
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+    permitted_protocols: List (or set) of protocols which should be linkified,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
+    """
+    if extra_params:
+        extra_params = " " + extra_params.strip()
+
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # no protocol, no linkify
+
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href  # no proto specified, use http
+
+        params = extra_params
+
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                    parts[1][:8].split('?')[0].split('.')[0]
+
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+
+        return u'<a href="%s"%s>%s</a>' % (href, params, url)
+
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entities other than &amp; so
+    # that we won't pick up &quot;, etc.
+    text = _unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
+
+
 def _unicode(value):
     if isinstance(value, str):
         return value.decode("utf-8")
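
For reference, a usage sketch of the new linkify() options described in the docstring above (illustrative only, not part of the commit; the sample text and variable names are made up):

    import tornado.escape

    text = u"docs at www.tornadoweb.org and http://example.com/a/very/long/path?page=1"

    # Default: both urls become <a> tags; the bare www.* host gets an http:// href.
    html = tornado.escape.linkify(text)

    # Add attributes to every generated <a> tag.
    html = tornado.escape.linkify(text, extra_params='rel="nofollow" class="external"')

    # Only linkify urls that spell out a permitted protocol
    # (www.tornadoweb.org is left as plain text here).
    html = tornado.escape.linkify(text, require_protocol=True,
                                  permitted_protocols=["http", "https"])

    # Clip long urls in the visible link text; the full url stays in href
    # and is repeated in a title attribute.
    html = tornado.escape.linkify(text, shorten=True)
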