From: Paul Buchheit
Date: Thu, 30 Sep 2010 21:41:13 +0000 (-0700)
Subject: add a linkify() function for converting text to html with link detection. Also change...
X-Git-Tag: v1.2.0~113
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f732f98063f8a0bf9f7e331876964bedbbdc8462;p=thirdparty%2Ftornado.git

add a linkify() function for converting text to html with link detection. Also changed
xhtml_escape to not convert its return value to utf8, since pages should be assembled in
unicode and not utf8 encoded until they are ready to write().
---

diff --git a/demos/chat/templates/message.html b/demos/chat/templates/message.html
index 4445cbdfa..20edbe7a6 100644
--- a/demos/chat/templates/message.html
+++ b/demos/chat/templates/message.html
@@ -1 +1,2 @@
{{ escape(message["from"]) }}: {{ escape(message["body"]) }}
+{% import tornado.escape %} +
{{ escape(message["from"]) }}: {{ tornado.escape.linkify(message["body"]) }}
diff --git a/tornado/escape.py b/tornado/escape.py
index 5d6d9ea78..174c71cea 100644
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -49,7 +49,7 @@ except:
 
 def xhtml_escape(value):
     """Escapes a string so it is valid within XML or XHTML."""
-    return utf8(xml.sax.saxutils.escape(value, {'"': "&quot;"}))
+    return xml.sax.saxutils.escape(value, {'"': "&quot;"})
 
 
 def xhtml_unescape(value):
@@ -95,6 +95,90 @@ def utf8(value):
     return value
 
 
+# Regex from http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+# Modified to capture protocol and to avoid HTML character entities other than &amp;
+_URL_RE = re.compile(ur"""(?i)\b((?:([a-z][\w-]+):(?:(/{1,3})|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>&]+|&amp;|\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\))+(?:\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\)|[^\s`!()\[\]{};:'".,<>?\xab\xbb\u201c\u201d\u2018\u2019&]))""")
+
+
+def linkify(text, shorten=False, extra_params="",
+            require_protocol=False, permitted_protocols=["http", "https"]):
+    """Converts plain text into HTML with links.
+
+    For example: linkify("Hello http://tornadoweb.org!") would return
+    Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!
+
+    Parameters:
+    shorten: Long urls will be shortened for display.
+    extra_params: Extra text to include in the link tag,
+        e.g. linkify(text, extra_params='rel="nofollow" class="external"')
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+    permitted_protocols: List (or set) of protocols which should be linkified,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
+    """
+    if extra_params:
+        extra_params = " " + extra_params.strip()
+
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # no protocol, no linkify
+
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href  # no proto specified, use http
+
+        params = extra_params
+
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                    parts[1][:8].split('?')[0].split('.')[0]
+
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+
+        return u'<a href="%s"%s>%s</a>' % (href, params, url)
+
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entities other than &amp; so
+    # that we won't pick up &quot;, etc.
+    text = _unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
+
+
 def _unicode(value):
     if isinstance(value, str):
         return value.decode("utf-8")
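
For reference, a usage sketch of the new linkify() options described in the docstring above (illustrative only, not part of the commit; the sample text and variable names are made up):

    import tornado.escape

    text = u"docs at www.tornadoweb.org and http://example.com/a/very/long/path?page=1"

    # Default: both urls become <a> tags; the bare www.* host gets an http:// href.
    html = tornado.escape.linkify(text)

    # Add attributes to every generated <a> tag.
    html = tornado.escape.linkify(text, extra_params='rel="nofollow" class="external"')

    # Only linkify urls that spell out a permitted protocol
    # (www.tornadoweb.org is left as plain text here).
    html = tornado.escape.linkify(text, require_protocol=True,
                                  permitted_protocols=["http", "https"])

    # Clip long urls in the visible link text; the full url stays in href
    # and is repeated in a title attribute.
    html = tornado.escape.linkify(text, shorten=True)
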