def xhtml_escape(value):
"""Escapes a string so it is valid within XML or XHTML."""
- return utf8(xml.sax.saxutils.escape(value, {'"': """}))
+ return xml.sax.saxutils.escape(value, {'"': """})
def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
+# Regex from http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+# Modified to capture protocol and to avoid HTML character entities other than &amp;
+_URL_RE = re.compile(ur"""(?i)\b((?:([a-z][\w-]+):(?:(/{1,3})|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>&]+|&amp;|\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\))+(?:\(([^\s()<>&]+|(\([^\s()<>&]+\)))*\)|[^\s`!()\[\]{};:'".,<>?\xab\xbb\u201c\u201d\u2018\u2019&]))""")
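+# For reference: group 1 captures the full URL, group 2 the protocol (if
+# any), and group 3 the slashes after the colon; make_link() below relies on
+# these group numbers.  Entities other than "&amp;" (e.g. "&quot;") end a
+# match because "&" is excluded from the character classes.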
+
+
+def linkify(text, shorten=False, extra_params="",
+            require_protocol=False, permitted_protocols=["http", "https"]):
+    """Converts plain text into HTML with links.
+
+    For example: linkify("Hello http://tornadoweb.org!") would return
+    Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!
+
+    Parameters:
+    shorten: Long urls will be shortened for display.
+    extra_params: Extra text to include in the link tag,
+        e.g. linkify(text, extra_params='rel="nofollow" class="external"')
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+    permitted_protocols: List (or set) of protocols which should be linkified,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
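+
+    Example (illustrative):
+        linkify("See www.example.com", extra_params='rel="nofollow"')
+        would return something like
+        u'See <a href="http://www.example.com" rel="nofollow">www.example.com</a>'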
+ """
+    if extra_params:
+        extra_params = " " + extra_params.strip()
+
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # no protocol, no linkify
+
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href  # no proto specified, use http
+
+        params = extra_params
+
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                    parts[1][:8].split('?')[0].split('.')[0]
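+                # e.g. "http://example.com/some/long/path?q=1" is displayed
+                # as roughly "http://example.com/some..." once "..." is
+                # appended below, while href (and title) keep the full URL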
+
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+
+        return u'<a href="%s"%s>%s</a>' % (href, params, url)
+
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entities other than &amp; so
+    # that we won't pick up &quot;, etc.
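+    # e.g. an input of "http://example.com/?a=1&b=2" becomes
+    # "http://example.com/?a=1&amp;b=2" before matching, and the regex
+    # accepts the "&amp;" entity inside the URL.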
+    text = _unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
+
+
def _unicode(value):
    if isinstance(value, str):
        return value.decode("utf-8")