# www/webapp/markdown.py -- from ipfire.org.git (extracted from a gitweb
# "blame" page; last-touching commit d0d074e0 by MT: "Create a global
# webapp database connection and create a config class.")
1#!/usr/bin/env python
2# Copyright (c) 2007-2008 ActiveState Corp.
3# License: MIT (http://www.opensource.org/licenses/mit-license.php)
4
5r"""A fast and complete Python implementation of Markdown.
6
7[from http://daringfireball.net/projects/markdown/]
8> Markdown is a text-to-HTML filter; it translates an easy-to-read /
9> easy-to-write structured text format into HTML. Markdown's text
10> format is most similar to that of plain text email, and supports
11> features such as headers, *emphasis*, code blocks, blockquotes, and
12> links.
13>
14> Markdown's syntax is designed not as a generic markup language, but
15> specifically to serve as a front-end to (X)HTML. You can use span-level
16> HTML tags anywhere in a Markdown document, and you can use block level
17> HTML tags (like <div> and <table> as well).
18
19Module usage:
20
21 >>> import markdown2
22 >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
23 u'<p><em>boo!</em></p>\n'
24
25 >>> markdowner = Markdown()
26 >>> markdowner.convert("*boo!*")
27 u'<p><em>boo!</em></p>\n'
28 >>> markdowner.convert("**boom!**")
29 u'<p><strong>boom!</strong></p>\n'
30
31This implementation of Markdown implements the full "core" syntax plus a
32number of extras (e.g., code syntax coloring, footnotes) as described on
33<http://code.google.com/p/python-markdown2/wiki/Extras>.
34"""
35
# One-line description for the optparse-based command-line front end
# (presumably consumed by a main() entry point outside this chunk -- confirm).
cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.
"""
39
40# Dev Notes:
41# - There is already a Python markdown processor
42# (http://www.freewisdom.org/projects/python-markdown/).
43# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
44# not yet sure if there implications with this. Compare 'pydoc sre'
45# and 'perldoc perlre'.
46
# Module metadata. The first three version components track the
# Markdown.pl release this implementation corresponds to.
__version_info__ = (1, 0, 1, 14) # first three nums match Markdown.pl
__version__ = '1.0.1.14'
__author__ = "Trent Mick"
50
51import os
52import sys
53from pprint import pprint
54import re
55import logging
56try:
57 from hashlib import md5
58except ImportError:
59 from md5 import md5
60import optparse
61from random import random
62import codecs
63
64
65
66#---- Python version compat
67
# Python < 2.4 lacks the builtin `set` and `reversed`; provide fallbacks
# so the rest of the module can use them unconditionally.
if sys.version_info[:2] < (2,4):
    from sets import Set as set
    def reversed(sequence):
        # Generator equivalent of the 2.4+ builtin: yield items last-first.
        for i in sequence[::-1]:
            yield i
    def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
        # Decode byte string `s` to unicode.
        # NOTE(review): the default of 'xmlcharrefreplace' here differs from
        # the 'strict' default in the 2.4+ branch below -- confirm intentional.
        return unicode(s, encoding, errors)
else:
    def _unicode_decode(s, encoding, errors='strict'):
        # Decode byte string `s` to unicode (Python 2 `str.decode`).
        return s.decode(encoding, errors)
78
79
80#---- globals
81
DEBUG = False  # module-wide debug flag
log = logging.getLogger("markdown")  # module-level logger

# Number of spaces a tab expands to (see Markdown._detab).
DEFAULT_TAB_WIDTH = 4
86
87# Table of hash values for escaped characters:
88def _escape_hash(s):
89 # Lame attempt to avoid possible collision with someone actually
90 # using the MD5 hexdigest of one of these chars in there text.
91 # Other ideas: random.random(), uuid.uuid()
92 #return md5(s).hexdigest() # Markdown.pl effectively does this.
93 return 'md5-'+md5(s).hexdigest()
94g_escape_table = dict([(ch, _escape_hash(ch))
95 for ch in '\\`*_{}[]()>#+-.!'])
96
97
98
99#---- exceptions
100
class MarkdownError(Exception):
    """Raised on misuse of this module (e.g. an invalid `safe_mode` value)."""
    pass
103
104
105
106#---- public api
107
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Read the Markdown file at `path` (decoded with `encoding`) and
    return the converted HTML. All other options are passed through to
    the `Markdown` converter -- see its constructor for details.
    """
    # Close the file explicitly; the original one-liner leaked the handle.
    f = codecs.open(path, 'r', encoding)
    try:
        text = f.read()
    finally:
        f.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
117
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """Convert Markdown `text` to HTML and return it.

    Convenience wrapper: builds a `Markdown` converter with the given
    options and runs a single conversion.
    """
    converter = Markdown(html4tags=html4tags, tab_width=tab_width,
                         safe_mode=safe_mode, extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
125
class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-conversion state, (re)initialized by reset():
    urls = None         # link id -> URL (from stripped link definitions)
    titles = None       # link id -> title text (from stripped link definitions)
    html_blocks = None  # hash key -> raw block-level HTML
    html_spans = None   # hash key -> sanitized span-level HTML (safe_mode only)
    html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    # Matches lines containing only spaces/tabs; stripped early in convert().
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
146
147 def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
148 extras=None, link_patterns=None, use_file_vars=False):
149 if html4tags:
150 self.empty_element_suffix = ">"
151 else:
152 self.empty_element_suffix = " />"
153 self.tab_width = tab_width
154
155 # For compatibility with earlier markdown2.py and with
156 # markdown.py's safe_mode being a boolean,
157 # safe_mode == True -> "replace"
158 if safe_mode is True:
159 self.safe_mode = "replace"
160 else:
161 self.safe_mode = safe_mode
162
163 if self.extras is None:
164 self.extras = {}
165 elif not isinstance(self.extras, dict):
166 self.extras = dict([(e, None) for e in self.extras])
167 if extras:
168 if not isinstance(extras, dict):
169 extras = dict([(e, None) for e in extras])
170 self.extras.update(extras)
171 assert isinstance(self.extras, dict)
172 self._instance_extras = self.extras.copy()
173 self.link_patterns = link_patterns
174 self.use_file_vars = use_file_vars
175 self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
176
177 def reset(self):
178 self.urls = {}
179 self.titles = {}
180 self.html_blocks = {}
181 self.html_spans = {}
182 self.list_level = 0
183 self.extras = self._instance_extras.copy()
184 if "footnotes" in self.extras:
185 self.footnotes = {}
186 self.footnote_ids = []
187
    def convert(self, text):
        """Convert the given Markdown `text`; return the generated (X)HTML.

        `text` may be unicode or a UTF-8-encoded byte string. The result
        always ends with a single trailing newline.
        """
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints and merge any
            # "markdown-extras" they declare into self.extras.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass  # leave non-numeric extra args as strings
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        text += "\n"
        return text
262
    # Matches an emacs one-liner variables line, e.g. "-*- mode: text -*-".
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #     PREFIX Local Variables: SUFFIX
    #     PREFIX mode: Tcl SUFFIX
    #     PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    # - "\1" inside the content group requires each line to repeat the
    #   same prefix that introduced "Local Variables:".
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
278
    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables

        Only the first 8kB (for the "-*- ... -*-" one-liner) and last 8kB
        (for the "Local Variables:" block) of `text` are searched.
        """
        emacs_vars = {}
        SIZE = pow(2, 13) # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line. A trailing backslash
                # continues the value onto the following line.
                continued_for = None
                for line in lines[:-1]: # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):] # strip prefix
                    if suffix: line = line[:-len(suffix)] # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                        emacs_vars[variable] = value

        # Unquote values.
        for var, val in emacs_vars.items():
            # NOTE(review): both disjuncts below test for double quotes; the
            # second was presumably meant to be single quotes ("'"), so
            # single-quoted values are never unquoted -- confirm upstream.
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
               or val.startswith('"') and val.endswith('"')):
                emacs_vars[var] = val[1:-1]

        return emacs_vars
379
    # Cribbed from a post by Bart Lateur:
    # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
    # Matches the run of text before each tab (per line, with re.M).
    _detab_re = re.compile(r'(.*?)\t', re.M)
    def _detab_sub(self, match):
        # Replace the matched tab with spaces up to the next tab stop.
        g1 = match.group(1)
        return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
386 def _detab(self, text):
387 r"""Remove (leading?) tabs from a file.
388
389 >>> m = Markdown()
390 >>> m._detab("\tfoo")
391 ' foo'
392 >>> m._detab(" \tfoo")
393 ' foo'
394 >>> m._detab("\t foo")
395 ' foo'
396 >>> m._detab(" foo")
397 ' foo'
398 >>> m._detab(" foo\n\tbar\tblam")
399 ' foo\n bar blam'
400 """
401 if '\t' not in text:
402 return text
403 return self._detab_re.subn(self._detab_sub, text)[0]
404
    # Tag names treated as block-level HTML by the strict (nested) match.
    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    # Same tag list but without "ins" and "del"; used by the liberal match
    # (which allows the end tag to share a line with other content).
    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)
432
433 def _hash_html_block_sub(self, match, raw=False):
434 html = match.group(1)
435 if raw and self.safe_mode:
436 html = self._sanitize_html(html)
437 key = _hash_text(html)
438 self.html_blocks[key] = html
439 return "\n\n" + key + "\n\n"
440
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        Hashed blocks are stored in `self.html_blocks` and replaced in the
        returned text by their key surrounded by blank lines.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments (scanned manually,
        # not via regex):
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError, ex:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError, ex:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0 # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be following by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
548
549 def _strip_link_definitions(self, text):
550 # Strips link definitions from text, stores the URLs and titles in
551 # hash references.
552 less_than_tab = self.tab_width - 1
553
554 # Link defs are in the form:
555 # [id]: url "optional title"
556 _link_def_re = re.compile(r"""
557 ^[ ]{0,%d}\[(.+)\]: # id = \1
558 [ \t]*
559 \n? # maybe *one* newline
560 [ \t]*
561 <?(.+?)>? # url = \2
562 [ \t]*
563 (?:
564 \n? # maybe one newline
565 [ \t]*
566 (?<=\s) # lookbehind for whitespace
567 ['"(]
568 ([^\n]*) # title = \3
569 ['")]
570 [ \t]*
571 )? # title is optional
572 (?:\n+|\Z)
573 """ % less_than_tab, re.X | re.M | re.U)
574 return _link_def_re.sub(self._extract_link_def_sub, text)
575
576 def _extract_link_def_sub(self, match):
577 id, url, title = match.groups()
578 key = id.lower() # Link IDs are case-insensitive
579 self.urls[key] = self._encode_amps_and_angles(url)
580 if title:
581 self.titles[key] = title.replace('"', '&quot;')
582 return ""
583
584 def _extract_footnote_def_sub(self, match):
585 id, text = match.groups()
586 text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
587 normed_id = re.sub(r'\W', '-', id)
588 # Ensure footnote text ends with a couple newlines (for some
589 # block gamut matches).
590 self.footnotes[normed_id] = text + "\n\n"
591 return ""
592
593 def _strip_footnote_definitions(self, text):
594 """A footnote definition looks like this:
595
596 [^note-id]: Text of the note.
597
598 May include one or more indented paragraphs.
599
600 Where,
601 - The 'note-id' can be pretty much anything, though typically it
602 is the number of the footnote.
603 - The first paragraph may start on the next line, like so:
604
605 [^note-id]:
606 Text of the note.
607 """
608 less_than_tab = self.tab_width - 1
609 footnote_def_re = re.compile(r'''
610 ^[ ]{0,%d}\[\^(.+)\]: # id = \1
611 [ \t]*
612 ( # footnote text = \2
613 # First line need not start with the spaces.
614 (?:\s*.*\n+)
615 (?:
616 (?:[ ]{%d} | \t) # Subsequent lines must be indented.
617 .*\n+
618 )*
619 )
620 # Lookahead for non-space at line-start, or end of doc.
621 (?:(?=^[ ]{0,%d}\S)|\Z)
622 ''' % (less_than_tab, self.tab_width, self.tab_width),
623 re.X | re.M)
624 return footnote_def_re.sub(self._extract_footnote_def_sub, text)
625
626
    # Horizontal rules: three or more "*", "-" or "_" on their own line,
    # optionally space-separated, with at most two leading spaces.
    _hr_res = [
        re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M),
    ]
632
633 def _run_block_gamut(self, text):
634 # These are all the transformations that form block-level
635 # tags like paragraphs, headers, and list items.
636
637 text = self._do_headers(text)
638
639 # Do Horizontal Rules:
640 hr = "\n<hr"+self.empty_element_suffix+"\n"
641 for hr_re in self._hr_res:
642 text = hr_re.sub(hr, text)
643
644 text = self._do_lists(text)
645
646 if "pyshell" in self.extras:
647 text = self._prepare_pyshell_blocks(text)
648
649 text = self._do_code_blocks(text)
650
651 text = self._do_block_quotes(text)
652
653 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
654 # was to escape raw HTML in the original Markdown source. This time,
655 # we're escaping the markup we've just created, so that we don't wrap
656 # <p> tags around block-level tags.
657 text = self._hash_html_blocks(text)
658
659 text = self._form_paragraphs(text)
660
661 return text
662
663 def _pyshell_block_sub(self, match):
664 lines = match.group(0).splitlines(0)
665 _dedentlines(lines)
666 indent = ' ' * self.tab_width
667 s = ('\n' # separate from possible cuddled paragraph
668 + indent + ('\n'+indent).join(lines)
669 + '\n\n')
670 return s
671
672 def _prepare_pyshell_blocks(self, text):
673 """Ensure that Python interactive shell sessions are put in
674 code blocks -- even if not properly indented.
675 """
676 if ">>>" not in text:
677 return text
678
679 less_than_tab = self.tab_width - 1
680 _pyshell_block_re = re.compile(r"""
681 ^([ ]{0,%d})>>>[ ].*\n # first line
682 ^(\1.*\S+.*\n)* # any number of subsequent lines
683 ^\n # ends with a blank line
684 """ % less_than_tab, re.M | re.X)
685
686 return _pyshell_block_re.sub(self._pyshell_block_sub, text)
687
688 def _run_span_gamut(self, text):
689 # These are all the transformations that occur *within* block-level
690 # tags like paragraphs, headers, and list items.
691
692 text = self._do_code_spans(text)
693
694 text = self._escape_special_chars(text)
695
696 # Process anchor and image tags.
697 text = self._do_links(text)
698
699 # Make links out of things like `<http://example.com/>`
700 # Must come after _do_links(), because you can use < and >
701 # delimiters in inline links like [this](<url>).
702 text = self._do_auto_links(text)
703
704 if "link-patterns" in self.extras:
705 text = self._do_link_patterns(text)
706
707 text = self._encode_amps_and_angles(text)
708
709 text = self._do_italics_and_bold(text)
710
711 # Do hard breaks:
712 text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
713
714 return text
715
    # "Sorta" because auto-links are identified as "tag" tokens.
    # Used with re.split(): since the whole pattern is one capture group,
    # the result alternates non-markup and markup tokens.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)
733
734 def _escape_special_chars(self, text):
735 # Python markdown note: the HTML tokenization here differs from
736 # that in Markdown.pl, hence the behaviour for subtle cases can
737 # differ (I believe the tokenizer here does a better job because
738 # it isn't susceptible to unmatched '<' and '>' in HTML tags).
739 # Note, however, that '>' is not allowed in an auto-link URL
740 # here.
741 escaped = []
742 is_html_markup = False
743 for token in self._sorta_html_tokenize_re.split(text):
744 if is_html_markup:
745 # Within tags/HTML-comments/auto-links, encode * and _
746 # so they don't conflict with their use in Markdown for
747 # italics and strong. We're replacing each such
748 # character with its corresponding MD5 checksum value;
749 # this is likely overkill, but it should prevent us from
750 # colliding with the escape values by accident.
751 escaped.append(token.replace('*', g_escape_table['*'])
752 .replace('_', g_escape_table['_']))
753 else:
754 escaped.append(self._encode_backslash_escapes(token))
755 is_html_markup = not is_html_markup
756 return ''.join(escaped)
757
758 def _hash_html_spans(self, text):
759 # Used for safe_mode.
760
761 def _is_auto_link(s):
762 if ':' in s and self._auto_link_re.match(s):
763 return True
764 elif '@' in s and self._auto_email_link_re.match(s):
765 return True
766 return False
767
768 tokens = []
769 is_html_markup = False
770 for token in self._sorta_html_tokenize_re.split(text):
771 if is_html_markup and not _is_auto_link(token):
772 sanitized = self._sanitize_html(token)
773 key = _hash_text(sanitized)
774 self.html_spans[key] = sanitized
775 tokens.append(key)
776 else:
777 tokens.append(token)
778 is_html_markup = not is_html_markup
779 return ''.join(tokens)
780
781 def _unhash_html_spans(self, text):
782 for key, sanitized in self.html_spans.items():
783 text = text.replace(key, sanitized)
784 return text
785
786 def _sanitize_html(self, s):
787 if self.safe_mode == "replace":
788 return self.html_removed_text
789 elif self.safe_mode == "escape":
790 replacements = [
791 ('&', '&amp;'),
792 ('<', '&lt;'),
793 ('>', '&gt;'),
794 ]
795 for before, after in replacements:
796 s = s.replace(before, after)
797 return s
798 else:
799 raise MarkdownError("invalid value for 'safe_mode': %r (must be "
800 "'escape' or 'replace')" % self.safe_mode)
801
    # Matches the "(url "title")" tail of an inline link, starting at the
    # char after the closing ']' (used with .match(text, pos) in _do_links).
    _tail_of_inline_link_re = re.compile(r'''
          # Match tail of: [text](/url/) or [text](/url/ "title")
          \(            # literal paren
          [ \t]*
          (?P<url>            # \1
              <.*?>
              |
              .*?
          )
          [ \t]*
          (                   # \2
            (['"])            # quote char = \3
            (?P<title>.*?)
            \3                # matching quote
          )?                  # title is optional
          \)
        ''', re.X | re.S)
    # Matches the "[id]" tail of a reference link, same usage as above.
    _tail_of_reference_link_re = re.compile(r'''
          # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
          \[
          (?P<id>.*?)
          \]
        ''', re.X | re.S)
827
    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        curr_pos = 0
        while True: # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            try:
                start_idx = text.index('[', curr_pos)
            except ValueError:
                break
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                ch = text[p]
                if ch == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif ch == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
                continue
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                else:
                    # This id isn't defined, leave the markup alone.
                    curr_pos = p+1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p == text_length:
                return text

            # Inline anchor or img?
            if text[p] == '(': # attempt at perf improvement
                match = self._tail_of_inline_link_re.match(text, p)
                if match:
                    # Handle an inline anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1

                    url, title = match.group("url"), match.group("title")
                    if url and url[0] == '<':
                        url = url[1:-1]  # '<url>' -> 'url'
                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', g_escape_table['*']) \
                             .replace('_', g_escape_table['_'])
                    if title:
                        title_str = ' title="%s"' \
                            % title.replace('*', g_escape_table['*']) \
                                   .replace('_', g_escape_table['_']) \
                                   .replace('"', '&quot;')
                    else:
                        title_str = ''
                    if is_img:
                        result = '<img src="%s" alt="%s"%s%s' \
                            % (url, link_text.replace('"', '&quot;'),
                               title_str, self.empty_element_suffix)
                        curr_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    elif start_idx >= anchor_allowed_pos:
                        result_head = '<a href="%s"%s>' % (url, title_str)
                        result = '%s%s</a>' % (result_head, link_text)
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    else:
                        # Anchor not allowed here.
                        curr_pos = start_idx + 1
                    continue

            # Reference anchor or img?
            else:
                match = self._tail_of_reference_link_re.match(text, p)
                if match:
                    # Handle a reference-style anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1
                    link_id = match.group("id").lower()
                    if not link_id:
                        link_id = link_text.lower()  # for links like [this][]
                    if link_id in self.urls:
                        url = self.urls[link_id]
                        # We've got to encode these to avoid conflicting
                        # with italics/bold.
                        url = url.replace('*', g_escape_table['*']) \
                                 .replace('_', g_escape_table['_'])
                        title = self.titles.get(link_id)
                        if title:
                            title = title.replace('*', g_escape_table['*']) \
                                         .replace('_', g_escape_table['_'])
                            title_str = ' title="%s"' % title
                        else:
                            title_str = ''
                        if is_img:
                            result = '<img src="%s" alt="%s"%s%s' \
                                % (url, link_text.replace('"', '&quot;'),
                                   title_str, self.empty_element_suffix)
                            curr_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        elif start_idx >= anchor_allowed_pos:
                            # NOTE(review): the first `result` assignment below
                            # is dead code -- it is immediately rebuilt from
                            # `result_head` two lines down. Safe to remove.
                            result = '<a href="%s"%s>%s</a>' \
                                % (url, title_str, link_text)
                            result_head = '<a href="%s"%s>' % (url, title_str)
                            result = '%s%s</a>' % (result_head, link_text)
                            # <img> allowed from curr_pos on, <a> from
                            # anchor_allowed_pos on.
                            curr_pos = start_idx + len(result_head)
                            anchor_allowed_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        else:
                            # Anchor not allowed here.
                            curr_pos = start_idx + 1
                    else:
                        # This id isn't defined, leave the markup alone.
                        curr_pos = match.end()
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1

        return text
1002
1003
1004 _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
1005 def _setext_h_sub(self, match):
1006 n = {"=": 1, "-": 2}[match.group(2)[0]]
1007 demote_headers = self.extras.get("demote-headers")
1008 if demote_headers:
1009 n = min(n + demote_headers, 6)
1010 return "<h%d>%s</h%d>\n\n" \
1011 % (n, self._run_span_gamut(match.group(1)), n)
1012
1013 _atx_h_re = re.compile(r'''
1014 ^(\#{1,6}) # \1 = string of #'s
1015 [ \t]*
1016 (.+?) # \2 = Header text
1017 [ \t]*
1018 (?<!\\) # ensure not an escaped trailing '#'
1019 \#* # optional closing #'s (not counted)
1020 \n+
1021 ''', re.X | re.M)
1022 def _atx_h_sub(self, match):
1023 n = len(match.group(1))
1024 demote_headers = self.extras.get("demote-headers")
1025 if demote_headers:
1026 n = min(n + demote_headers, 6)
1027 return "<h%d>%s</h%d>\n\n" \
1028 % (n, self._run_span_gamut(match.group(2)), n)
1029
1030 def _do_headers(self, text):
1031 # Setext-style headers:
1032 # Header 1
1033 # ========
1034 #
1035 # Header 2
1036 # --------
1037 text = self._setext_h_re.sub(self._setext_h_sub, text)
1038
1039 # atx-style headers:
1040 # # Header 1
1041 # ## Header 2
1042 # ## Header 2 with closing hashes ##
1043 # ...
1044 # ###### Header 6
1045 text = self._atx_h_re.sub(self._atx_h_sub, text)
1046
1047 return text
1048
1049
1050 _marker_ul_chars = '*+-'
1051 _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1052 _marker_ul = '(?:[%s])' % _marker_ul_chars
1053 _marker_ol = r'(?:\d+\.)'
1054
1055 def _list_sub(self, match):
1056 lst = match.group(1)
1057 lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1058 result = self._process_list_items(lst)
1059 if self.list_level:
1060 return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1061 else:
1062 return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1063
    def _do_lists(self, text):
        # Form HTML ordered (numbered) and unordered (bulleted) lists.

        # Run the transform twice: once for bulleted markers, once for
        # numbered ones (see the long comment below for why).
        for marker_pat in (self._marker_ul, self._marker_ol):
            # Re-usable pattern to match any entire ul or ol list:
            less_than_tab = self.tab_width - 1
            whole_list = r'''
                (                   # \1 = whole list
                  (                 # \2
                    [ ]{0,%d}
                    (%s)            # \3 = first list item marker
                    [ \t]+
                  )
                  (?:.+?)
                  (                 # \4
                      \Z
                    |
                      \n{2,}
                      (?=\S)
                      (?!           # Negative lookahead for another list item marker
                        [ \t]*
                        %s[ \t]+
                      )
                  )
                )
            ''' % (less_than_tab, marker_pat, marker_pat)

            # We use a different prefix before nested lists than top-level lists.
            # See extended comment in _process_list_items().
            #
            # Note: There's a bit of duplication here. My original implementation
            # created a scalar regex pattern as the conditional result of the test on
            # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
            # substitution once, using the scalar as the pattern. This worked,
            # everywhere except when running under MT on my hosting account at Pair
            # Networks. There, this caused all rebuilds to be killed by the reaper (or
            # perhaps they crashed, but that seems incredibly unlikely given that the
            # same script on the same server ran fine *except* under MT. I've spent
            # more time trying to figure out why this is happening than I'd like to
            # admit. My only guess, backed up by the fact that this workaround works,
            # is that Perl optimizes the substition when it can figure out that the
            # pattern will never change, and when this optimization isn't on, we run
            # afoul of the reaper. Thus, the slightly redundant code to that uses two
            # static s/// patterns rather than one conditional pattern.

            if self.list_level:
                # Already inside a list: a sub-list may start at any line start.
                sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                text = sub_list_re.sub(self._list_sub, text)
            else:
                # Top level: a list must follow a blank line (or open the doc).
                list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                     re.X | re.M | re.S)
                text = list_re.sub(self._list_sub, text)

        return text
1118
1119 _list_item_re = re.compile(r'''
1120 (\n)? # leading line = \1
1121 (^[ \t]*) # leading whitespace = \2
1122 (%s) [ \t]+ # list marker = \3
1123 ((?:.+?) # list item text = \4
1124 (\n{1,2})) # eols = \5
1125 (?= \n* (\Z | \2 (%s) [ \t]+))
1126 ''' % (_marker_any, _marker_any),
1127 re.M | re.X | re.S)
1128
1129 _last_li_endswith_two_eols = False
1130 def _list_item_sub(self, match):
1131 item = match.group(4)
1132 leading_line = match.group(1)
1133 leading_space = match.group(2)
1134 if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1135 item = self._run_block_gamut(self._outdent(item))
1136 else:
1137 # Recursion for sub-lists:
1138 item = self._do_lists(self._outdent(item))
1139 if item.endswith('\n'):
1140 item = item[:-1]
1141 item = self._run_span_gamut(item)
1142 self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1143 return "<li>%s</li>\n" % item
1144
    def _process_list_items(self, list_str):
        # Process the contents of a single ordered or unordered list,
        # splitting it into individual list items.

        # The $g_list_level global keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #       I recommend upgrading to version
        #       8. Oops, now this line is treated
        #       as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".
        self.list_level += 1
        self._last_li_endswith_two_eols = False
        # Normalize to exactly one trailing newline before item matching.
        list_str = list_str.rstrip('\n') + '\n'
        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
        self.list_level -= 1
        return list_str
1175
1176 def _get_pygments_lexer(self, lexer_name):
1177 try:
1178 from pygments import lexers, util
1179 except ImportError:
1180 return None
1181 try:
1182 return lexers.get_lexer_by_name(lexer_name)
1183 except util.ClassNotFound:
1184 return None
1185
1186 def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1187 import pygments
1188 import pygments.formatters
1189
1190 class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1191 def _wrap_code(self, inner):
1192 """A function for use in a Pygments Formatter which
1193 wraps in <code> tags.
1194 """
1195 yield 0, "<code>"
1196 for tup in inner:
1197 yield tup
1198 yield 0, "</code>"
1199
1200 def wrap(self, source, outfile):
1201 """Return the source with a code, pre, and div."""
1202 return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1203
1204 formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
1205 return pygments.highlight(codeblock, lexer, formatter)
1206
1207 def _code_block_sub(self, match):
1208 codeblock = match.group(1)
1209 codeblock = self._outdent(codeblock)
1210 codeblock = self._detab(codeblock)
1211 codeblock = codeblock.lstrip('\n') # trim leading newlines
1212 codeblock = codeblock.rstrip() # trim trailing whitespace
1213
1214 if "code-color" in self.extras and codeblock.startswith(":::"):
1215 lexer_name, rest = codeblock.split('\n', 1)
1216 lexer_name = lexer_name[3:].strip()
1217 lexer = self._get_pygments_lexer(lexer_name)
1218 codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1219 if lexer:
1220 formatter_opts = self.extras['code-color'] or {}
1221 colored = self._color_with_pygments(codeblock, lexer,
1222 **formatter_opts)
1223 return "\n\n%s\n\n" % colored
1224
1225 codeblock = self._encode_code(codeblock)
1226 return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock
1227
1228 def _do_code_blocks(self, text):
1229 """Process Markdown `<pre><code>` blocks."""
1230 code_block_re = re.compile(r'''
1231 (?:\n\n|\A)
1232 ( # $1 = the code block -- one or more lines, starting with a space/tab
1233 (?:
1234 (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1235 .*\n+
1236 )+
1237 )
1238 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1239 ''' % (self.tab_width, self.tab_width),
1240 re.M | re.X)
1241
1242 return code_block_re.sub(self._code_block_sub, text)
1243
1244
1245 # Rules for a code span:
1246 # - backslash escapes are not interpreted in a code span
1247 # - to include one or or a run of more backticks the delimiters must
1248 # be a longer run of backticks
1249 # - cannot start or end a code span with a backtick; pad with a
1250 # space and that space will be removed in the emitted HTML
1251 # See `test/tm-cases/escapes.text` for a number of edge-case
1252 # examples.
1253 _code_span_re = re.compile(r'''
1254 (?<!\\)
1255 (`+) # \1 = Opening run of `
1256 (?!`) # See Note A test/tm-cases/escapes.text
1257 (.+?) # \2 = The code block
1258 (?<!`)
1259 \1 # Matching closer
1260 (?!`)
1261 ''', re.X | re.S)
1262
1263 def _code_span_sub(self, match):
1264 c = match.group(2).strip(" \t")
1265 c = self._encode_code(c)
1266 return "<code>%s</code>" % c
1267
1268 def _do_code_spans(self, text):
1269 # * Backtick quotes are used for <code></code> spans.
1270 #
1271 # * You can use multiple backticks as the delimiters if you want to
1272 # include literal backticks in the code span. So, this input:
1273 #
1274 # Just type ``foo `bar` baz`` at the prompt.
1275 #
1276 # Will translate to:
1277 #
1278 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1279 #
1280 # There's no arbitrary limit to the number of backticks you
1281 # can use as delimters. If you need three consecutive backticks
1282 # in your code, use four for delimiters, etc.
1283 #
1284 # * You can use spaces to get literal backticks at the edges:
1285 #
1286 # ... type `` `bar` `` ...
1287 #
1288 # Turns to:
1289 #
1290 # ... type <code>`bar`</code> ...
1291 return self._code_span_re.sub(self._code_span_sub, text)
1292
1293 def _encode_code(self, text):
1294 """Encode/escape certain characters inside Markdown code runs.
1295 The point is that in code, these characters are literals,
1296 and lose their special Markdown meanings.
1297 """
1298 replacements = [
1299 # Encode all ampersands; HTML entities are not
1300 # entities within a Markdown code span.
1301 ('&', '&amp;'),
1302 # Do the angle bracket song and dance:
1303 ('<', '&lt;'),
1304 ('>', '&gt;'),
1305 # Now, escape characters that are magic in Markdown:
1306 ('*', g_escape_table['*']),
1307 ('_', g_escape_table['_']),
1308 ('{', g_escape_table['{']),
1309 ('}', g_escape_table['}']),
1310 ('[', g_escape_table['[']),
1311 (']', g_escape_table[']']),
1312 ('\\', g_escape_table['\\']),
1313 ]
1314 for before, after in replacements:
1315 text = text.replace(before, after)
1316 return text
1317
1318 _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1319 _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1320 _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1321 _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1322 def _do_italics_and_bold(self, text):
1323 # <strong> must go first:
1324 if "code-friendly" in self.extras:
1325 text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1326 text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1327 else:
1328 text = self._strong_re.sub(r"<strong>\2</strong>", text)
1329 text = self._em_re.sub(r"<em>\2</em>", text)
1330 return text
1331
1332
1333 _block_quote_re = re.compile(r'''
1334 ( # Wrap whole match in \1
1335 (
1336 ^[ \t]*>[ \t]? # '>' at the start of a line
1337 .+\n # rest of the first line
1338 (.+\n)* # subsequent consecutive lines
1339 \n* # blanks
1340 )+
1341 )
1342 ''', re.M | re.X)
1343 _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1344
1345 _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1346 def _dedent_two_spaces_sub(self, match):
1347 return re.sub(r'(?m)^ ', '', match.group(1))
1348
1349 def _block_quote_sub(self, match):
1350 bq = match.group(1)
1351 bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1352 bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1353 bq = self._run_block_gamut(bq) # recurse
1354
1355 bq = re.sub('(?m)^', ' ', bq)
1356 # These leading spaces screw with <pre> content, so we need to fix that:
1357 bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1358
1359 return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1360
1361 def _do_block_quotes(self, text):
1362 if '>' not in text:
1363 return text
1364 return self._block_quote_re.sub(self._block_quote_sub, text)
1365
1366 def _form_paragraphs(self, text):
1367 # Strip leading and trailing lines:
1368 text = text.strip('\n')
1369
1370 # Wrap <p> tags.
1371 grafs = re.split(r"\n{2,}", text)
1372 for i, graf in enumerate(grafs):
1373 if graf in self.html_blocks:
1374 # Unhashify HTML blocks
1375 grafs[i] = self.html_blocks[graf]
1376 else:
1377 # Wrap <p> tags.
1378 graf = self._run_span_gamut(graf)
1379 grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"
1380
1381 return "\n\n".join(grafs)
1382
    def _add_footnotes(self, text):
        """Append the collected footnote definitions (if any) to `text`
        as a <div class="footnotes"> block with back-links into the text.
        """
        if self.footnotes:
            footer = [
                '<div class="footnotes">',
                '<hr' + self.empty_element_suffix,
                '<ol>',
            ]
            for i, id in enumerate(self.footnote_ids):
                if i != 0:
                    footer.append('')  # blank line between footnote items
                footer.append('<li id="fn-%s">' % id)
                footer.append(self._run_block_gamut(self.footnotes[id]))
                backlink = ('<a href="#fnref-%s" '
                            'class="footnoteBackLink" '
                            'title="Jump back to footnote %d in the text.">'
                            '&#8617;</a>' % (id, i+1))
                if footer[-1].endswith("</p>"):
                    # Tuck the back-link inside the footnote's last paragraph.
                    footer[-1] = footer[-1][:-len("</p>")] \
                        + '&nbsp;' + backlink + "</p>"
                else:
                    footer.append("\n<p>%s</p>" % backlink)
                footer.append('</li>')
            footer.append('</ol>')
            footer.append('</div>')
            return text + '\n\n' + '\n'.join(footer)
        else:
            return text
1410
1411 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1412 # http://bumppo.net/projects/amputator/
1413 _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1414 _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1415 _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1416
1417 def _encode_amps_and_angles(self, text):
1418 # Smart processing for ampersands and angle brackets that need
1419 # to be encoded.
1420 text = self._ampersand_re.sub('&amp;', text)
1421
1422 # Encode naked <'s
1423 text = self._naked_lt_re.sub('&lt;', text)
1424
1425 # Encode naked >'s
1426 # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1427 # Markdown) don't do this.
1428 text = self._naked_gt_re.sub('&gt;', text)
1429 return text
1430
1431 def _encode_backslash_escapes(self, text):
1432 for ch, escape in g_escape_table.items():
1433 text = text.replace("\\"+ch, escape)
1434 return text
1435
1436 _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1437 def _auto_link_sub(self, match):
1438 g1 = match.group(1)
1439 return '<a href="%s">%s</a>' % (g1, g1)
1440
1441 _auto_email_link_re = re.compile(r"""
1442 <
1443 (?:mailto:)?
1444 (
1445 [-.\w]+
1446 \@
1447 [-\w]+(\.[-\w]+)*\.[a-z]+
1448 )
1449 >
1450 """, re.I | re.X | re.U)
1451 def _auto_email_link_sub(self, match):
1452 return self._encode_email_address(
1453 self._unescape_special_chars(match.group(1)))
1454
1455 def _do_auto_links(self, text):
1456 text = self._auto_link_re.sub(self._auto_link_sub, text)
1457 text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1458 return text
1459
1460 def _encode_email_address(self, addr):
1461 # Input: an email address, e.g. "foo@example.com"
1462 #
1463 # Output: the email address as a mailto link, with each character
1464 # of the address encoded as either a decimal or hex entity, in
1465 # the hopes of foiling most address harvesting spam bots. E.g.:
1466 #
1467 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1468 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1469 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1470 #
1471 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1472 # mailing list: <http://tinyurl.com/yu7ue>
1473 chars = [_xml_encode_email_char_at_random(ch)
1474 for ch in "mailto:" + addr]
1475 # Strip the mailto: from the visible part.
1476 addr = '<a href="%s">%s</a>' \
1477 % (''.join(chars), ''.join(chars[7:]))
1478 return addr
1479
    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        # self.link_patterns is a list of (compiled-regex,
        # href-template-or-callable) pairs supplied by the caller.
        link_from_hash = {}
        for regex, repl in self.link_patterns:
            replacements = []
            for match in regex.finditer(text):
                if hasattr(repl, "__call__"):
                    href = repl(match)
                else:
                    href = match.expand(repl)
                replacements.append((match.span(), href))
            # Splice from the end backwards so earlier spans' offsets
            # remain valid as the text changes length.
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', g_escape_table['*'])
                        .replace('_', g_escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                # Hash the built link out of the way so later span
                # processing can't mangle it; swapped back in below.
                hash = md5(link).hexdigest()
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        for hash, link in link_from_hash.items():
            text = text.replace(hash, link)
        return text
1510
1511 def _unescape_special_chars(self, text):
1512 # Swap back in all the special characters we've hidden.
1513 for ch, hash in g_escape_table.items():
1514 text = text.replace(hash, ch)
1515 return text
1516
1517 def _outdent(self, text):
1518 # Remove one level of line-leading tabs or spaces
1519 return self._outdent_re.sub('', text)
1520
1521
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # NOTE(review): class-level default consumed by the Markdown base
    # class; presumably combined with any `extras` passed to the
    # constructor -- confirm against Markdown.__init__.
    extras = ["footnotes", "code-color"]
1535
1536
1537#---- internal support functions
1538
1539# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1540def _curry(*args, **kwargs):
1541 function, args = args[0], args[1:]
1542 def result(*rest, **kwrest):
1543 combined = kwargs.copy()
1544 combined.update(kwrest)
1545 return function(*args + rest, **combined)
1546 return result
1547
1548# Recipe: regex_from_encoded_pattern (1.0)
1549def _regex_from_encoded_pattern(s):
1550 """'foo' -> re.compile(re.escape('foo'))
1551 '/foo/' -> re.compile('foo')
1552 '/foo/i' -> re.compile('foo', re.I)
1553 """
1554 if s.startswith('/') and s.rfind('/') != 0:
1555 # Parse it: /PATTERN/FLAGS
1556 idx = s.rfind('/')
1557 pattern, flags_str = s[1:idx], s[idx+1:]
1558 flag_from_char = {
1559 "i": re.IGNORECASE,
1560 "l": re.LOCALE,
1561 "s": re.DOTALL,
1562 "m": re.MULTILINE,
1563 "u": re.UNICODE,
1564 }
1565 flags = 0
1566 for char in flags_str:
1567 try:
1568 flags |= flag_from_char[char]
1569 except KeyError:
1570 raise ValueError("unsupported regex flag: '%s' in '%s' "
1571 "(must be one of '%s')"
1572 % (char, s, ''.join(flag_from_char.keys())))
1573 return re.compile(s[1:idx], flags)
1574 else: # not an encoded regex
1575 return re.compile(re.escape(s))
1576
1577# Recipe: dedent (0.1.2)
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line)
    # Pass 1: find the smallest indent (the margin) over all non-blank
    # lines, expanding tabs to the next tab stop.
    indents = []
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print "dedent: indent=%d: %r" % (indent, line)
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print "dedent: margin=%r" % margin

    # Pass 2: strip `margin` columns of leading whitespace from each
    # line. A tab may straddle the margin boundary, in which case the
    # overshoot is re-padded with spaces.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print "dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin)
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # Tab expanded past the margin: keep the overshoot as spaces.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
1647
def _dedent(text, tabsize=8, skip_first_line=False):
    """Like textwrap.dedent(), but don't expand tabs to spaces.

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.
    """
    lines = text.splitlines(True)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
1662
1663
1664class _memoized(object):
1665 """Decorator that caches a function's return value each time it is called.
1666 If called later with the same arguments, the cached value is returned, and
1667 not re-evaluated.
1668
1669 http://wiki.python.org/moin/PythonDecoratorLibrary
1670 """
1671 def __init__(self, func):
1672 self.func = func
1673 self.cache = {}
1674 def __call__(self, *args):
1675 try:
1676 return self.cache[args]
1677 except KeyError:
1678 self.cache[args] = value = self.func(*args)
1679 return value
1680 except TypeError:
1681 # uncachable -- for instance, passing a list as an argument.
1682 # Better to not cache than to blow up entirely.
1683 return self.func(*args)
1684 def __repr__(self):
1685 """Return the function's docstring."""
1686 return self.func.__doc__
1687
1688
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # Matches a one-line XML PI or namespaced single tag standing alone
    # as a "paragraph" (indented less than one tab stop).
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize per tab_width: the pattern only depends on that one argument.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
1709
def _hr_tag_re_from_tab_width(tab_width):
    # Matches a standalone <hr> tag (indented less than one tab stop)
    # surrounded by blank lines.
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize per tab_width: the pattern only depends on that one argument.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
1728
1729
1730def _xml_encode_email_char_at_random(ch):
1731 r = random()
1732 # Roughly 10% raw, 45% hex, 45% dec.
1733 # '@' *must* be encoded. I [John Gruber] insist.
1734 # Issue 26: '_' must be encoded.
1735 if r > 0.9 and ch not in "@_":
1736 return ch
1737 elif r < 0.45:
1738 # The [1:] is to drop leading '0': 0x63 -> x63
1739 return '&#%s;' % hex(ord(ch))[1:]
1740 else:
1741 return '&#%s;' % ord(ch)
1742
1743def _hash_text(text):
1744 return 'md5:'+md5(text.encode("utf-8")).hexdigest()
1745
1746
1747#---- mainline
1748
1749class _NoReflowFormatter(optparse.IndentedHelpFormatter):
1750 """An optparse formatter that does NOT reflow the description."""
1751 def format_description(self, description):
1752 return description or ""
1753
def _test():
    """Run this module's doctests."""
    import doctest
    doctest.testmod()
1757
def main(argv=None):
    """Command-line entry point: convert each PATH argument to HTML on
    stdout, optionally comparing the output against Markdown.pl.
    """
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    # ---- Command-line interface setup.
    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). Supported values: "
                           "'code-friendly' disables _/__ for emphasis; "
                           "'code-color' adds code-block syntax coloring; "
                           "'link-patterns' adds auto-linking based on patterns; "
                           "'footnotes' adds the footnotes syntax;"
                           "'xml' passes one-liner processing instructions and namespaced XML tags;"
                           "'pyshell' to put unindented Python interactive shell sessions in a <code> block.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<http://code.google.com/p/python-markdown2/wiki/Extras>.")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # ---- Parse "-x name[=value]" extras into a dict (value int or None).
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # ---- Load "PATTERN HREF" pairs from the link-patterns file, if given.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    # ---- Convert each path; with --compare, also run Markdown.pl on it.
    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    for path in paths:
        if opts.compare:
            print "==== Markdown.pl ===="
            perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
            o = os.popen(perl_cmd)
            perl_html = o.read()
            o.close()
            sys.stdout.write(perl_html)
            print "==== markdown2.py ===="
        html = markdown_path(path, encoding=opts.encoding,
                             html4tags=opts.html4tags,
                             safe_mode=opts.safe_mode,
                             extras=extras, link_patterns=link_patterns,
                             use_file_vars=opts.use_file_vars)
        sys.stdout.write(
            html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if opts.compare:
            # Normalize both outputs with the test helper, if available,
            # before reporting whether they match.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print "==== match? %r ====" % (norm_perl_html == norm_html)
1873
1874
if __name__ == "__main__":
    sys.exit(main(sys.argv))
1877