]>
Commit | Line | Data |
---|---|---|
d0d074e0 MT |
1 | #!/usr/bin/env python |
2 | # Copyright (c) 2007-2008 ActiveState Corp. | |
3 | # License: MIT (http://www.opensource.org/licenses/mit-license.php) | |
4 | ||
5 | r"""A fast and complete Python implementation of Markdown. | |
6 | ||
7 | [from http://daringfireball.net/projects/markdown/] | |
8 | > Markdown is a text-to-HTML filter; it translates an easy-to-read / | |
9 | > easy-to-write structured text format into HTML. Markdown's text | |
10 | > format is most similar to that of plain text email, and supports | |
11 | > features such as headers, *emphasis*, code blocks, blockquotes, and | |
12 | > links. | |
13 | > | |
14 | > Markdown's syntax is designed not as a generic markup language, but | |
15 | > specifically to serve as a front-end to (X)HTML. You can use span-level | |
16 | > HTML tags anywhere in a Markdown document, and you can use block level | |
17 | > HTML tags (like <div> and <table> as well). | |
18 | ||
19 | Module usage: | |
20 | ||
21 | >>> import markdown2 | |
22 | >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)` | |
23 | u'<p><em>boo!</em></p>\n' | |
24 | ||
25 | >>> markdowner = Markdown() | |
26 | >>> markdowner.convert("*boo!*") | |
27 | u'<p><em>boo!</em></p>\n' | |
28 | >>> markdowner.convert("**boom!**") | |
29 | u'<p><strong>boom!</strong></p>\n' | |
30 | ||
31 | This implementation of Markdown implements the full "core" syntax plus a | |
32 | number of extras (e.g., code syntax coloring, footnotes) as described on | |
33 | <http://code.google.com/p/python-markdown2/wiki/Extras>. | |
34 | """ | |
35 | ||
36 | cmdln_desc = """A fast and complete Python implementation of Markdown, a | |
37 | text-to-HTML conversion tool for web writers. | |
38 | """ | |
39 | ||
40 | # Dev Notes: | |
41 | # - There is already a Python markdown processor | |
42 | # (http://www.freewisdom.org/projects/python-markdown/). | |
43 | # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm | |
44 | # not yet sure if there are implications with this. Compare 'pydoc sre' | |
45 | # and 'perldoc perlre'. | |
46 | ||
47 | __version_info__ = (1, 0, 1, 14) # first three nums match Markdown.pl | |
48 | __version__ = '1.0.1.14' | |
49 | __author__ = "Trent Mick" | |
50 | ||
51 | import os | |
52 | import sys | |
53 | from pprint import pprint | |
54 | import re | |
55 | import logging | |
56 | try: | |
57 | from hashlib import md5 | |
58 | except ImportError: | |
59 | from md5 import md5 | |
60 | import optparse | |
61 | from random import random | |
62 | import codecs | |
63 | ||
64 | ||
65 | ||
66 | #---- Python version compat | |
67 | ||
68 | if sys.version_info[:2] < (2,4): | |
69 | from sets import Set as set | |
70 | def reversed(sequence): | |
71 | for i in sequence[::-1]: | |
72 | yield i | |
73 | def _unicode_decode(s, encoding, errors='xmlcharrefreplace'): | |
74 | return unicode(s, encoding, errors) | |
75 | else: | |
76 | def _unicode_decode(s, encoding, errors='strict'): | |
77 | return s.decode(encoding, errors) | |
78 | ||
79 | ||
80 | #---- globals | |
81 | ||
82 | DEBUG = False | |
83 | log = logging.getLogger("markdown") | |
84 | ||
85 | DEFAULT_TAB_WIDTH = 4 | |
86 | ||
87 | # Table of hash values for escaped characters: | |
88 | def _escape_hash(s): | |
89 | # Lame attempt to avoid possible collision with someone actually | |
90 | # using the MD5 hexdigest of one of these chars in there text. | |
91 | # Other ideas: random.random(), uuid.uuid() | |
92 | #return md5(s).hexdigest() # Markdown.pl effectively does this. | |
93 | return 'md5-'+md5(s).hexdigest() | |
94 | g_escape_table = dict([(ch, _escape_hash(ch)) | |
95 | for ch in '\\`*_{}[]()>#+-.!']) | |
96 | ||
97 | ||
98 | ||
99 | #---- exceptions | |
100 | ||
class MarkdownError(Exception):
    """Exception raised by this module (e.g. for a bad `safe_mode` value)."""
    pass
103 | ||
104 | ||
105 | ||
106 | #---- public api | |
107 | ||
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Convert the Markdown file at *path* to HTML and return it.

    `encoding` is used to decode the file's contents; all other options
    are passed straight through to the `Markdown` constructor.
    """
    # try/finally (rather than a `with` block) keeps compatibility with
    # the old Pythons this module supports while still guaranteeing the
    # file handle is closed; previously it was left to the garbage
    # collector.
    fp = codecs.open(path, 'r', encoding)
    try:
        text = fp.read()
    finally:
        fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
117 | ||
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """Convert the given Markdown *text* to HTML and return it.

    All options are forwarded unchanged to the `Markdown` class; see
    its constructor for their meaning.
    """
    converter = Markdown(html4tags=html4tags, tab_width=tab_width,
                         safe_mode=safe_mode, extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
125 | ||
126 | class Markdown(object): | |
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-conversion state, (re)initialized in reset():
    urls = None          # link id -> url (from stripped link definitions)
    titles = None        # link id -> title (from stripped link definitions)
    html_blocks = None   # hash key -> raw block-level HTML
    html_spans = None    # hash key -> sanitized span-level HTML (safe_mode)
    html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    # Matches lines holding only spaces/tabs; such lines are blanked
    # early so later regexes can match blank runs with just /\n+/.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
146 | ||
147 | def __init__(self, html4tags=False, tab_width=4, safe_mode=None, | |
148 | extras=None, link_patterns=None, use_file_vars=False): | |
149 | if html4tags: | |
150 | self.empty_element_suffix = ">" | |
151 | else: | |
152 | self.empty_element_suffix = " />" | |
153 | self.tab_width = tab_width | |
154 | ||
155 | # For compatibility with earlier markdown2.py and with | |
156 | # markdown.py's safe_mode being a boolean, | |
157 | # safe_mode == True -> "replace" | |
158 | if safe_mode is True: | |
159 | self.safe_mode = "replace" | |
160 | else: | |
161 | self.safe_mode = safe_mode | |
162 | ||
163 | if self.extras is None: | |
164 | self.extras = {} | |
165 | elif not isinstance(self.extras, dict): | |
166 | self.extras = dict([(e, None) for e in self.extras]) | |
167 | if extras: | |
168 | if not isinstance(extras, dict): | |
169 | extras = dict([(e, None) for e in extras]) | |
170 | self.extras.update(extras) | |
171 | assert isinstance(self.extras, dict) | |
172 | self._instance_extras = self.extras.copy() | |
173 | self.link_patterns = link_patterns | |
174 | self.use_file_vars = use_file_vars | |
175 | self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M) | |
176 | ||
177 | def reset(self): | |
178 | self.urls = {} | |
179 | self.titles = {} | |
180 | self.html_blocks = {} | |
181 | self.html_spans = {} | |
182 | self.list_level = 0 | |
183 | self.extras = self._instance_extras.copy() | |
184 | if "footnotes" in self.extras: | |
185 | self.footnotes = {} | |
186 | self.footnote_ids = [] | |
187 | ||
    def convert(self, text):
        """Convert the given Markdown text; return the generated HTML.

        Accepts a unicode string or an (assumed UTF-8) byte string and
        always returns unicode with a single trailing newline.
        """
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints; a
            # "markdown-extras" variable adds to self.extras for this run.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)  # numeric args become ints,
                        except ValueError:
                            pass              # ...others stay strings
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        text += "\n"
        return text
262 | ||
    # Matches an emacs-style one-liner variables line, e.g.
    #   -*- mode: markdown; tab-width: 8 -*-
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #   PREFIX Local Variables: SUFFIX
    #   PREFIX mode: Tcl SUFFIX
    #   PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    # - the \1 backreference in <content> requires the captured prefix to
    #   recur immediately before "End:".
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
278 | ||
279 | def _get_emacs_vars(self, text): | |
280 | """Return a dictionary of emacs-style local variables. | |
281 | ||
282 | Parsing is done loosely according to this spec (and according to | |
283 | some in-practice deviations from this): | |
284 | http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables | |
285 | """ | |
286 | emacs_vars = {} | |
287 | SIZE = pow(2, 13) # 8kB | |
288 | ||
289 | # Search near the start for a '-*-'-style one-liner of variables. | |
290 | head = text[:SIZE] | |
291 | if "-*-" in head: | |
292 | match = self._emacs_oneliner_vars_pat.search(head) | |
293 | if match: | |
294 | emacs_vars_str = match.group(1) | |
295 | assert '\n' not in emacs_vars_str | |
296 | emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';') | |
297 | if s.strip()] | |
298 | if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]: | |
299 | # While not in the spec, this form is allowed by emacs: | |
300 | # -*- Tcl -*- | |
301 | # where the implied "variable" is "mode". This form | |
302 | # is only allowed if there are no other variables. | |
303 | emacs_vars["mode"] = emacs_var_strs[0].strip() | |
304 | else: | |
305 | for emacs_var_str in emacs_var_strs: | |
306 | try: | |
307 | variable, value = emacs_var_str.strip().split(':', 1) | |
308 | except ValueError: | |
309 | log.debug("emacs variables error: malformed -*- " | |
310 | "line: %r", emacs_var_str) | |
311 | continue | |
312 | # Lowercase the variable name because Emacs allows "Mode" | |
313 | # or "mode" or "MoDe", etc. | |
314 | emacs_vars[variable.lower()] = value.strip() | |
315 | ||
316 | tail = text[-SIZE:] | |
317 | if "Local Variables" in tail: | |
318 | match = self._emacs_local_vars_pat.search(tail) | |
319 | if match: | |
320 | prefix = match.group("prefix") | |
321 | suffix = match.group("suffix") | |
322 | lines = match.group("content").splitlines(0) | |
323 | #print "prefix=%r, suffix=%r, content=%r, lines: %s"\ | |
324 | # % (prefix, suffix, match.group("content"), lines) | |
325 | ||
326 | # Validate the Local Variables block: proper prefix and suffix | |
327 | # usage. | |
328 | for i, line in enumerate(lines): | |
329 | if not line.startswith(prefix): | |
330 | log.debug("emacs variables error: line '%s' " | |
331 | "does not use proper prefix '%s'" | |
332 | % (line, prefix)) | |
333 | return {} | |
334 | # Don't validate suffix on last line. Emacs doesn't care, | |
335 | # neither should we. | |
336 | if i != len(lines)-1 and not line.endswith(suffix): | |
337 | log.debug("emacs variables error: line '%s' " | |
338 | "does not use proper suffix '%s'" | |
339 | % (line, suffix)) | |
340 | return {} | |
341 | ||
342 | # Parse out one emacs var per line. | |
343 | continued_for = None | |
344 | for line in lines[:-1]: # no var on the last line ("PREFIX End:") | |
345 | if prefix: line = line[len(prefix):] # strip prefix | |
346 | if suffix: line = line[:-len(suffix)] # strip suffix | |
347 | line = line.strip() | |
348 | if continued_for: | |
349 | variable = continued_for | |
350 | if line.endswith('\\'): | |
351 | line = line[:-1].rstrip() | |
352 | else: | |
353 | continued_for = None | |
354 | emacs_vars[variable] += ' ' + line | |
355 | else: | |
356 | try: | |
357 | variable, value = line.split(':', 1) | |
358 | except ValueError: | |
359 | log.debug("local variables error: missing colon " | |
360 | "in local variables entry: '%s'" % line) | |
361 | continue | |
362 | # Do NOT lowercase the variable name, because Emacs only | |
363 | # allows "mode" (and not "Mode", "MoDe", etc.) in this block. | |
364 | value = value.strip() | |
365 | if value.endswith('\\'): | |
366 | value = value[:-1].rstrip() | |
367 | continued_for = variable | |
368 | else: | |
369 | continued_for = None | |
370 | emacs_vars[variable] = value | |
371 | ||
372 | # Unquote values. | |
373 | for var, val in emacs_vars.items(): | |
374 | if len(val) > 1 and (val.startswith('"') and val.endswith('"') | |
375 | or val.startswith('"') and val.endswith('"')): | |
376 | emacs_vars[var] = val[1:-1] | |
377 | ||
378 | return emacs_vars | |
379 | ||
380 | # Cribbed from a post by Bart Lateur: | |
381 | # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> | |
382 | _detab_re = re.compile(r'(.*?)\t', re.M) | |
383 | def _detab_sub(self, match): | |
384 | g1 = match.group(1) | |
385 | return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width)) | |
386 | def _detab(self, text): | |
387 | r"""Remove (leading?) tabs from a file. | |
388 | ||
389 | >>> m = Markdown() | |
390 | >>> m._detab("\tfoo") | |
391 | ' foo' | |
392 | >>> m._detab(" \tfoo") | |
393 | ' foo' | |
394 | >>> m._detab("\t foo") | |
395 | ' foo' | |
396 | >>> m._detab(" foo") | |
397 | ' foo' | |
398 | >>> m._detab(" foo\n\tbar\tblam") | |
399 | ' foo\n bar blam' | |
400 | """ | |
401 | if '\t' not in text: | |
402 | return text | |
403 | return self._detab_re.subn(self._detab_sub, text)[0] | |
404 | ||
    # Block-level tags, strict matching: the start tag must sit at the
    # left margin and the end tag must start its own line.
    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    # Same, minus "ins" and "del", with a liberal body: the end tag need
    # not start its own line.
    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)
432 | ||
433 | def _hash_html_block_sub(self, match, raw=False): | |
434 | html = match.group(1) | |
435 | if raw and self.safe_mode: | |
436 | html = self._sanitize_html(html) | |
437 | key = _hash_text(html) | |
438 | self.html_blocks[key] = html | |
439 | return "\n\n" + key + "\n\n" | |
440 | ||
441 | def _hash_html_blocks(self, text, raw=False): | |
442 | """Hashify HTML blocks | |
443 | ||
444 | We only want to do this for block-level HTML tags, such as headers, | |
445 | lists, and tables. That's because we still want to wrap <p>s around | |
446 | "paragraphs" that are wrapped in non-block-level tags, such as anchors, | |
447 | phrase emphasis, and spans. The list of tags we're looking for is | |
448 | hard-coded. | |
449 | ||
450 | @param raw {boolean} indicates if these are raw HTML blocks in | |
451 | the original source. It makes a difference in "safe" mode. | |
452 | """ | |
453 | if '<' not in text: | |
454 | return text | |
455 | ||
456 | # Pass `raw` value into our calls to self._hash_html_block_sub. | |
457 | hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw) | |
458 | ||
459 | # First, look for nested blocks, e.g.: | |
460 | # <div> | |
461 | # <div> | |
462 | # tags for inner block must be indented. | |
463 | # </div> | |
464 | # </div> | |
465 | # | |
466 | # The outermost tags must start at the left margin for this to match, and | |
467 | # the inner nested divs must be indented. | |
468 | # We need to do this before the next, more liberal match, because the next | |
469 | # match will start at the first `<div>` and stop at the first `</div>`. | |
470 | text = self._strict_tag_block_re.sub(hash_html_block_sub, text) | |
471 | ||
472 | # Now match more liberally, simply from `\n<tag>` to `</tag>\n` | |
473 | text = self._liberal_tag_block_re.sub(hash_html_block_sub, text) | |
474 | ||
475 | # Special case just for <hr />. It was easier to make a special | |
476 | # case than to make the other regex more complicated. | |
477 | if "<hr" in text: | |
478 | _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width) | |
479 | text = _hr_tag_re.sub(hash_html_block_sub, text) | |
480 | ||
481 | # Special case for standalone HTML comments: | |
482 | if "<!--" in text: | |
483 | start = 0 | |
484 | while True: | |
485 | # Delimiters for next comment block. | |
486 | try: | |
487 | start_idx = text.index("<!--", start) | |
488 | except ValueError, ex: | |
489 | break | |
490 | try: | |
491 | end_idx = text.index("-->", start_idx) + 3 | |
492 | except ValueError, ex: | |
493 | break | |
494 | ||
495 | # Start position for next comment block search. | |
496 | start = end_idx | |
497 | ||
498 | # Validate whitespace before comment. | |
499 | if start_idx: | |
500 | # - Up to `tab_width - 1` spaces before start_idx. | |
501 | for i in range(self.tab_width - 1): | |
502 | if text[start_idx - 1] != ' ': | |
503 | break | |
504 | start_idx -= 1 | |
505 | if start_idx == 0: | |
506 | break | |
507 | # - Must be preceded by 2 newlines or hit the start of | |
508 | # the document. | |
509 | if start_idx == 0: | |
510 | pass | |
511 | elif start_idx == 1 and text[0] == '\n': | |
512 | start_idx = 0 # to match minute detail of Markdown.pl regex | |
513 | elif text[start_idx-2:start_idx] == '\n\n': | |
514 | pass | |
515 | else: | |
516 | break | |
517 | ||
518 | # Validate whitespace after comment. | |
519 | # - Any number of spaces and tabs. | |
520 | while end_idx < len(text): | |
521 | if text[end_idx] not in ' \t': | |
522 | break | |
523 | end_idx += 1 | |
524 | # - Must be following by 2 newlines or hit end of text. | |
525 | if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'): | |
526 | continue | |
527 | ||
528 | # Escape and hash (must match `_hash_html_block_sub`). | |
529 | html = text[start_idx:end_idx] | |
530 | if raw and self.safe_mode: | |
531 | html = self._sanitize_html(html) | |
532 | key = _hash_text(html) | |
533 | self.html_blocks[key] = html | |
534 | text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:] | |
535 | ||
536 | if "xml" in self.extras: | |
537 | # Treat XML processing instructions and namespaced one-liner | |
538 | # tags as if they were block HTML tags. E.g., if standalone | |
539 | # (i.e. are their own paragraph), the following do not get | |
540 | # wrapped in a <p> tag: | |
541 | # <?foo bar?> | |
542 | # | |
543 | # <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/> | |
544 | _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width) | |
545 | text = _xml_oneliner_re.sub(hash_html_block_sub, text) | |
546 | ||
547 | return text | |
548 | ||
    def _strip_link_definitions(self, text):
        # Strips link definitions from text, storing the URLs and titles
        # in self.urls and self.titles (keyed later by lowercased id).
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?                  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)
575 | ||
576 | def _extract_link_def_sub(self, match): | |
577 | id, url, title = match.groups() | |
578 | key = id.lower() # Link IDs are case-insensitive | |
579 | self.urls[key] = self._encode_amps_and_angles(url) | |
580 | if title: | |
581 | self.titles[key] = title.replace('"', '"') | |
582 | return "" | |
583 | ||
584 | def _extract_footnote_def_sub(self, match): | |
585 | id, text = match.groups() | |
586 | text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() | |
587 | normed_id = re.sub(r'\W', '-', id) | |
588 | # Ensure footnote text ends with a couple newlines (for some | |
589 | # block gamut matches). | |
590 | self.footnotes[normed_id] = text + "\n\n" | |
591 | return "" | |
592 | ||
    def _strip_footnote_definitions(self, text):
        """Strip footnote definitions from `text`, storing each in
        self.footnotes via _extract_footnote_def_sub().

        A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)    # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)
625 | ||
626 | ||
    # Horizontal rules: a line of three or more *, - or _ characters,
    # each optionally separated by spaces, with at most two leading
    # spaces and optional trailing whitespace.
    _hr_res = [
        re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M),
    ]
632 | ||
    def _run_block_gamut(self, text):
        """Run all block-level transformations, in the required order.

        These are all the transformations that form block-level
        tags like paragraphs, headers, and list items.
        """
        text = self._do_headers(text)

        # Do Horizontal Rules:
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        for hr_re in self._hr_res:
            text = hr_re.sub(hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text
662 | ||
663 | def _pyshell_block_sub(self, match): | |
664 | lines = match.group(0).splitlines(0) | |
665 | _dedentlines(lines) | |
666 | indent = ' ' * self.tab_width | |
667 | s = ('\n' # separate from possible cuddled paragraph | |
668 | + indent + ('\n'+indent).join(lines) | |
669 | + '\n\n') | |
670 | return s | |
671 | ||
672 | def _prepare_pyshell_blocks(self, text): | |
673 | """Ensure that Python interactive shell sessions are put in | |
674 | code blocks -- even if not properly indented. | |
675 | """ | |
676 | if ">>>" not in text: | |
677 | return text | |
678 | ||
679 | less_than_tab = self.tab_width - 1 | |
680 | _pyshell_block_re = re.compile(r""" | |
681 | ^([ ]{0,%d})>>>[ ].*\n # first line | |
682 | ^(\1.*\S+.*\n)* # any number of subsequent lines | |
683 | ^\n # ends with a blank line | |
684 | """ % less_than_tab, re.M | re.X) | |
685 | ||
686 | return _pyshell_block_re.sub(self._pyshell_block_sub, text) | |
687 | ||
    def _run_span_gamut(self, text):
        """Run all span-level transformations, in the required order.

        These are all the transformations that occur *within* block-level
        tags like paragraphs, headers, and list items.
        """
        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        text = self._do_italics_and_bold(text)

        # Do hard breaks: two or more trailing spaces force a line break.
        text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text
715 | ||
716 | # "Sorta" because auto-links are identified as "tag" tokens. | |
717 | _sorta_html_tokenize_re = re.compile(r""" | |
718 | ( | |
719 | # tag | |
720 | </? | |
721 | (?:\w+) # tag name | |
722 | (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes | |
723 | \s*/?> | |
724 | | | |
725 | # auto-link (e.g., <http://www.activestate.com/>) | |
726 | <\w+[^>]*> | |
727 | | | |
728 | <!--.*?--> # comment | |
729 | | | |
730 | <\?.*?\?> # processing instruction | |
731 | ) | |
732 | """, re.X) | |
733 | ||
734 | def _escape_special_chars(self, text): | |
735 | # Python markdown note: the HTML tokenization here differs from | |
736 | # that in Markdown.pl, hence the behaviour for subtle cases can | |
737 | # differ (I believe the tokenizer here does a better job because | |
738 | # it isn't susceptible to unmatched '<' and '>' in HTML tags). | |
739 | # Note, however, that '>' is not allowed in an auto-link URL | |
740 | # here. | |
741 | escaped = [] | |
742 | is_html_markup = False | |
743 | for token in self._sorta_html_tokenize_re.split(text): | |
744 | if is_html_markup: | |
745 | # Within tags/HTML-comments/auto-links, encode * and _ | |
746 | # so they don't conflict with their use in Markdown for | |
747 | # italics and strong. We're replacing each such | |
748 | # character with its corresponding MD5 checksum value; | |
749 | # this is likely overkill, but it should prevent us from | |
750 | # colliding with the escape values by accident. | |
751 | escaped.append(token.replace('*', g_escape_table['*']) | |
752 | .replace('_', g_escape_table['_'])) | |
753 | else: | |
754 | escaped.append(self._encode_backslash_escapes(token)) | |
755 | is_html_markup = not is_html_markup | |
756 | return ''.join(escaped) | |
757 | ||
758 | def _hash_html_spans(self, text): | |
759 | # Used for safe_mode. | |
760 | ||
761 | def _is_auto_link(s): | |
762 | if ':' in s and self._auto_link_re.match(s): | |
763 | return True | |
764 | elif '@' in s and self._auto_email_link_re.match(s): | |
765 | return True | |
766 | return False | |
767 | ||
768 | tokens = [] | |
769 | is_html_markup = False | |
770 | for token in self._sorta_html_tokenize_re.split(text): | |
771 | if is_html_markup and not _is_auto_link(token): | |
772 | sanitized = self._sanitize_html(token) | |
773 | key = _hash_text(sanitized) | |
774 | self.html_spans[key] = sanitized | |
775 | tokens.append(key) | |
776 | else: | |
777 | tokens.append(token) | |
778 | is_html_markup = not is_html_markup | |
779 | return ''.join(tokens) | |
780 | ||
781 | def _unhash_html_spans(self, text): | |
782 | for key, sanitized in self.html_spans.items(): | |
783 | text = text.replace(key, sanitized) | |
784 | return text | |
785 | ||
786 | def _sanitize_html(self, s): | |
787 | if self.safe_mode == "replace": | |
788 | return self.html_removed_text | |
789 | elif self.safe_mode == "escape": | |
790 | replacements = [ | |
791 | ('&', '&'), | |
792 | ('<', '<'), | |
793 | ('>', '>'), | |
794 | ] | |
795 | for before, after in replacements: | |
796 | s = s.replace(before, after) | |
797 | return s | |
798 | else: | |
799 | raise MarkdownError("invalid value for 'safe_mode': %r (must be " | |
800 | "'escape' or 'replace')" % self.safe_mode) | |
801 | ||
    # Matches the "(...)" tail of an inline link, after the "[text]"
    # part has already been consumed by _do_links().
    _tail_of_inline_link_re = re.compile(r'''
        # Match tail of: [text](/url/) or [text](/url/ "title")
        \(            # literal paren
          [ \t]*
          (?P<url>            # \1
            <.*?>
            |
            .*?
          )
          [ \t]*
          (                   # \2
            (['"])            # quote char = \3
            (?P<title>.*?)
            \3                # matching quote
          )?                  # title is optional
        \)
        ''', re.X | re.S)
    # Matches the "[id]" tail of a reference-style link.
    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]?          # one optional space
        (?:\n[ ]*)?   # one optional newline followed by spaces
        \[
            (?P<id>.*?)
        \]
        ''', re.X | re.S)
827 | ||
828 | def _do_links(self, text): | |
829 | """Turn Markdown link shortcuts into XHTML <a> and <img> tags. | |
830 | ||
831 | This is a combination of Markdown.pl's _DoAnchors() and | |
832 | _DoImages(). They are done together because that simplified the | |
833 | approach. It was necessary to use a different approach than | |
834 | Markdown.pl because of the lack of atomic matching support in | |
835 | Python's regex engine used in $g_nested_brackets. | |
836 | """ | |
837 | MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 | |
838 | ||
839 | # `anchor_allowed_pos` is used to support img links inside | |
840 | # anchors, but not anchors inside anchors. An anchor's start | |
841 | # pos must be `>= anchor_allowed_pos`. | |
842 | anchor_allowed_pos = 0 | |
843 | ||
844 | curr_pos = 0 | |
845 | while True: # Handle the next link. | |
846 | # The next '[' is the start of: | |
847 | # - an inline anchor: [text](url "title") | |
848 | # - a reference anchor: [text][id] | |
849 | # - an inline img: ![text](url "title") | |
850 | # - a reference img: ![text][id] | |
851 | # - a footnote ref: [^id] | |
852 | # (Only if 'footnotes' extra enabled) | |
853 | # - a footnote defn: [^id]: ... | |
854 | # (Only if 'footnotes' extra enabled) These have already | |
855 | # been stripped in _strip_footnote_definitions() so no | |
856 | # need to watch for them. | |
857 | # - a link definition: [id]: url "title" | |
858 | # These have already been stripped in | |
859 | # _strip_link_definitions() so no need to watch for them. | |
860 | # - not markup: [...anything else... | |
861 | try: | |
862 | start_idx = text.index('[', curr_pos) | |
863 | except ValueError: | |
864 | break | |
865 | text_length = len(text) | |
866 | ||
867 | # Find the matching closing ']'. | |
868 | # Markdown.pl allows *matching* brackets in link text so we | |
869 | # will here too. Markdown.pl *doesn't* currently allow | |
870 | # matching brackets in img alt text -- we'll differ in that | |
871 | # regard. | |
872 | bracket_depth = 0 | |
873 | for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, | |
874 | text_length)): | |
875 | ch = text[p] | |
876 | if ch == ']': | |
877 | bracket_depth -= 1 | |
878 | if bracket_depth < 0: | |
879 | break | |
880 | elif ch == '[': | |
881 | bracket_depth += 1 | |
882 | else: | |
883 | # Closing bracket not found within sentinel length. | |
884 | # This isn't markup. | |
885 | curr_pos = start_idx + 1 | |
886 | continue | |
887 | link_text = text[start_idx+1:p] | |
888 | ||
889 | # Possibly a footnote ref? | |
890 | if "footnotes" in self.extras and link_text.startswith("^"): | |
891 | normed_id = re.sub(r'\W', '-', link_text[1:]) | |
892 | if normed_id in self.footnotes: | |
893 | self.footnote_ids.append(normed_id) | |
894 | result = '<sup class="footnote-ref" id="fnref-%s">' \ | |
895 | '<a href="#fn-%s">%s</a></sup>' \ | |
896 | % (normed_id, normed_id, len(self.footnote_ids)) | |
897 | text = text[:start_idx] + result + text[p+1:] | |
898 | else: | |
899 | # This id isn't defined, leave the markup alone. | |
900 | curr_pos = p+1 | |
901 | continue | |
902 | ||
903 | # Now determine what this is by the remainder. | |
904 | p += 1 | |
905 | if p == text_length: | |
906 | return text | |
907 | ||
908 | # Inline anchor or img? | |
909 | if text[p] == '(': # attempt at perf improvement | |
910 | match = self._tail_of_inline_link_re.match(text, p) | |
911 | if match: | |
912 | # Handle an inline anchor or img. | |
913 | is_img = start_idx > 0 and text[start_idx-1] == "!" | |
914 | if is_img: | |
915 | start_idx -= 1 | |
916 | ||
917 | url, title = match.group("url"), match.group("title") | |
918 | if url and url[0] == '<': | |
919 | url = url[1:-1] # '<url>' -> 'url' | |
920 | # We've got to encode these to avoid conflicting | |
921 | # with italics/bold. | |
922 | url = url.replace('*', g_escape_table['*']) \ | |
923 | .replace('_', g_escape_table['_']) | |
924 | if title: | |
925 | title_str = ' title="%s"' \ | |
926 | % title.replace('*', g_escape_table['*']) \ | |
927 | .replace('_', g_escape_table['_']) \ | |
928 | .replace('"', '"') | |
929 | else: | |
930 | title_str = '' | |
931 | if is_img: | |
932 | result = '<img src="%s" alt="%s"%s%s' \ | |
933 | % (url, link_text.replace('"', '"'), | |
934 | title_str, self.empty_element_suffix) | |
935 | curr_pos = start_idx + len(result) | |
936 | text = text[:start_idx] + result + text[match.end():] | |
937 | elif start_idx >= anchor_allowed_pos: | |
938 | result_head = '<a href="%s"%s>' % (url, title_str) | |
939 | result = '%s%s</a>' % (result_head, link_text) | |
940 | # <img> allowed from curr_pos on, <a> from | |
941 | # anchor_allowed_pos on. | |
942 | curr_pos = start_idx + len(result_head) | |
943 | anchor_allowed_pos = start_idx + len(result) | |
944 | text = text[:start_idx] + result + text[match.end():] | |
945 | else: | |
946 | # Anchor not allowed here. | |
947 | curr_pos = start_idx + 1 | |
948 | continue | |
949 | ||
950 | # Reference anchor or img? | |
951 | else: | |
952 | match = self._tail_of_reference_link_re.match(text, p) | |
953 | if match: | |
954 | # Handle a reference-style anchor or img. | |
955 | is_img = start_idx > 0 and text[start_idx-1] == "!" | |
956 | if is_img: | |
957 | start_idx -= 1 | |
958 | link_id = match.group("id").lower() | |
959 | if not link_id: | |
960 | link_id = link_text.lower() # for links like [this][] | |
961 | if link_id in self.urls: | |
962 | url = self.urls[link_id] | |
963 | # We've got to encode these to avoid conflicting | |
964 | # with italics/bold. | |
965 | url = url.replace('*', g_escape_table['*']) \ | |
966 | .replace('_', g_escape_table['_']) | |
967 | title = self.titles.get(link_id) | |
968 | if title: | |
969 | title = title.replace('*', g_escape_table['*']) \ | |
970 | .replace('_', g_escape_table['_']) | |
971 | title_str = ' title="%s"' % title | |
972 | else: | |
973 | title_str = '' | |
974 | if is_img: | |
975 | result = '<img src="%s" alt="%s"%s%s' \ | |
976 | % (url, link_text.replace('"', '"'), | |
977 | title_str, self.empty_element_suffix) | |
978 | curr_pos = start_idx + len(result) | |
979 | text = text[:start_idx] + result + text[match.end():] | |
980 | elif start_idx >= anchor_allowed_pos: | |
981 | result = '<a href="%s"%s>%s</a>' \ | |
982 | % (url, title_str, link_text) | |
983 | result_head = '<a href="%s"%s>' % (url, title_str) | |
984 | result = '%s%s</a>' % (result_head, link_text) | |
985 | # <img> allowed from curr_pos on, <a> from | |
986 | # anchor_allowed_pos on. | |
987 | curr_pos = start_idx + len(result_head) | |
988 | anchor_allowed_pos = start_idx + len(result) | |
989 | text = text[:start_idx] + result + text[match.end():] | |
990 | else: | |
991 | # Anchor not allowed here. | |
992 | curr_pos = start_idx + 1 | |
993 | else: | |
994 | # This id isn't defined, leave the markup alone. | |
995 | curr_pos = match.end() | |
996 | continue | |
997 | ||
998 | # Otherwise, it isn't markup. | |
999 | curr_pos = start_idx + 1 | |
1000 | ||
1001 | return text | |
1002 | ||
1003 | ||
1004 | _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M) | |
1005 | def _setext_h_sub(self, match): | |
1006 | n = {"=": 1, "-": 2}[match.group(2)[0]] | |
1007 | demote_headers = self.extras.get("demote-headers") | |
1008 | if demote_headers: | |
1009 | n = min(n + demote_headers, 6) | |
1010 | return "<h%d>%s</h%d>\n\n" \ | |
1011 | % (n, self._run_span_gamut(match.group(1)), n) | |
1012 | ||
1013 | _atx_h_re = re.compile(r''' | |
1014 | ^(\#{1,6}) # \1 = string of #'s | |
1015 | [ \t]* | |
1016 | (.+?) # \2 = Header text | |
1017 | [ \t]* | |
1018 | (?<!\\) # ensure not an escaped trailing '#' | |
1019 | \#* # optional closing #'s (not counted) | |
1020 | \n+ | |
1021 | ''', re.X | re.M) | |
1022 | def _atx_h_sub(self, match): | |
1023 | n = len(match.group(1)) | |
1024 | demote_headers = self.extras.get("demote-headers") | |
1025 | if demote_headers: | |
1026 | n = min(n + demote_headers, 6) | |
1027 | return "<h%d>%s</h%d>\n\n" \ | |
1028 | % (n, self._run_span_gamut(match.group(2)), n) | |
1029 | ||
1030 | def _do_headers(self, text): | |
1031 | # Setext-style headers: | |
1032 | # Header 1 | |
1033 | # ======== | |
1034 | # | |
1035 | # Header 2 | |
1036 | # -------- | |
1037 | text = self._setext_h_re.sub(self._setext_h_sub, text) | |
1038 | ||
1039 | # atx-style headers: | |
1040 | # # Header 1 | |
1041 | # ## Header 2 | |
1042 | # ## Header 2 with closing hashes ## | |
1043 | # ... | |
1044 | # ###### Header 6 | |
1045 | text = self._atx_h_re.sub(self._atx_h_sub, text) | |
1046 | ||
1047 | return text | |
1048 | ||
1049 | ||
    # List-marker regex fragments: '*', '+' or '-' bullets start an
    # unordered list; "<digits>." starts an ordered list.
    _marker_ul_chars  = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'
1054 | ||
1055 | def _list_sub(self, match): | |
1056 | lst = match.group(1) | |
1057 | lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol" | |
1058 | result = self._process_list_items(lst) | |
1059 | if self.list_level: | |
1060 | return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type) | |
1061 | else: | |
1062 | return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type) | |
1063 | ||
    def _do_lists(self, text):
        """Form HTML ordered (numbered) and unordered (bulleted) lists."""
        # Form HTML ordered (numbered) and unordered (bulleted) lists.

        # Try unordered markers first, then ordered ones.
        for marker_pat in (self._marker_ul, self._marker_ol):
            # Re-usable pattern to match any entire ul or ol list:
            less_than_tab = self.tab_width - 1
            whole_list = r'''
                (                   # \1 = whole list
                  (                 # \2
                    [ ]{0,%d}
                    (%s)            # \3 = first list item marker
                    [ \t]+
                  )
                  (?:.+?)
                  (                 # \4
                      \Z
                    |
                      \n{2,}
                      (?=\S)
                      (?!           # Negative lookahead for another list item marker
                        [ \t]*
                        %s[ \t]+
                      )
                  )
                )
            ''' % (less_than_tab, marker_pat, marker_pat)

            # We use a different prefix before nested lists than top-level lists.
            # See extended comment in _process_list_items().
            #
            # Note: There's a bit of duplication here. My original implementation
            # created a scalar regex pattern as the conditional result of the test on
            # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
            # substitution once, using the scalar as the pattern. This worked,
            # everywhere except when running under MT on my hosting account at Pair
            # Networks. There, this caused all rebuilds to be killed by the reaper (or
            # perhaps they crashed, but that seems incredibly unlikely given that the
            # same script on the same server ran fine *except* under MT. I've spent
            # more time trying to figure out why this is happening than I'd like to
            # admit. My only guess, backed up by the fact that this workaround works,
            # is that Perl optimizes the substition when it can figure out that the
            # pattern will never change, and when this optimization isn't on, we run
            # afoul of the reaper. Thus, the slightly redundant code to that uses two
            # static s/// patterns rather than one conditional pattern.

            if self.list_level:
                # Inside a list already: a list may start at any line.
                sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                text = sub_list_re.sub(self._list_sub, text)
            else:
                # Top level: a list must be preceded by a blank line (or
                # start the document).
                list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                     re.X | re.M | re.S)
                text = list_re.sub(self._list_sub, text)

        return text
1118 | ||
    # Matches a single list item within a list block. The leading
    # whitespace (group \2) is re-used in the lookahead so that only a
    # sibling item at the same indent (or end of input) terminates the
    # current item.
    _list_item_re = re.compile(r'''
        (\n)?               # leading line = \1
        (^[ \t]*)           # leading whitespace = \2
        (%s) [ \t]+         # list marker = \3
        ((?:.+?)            # list item text = \4
         (\n{1,2}))         # eols = \5
        (?= \n* (\Z | \2 (%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)
1128 | ||
1129 | _last_li_endswith_two_eols = False | |
1130 | def _list_item_sub(self, match): | |
1131 | item = match.group(4) | |
1132 | leading_line = match.group(1) | |
1133 | leading_space = match.group(2) | |
1134 | if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: | |
1135 | item = self._run_block_gamut(self._outdent(item)) | |
1136 | else: | |
1137 | # Recursion for sub-lists: | |
1138 | item = self._do_lists(self._outdent(item)) | |
1139 | if item.endswith('\n'): | |
1140 | item = item[:-1] | |
1141 | item = self._run_span_gamut(item) | |
1142 | self._last_li_endswith_two_eols = (len(match.group(5)) == 2) | |
1143 | return "<li>%s</li>\n" % item | |
1144 | ||
1145 | def _process_list_items(self, list_str): | |
1146 | # Process the contents of a single ordered or unordered list, | |
1147 | # splitting it into individual list items. | |
1148 | ||
1149 | # The $g_list_level global keeps track of when we're inside a list. | |
1150 | # Each time we enter a list, we increment it; when we leave a list, | |
1151 | # we decrement. If it's zero, we're not in a list anymore. | |
1152 | # | |
1153 | # We do this because when we're not inside a list, we want to treat | |
1154 | # something like this: | |
1155 | # | |
1156 | # I recommend upgrading to version | |
1157 | # 8. Oops, now this line is treated | |
1158 | # as a sub-list. | |
1159 | # | |
1160 | # As a single paragraph, despite the fact that the second line starts | |
1161 | # with a digit-period-space sequence. | |
1162 | # | |
1163 | # Whereas when we're inside a list (or sub-list), that line will be | |
1164 | # treated as the start of a sub-list. What a kludge, huh? This is | |
1165 | # an aspect of Markdown's syntax that's hard to parse perfectly | |
1166 | # without resorting to mind-reading. Perhaps the solution is to | |
1167 | # change the syntax rules such that sub-lists must start with a | |
1168 | # starting cardinal number; e.g. "1." or "a.". | |
1169 | self.list_level += 1 | |
1170 | self._last_li_endswith_two_eols = False | |
1171 | list_str = list_str.rstrip('\n') + '\n' | |
1172 | list_str = self._list_item_re.sub(self._list_item_sub, list_str) | |
1173 | self.list_level -= 1 | |
1174 | return list_str | |
1175 | ||
1176 | def _get_pygments_lexer(self, lexer_name): | |
1177 | try: | |
1178 | from pygments import lexers, util | |
1179 | except ImportError: | |
1180 | return None | |
1181 | try: | |
1182 | return lexers.get_lexer_by_name(lexer_name) | |
1183 | except util.ClassNotFound: | |
1184 | return None | |
1185 | ||
    def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
        """Syntax-color `codeblock` (plain-text code) with Pygments.

        "lexer" is a Pygments lexer instance for the code's language.
        Remaining keyword args are passed to the HTML formatter.
        Returns HTML wrapped as <div class="codehilite"><pre><code>...
        (see the `wrap` override below).
        """
        import pygments
        import pygments.formatters

        # Subclass the stock HTML formatter to add a <code> wrapper
        # inside the <pre> that HtmlFormatter emits.
        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
            def _wrap_code(self, inner):
                """A function for use in a Pygments Formatter which
                wraps in <code> tags.
                """
                yield 0, "<code>"
                for tup in inner:
                    yield tup
                yield 0, "</code>"

            def wrap(self, source, outfile):
                """Return the source with a code, pre, and div."""
                return self._wrap_div(self._wrap_pre(self._wrap_code(source)))

        formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
        return pygments.highlight(codeblock, lexer, formatter)
1206 | ||
1207 | def _code_block_sub(self, match): | |
1208 | codeblock = match.group(1) | |
1209 | codeblock = self._outdent(codeblock) | |
1210 | codeblock = self._detab(codeblock) | |
1211 | codeblock = codeblock.lstrip('\n') # trim leading newlines | |
1212 | codeblock = codeblock.rstrip() # trim trailing whitespace | |
1213 | ||
1214 | if "code-color" in self.extras and codeblock.startswith(":::"): | |
1215 | lexer_name, rest = codeblock.split('\n', 1) | |
1216 | lexer_name = lexer_name[3:].strip() | |
1217 | lexer = self._get_pygments_lexer(lexer_name) | |
1218 | codeblock = rest.lstrip("\n") # Remove lexer declaration line. | |
1219 | if lexer: | |
1220 | formatter_opts = self.extras['code-color'] or {} | |
1221 | colored = self._color_with_pygments(codeblock, lexer, | |
1222 | **formatter_opts) | |
1223 | return "\n\n%s\n\n" % colored | |
1224 | ||
1225 | codeblock = self._encode_code(codeblock) | |
1226 | return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock | |
1227 | ||
1228 | def _do_code_blocks(self, text): | |
1229 | """Process Markdown `<pre><code>` blocks.""" | |
1230 | code_block_re = re.compile(r''' | |
1231 | (?:\n\n|\A) | |
1232 | ( # $1 = the code block -- one or more lines, starting with a space/tab | |
1233 | (?: | |
1234 | (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces | |
1235 | .*\n+ | |
1236 | )+ | |
1237 | ) | |
1238 | ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc | |
1239 | ''' % (self.tab_width, self.tab_width), | |
1240 | re.M | re.X) | |
1241 | ||
1242 | return code_block_re.sub(self._code_block_sub, text) | |
1243 | ||
1244 | ||
    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one backtick, or a run of them, the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)
1262 | ||
1263 | def _code_span_sub(self, match): | |
1264 | c = match.group(2).strip(" \t") | |
1265 | c = self._encode_code(c) | |
1266 | return "<code>%s</code>" % c | |
1267 | ||
1268 | def _do_code_spans(self, text): | |
1269 | # * Backtick quotes are used for <code></code> spans. | |
1270 | # | |
1271 | # * You can use multiple backticks as the delimiters if you want to | |
1272 | # include literal backticks in the code span. So, this input: | |
1273 | # | |
1274 | # Just type ``foo `bar` baz`` at the prompt. | |
1275 | # | |
1276 | # Will translate to: | |
1277 | # | |
1278 | # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> | |
1279 | # | |
1280 | # There's no arbitrary limit to the number of backticks you | |
1281 | # can use as delimters. If you need three consecutive backticks | |
1282 | # in your code, use four for delimiters, etc. | |
1283 | # | |
1284 | # * You can use spaces to get literal backticks at the edges: | |
1285 | # | |
1286 | # ... type `` `bar` `` ... | |
1287 | # | |
1288 | # Turns to: | |
1289 | # | |
1290 | # ... type <code>`bar`</code> ... | |
1291 | return self._code_span_re.sub(self._code_span_sub, text) | |
1292 | ||
1293 | def _encode_code(self, text): | |
1294 | """Encode/escape certain characters inside Markdown code runs. | |
1295 | The point is that in code, these characters are literals, | |
1296 | and lose their special Markdown meanings. | |
1297 | """ | |
1298 | replacements = [ | |
1299 | # Encode all ampersands; HTML entities are not | |
1300 | # entities within a Markdown code span. | |
1301 | ('&', '&'), | |
1302 | # Do the angle bracket song and dance: | |
1303 | ('<', '<'), | |
1304 | ('>', '>'), | |
1305 | # Now, escape characters that are magic in Markdown: | |
1306 | ('*', g_escape_table['*']), | |
1307 | ('_', g_escape_table['_']), | |
1308 | ('{', g_escape_table['{']), | |
1309 | ('}', g_escape_table['}']), | |
1310 | ('[', g_escape_table['[']), | |
1311 | (']', g_escape_table[']']), | |
1312 | ('\\', g_escape_table['\\']), | |
1313 | ] | |
1314 | for before, after in replacements: | |
1315 | text = text.replace(before, after) | |
1316 | return text | |
1317 | ||
1318 | _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) | |
1319 | _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) | |
1320 | _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) | |
1321 | _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) | |
1322 | def _do_italics_and_bold(self, text): | |
1323 | # <strong> must go first: | |
1324 | if "code-friendly" in self.extras: | |
1325 | text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text) | |
1326 | text = self._code_friendly_em_re.sub(r"<em>\1</em>", text) | |
1327 | else: | |
1328 | text = self._strong_re.sub(r"<strong>\2</strong>", text) | |
1329 | text = self._em_re.sub(r"<em>\2</em>", text) | |
1330 | return text | |
1331 | ||
1332 | ||
    # Matches one whole blockquote: runs of lines whose first line
    # starts with '>', including directly following non-blank lines and
    # any trailing blank lines.
    _block_quote_re = re.compile(r'''
        (                           # Wrap whole match in \1
          (
            ^[ \t]*>[ \t]?          # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
        ''', re.M | re.X)
    # Strips one leading '>' (plus one optional space/tab) per line.
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1344 | ||
1345 | _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) | |
1346 | def _dedent_two_spaces_sub(self, match): | |
1347 | return re.sub(r'(?m)^ ', '', match.group(1)) | |
1348 | ||
    def _block_quote_sub(self, match):
        """re.sub callback: render one matched blockquote (group 1) as a
        <blockquote> element, recursing for its block-level content.
        """
        bq = match.group(1)
        bq = self._bq_one_level_re.sub('', bq)  # trim one level of quoting
        bq = self._ws_only_line_re.sub('', bq)  # trim whitespace-only lines
        bq = self._run_block_gamut(bq)          # recurse

        # Indent the rendered HTML one space per line (cosmetic).
        bq = re.sub('(?m)^', ' ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1360 | ||
1361 | def _do_block_quotes(self, text): | |
1362 | if '>' not in text: | |
1363 | return text | |
1364 | return self._block_quote_re.sub(self._block_quote_sub, text) | |
1365 | ||
1366 | def _form_paragraphs(self, text): | |
1367 | # Strip leading and trailing lines: | |
1368 | text = text.strip('\n') | |
1369 | ||
1370 | # Wrap <p> tags. | |
1371 | grafs = re.split(r"\n{2,}", text) | |
1372 | for i, graf in enumerate(grafs): | |
1373 | if graf in self.html_blocks: | |
1374 | # Unhashify HTML blocks | |
1375 | grafs[i] = self.html_blocks[graf] | |
1376 | else: | |
1377 | # Wrap <p> tags. | |
1378 | graf = self._run_span_gamut(graf) | |
1379 | grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>" | |
1380 | ||
1381 | return "\n\n".join(grafs) | |
1382 | ||
    def _add_footnotes(self, text):
        """Append the collected footnote definitions as a footer.

        Emits a <div class="footnotes"> holding an <ol> of footnotes in
        first-reference order; each item carries a back-link to its
        reference anchor. Returns `text` unchanged when no footnotes
        were referenced.
        """
        if self.footnotes:
            footer = [
                '<div class="footnotes">',
                '<hr' + self.empty_element_suffix,
                '<ol>',
            ]
            for i, id in enumerate(self.footnote_ids):
                if i != 0:
                    footer.append('')  # blank line between <li> entries
                footer.append('<li id="fn-%s">' % id)
                footer.append(self._run_block_gamut(self.footnotes[id]))
                backlink = ('<a href="#fnref-%s" '
                            'class="footnoteBackLink" '
                            'title="Jump back to footnote %d in the text.">'
                            '↩</a>' % (id, i+1))
                if footer[-1].endswith("</p>"):
                    # Tuck the back-link inside the item's last <p>.
                    footer[-1] = footer[-1][:-len("</p>")] \
                        + ' ' + backlink + "</p>"
                else:
                    footer.append("\n<p>%s</p>" % backlink)
                footer.append('</li>')
            footer.append('</ol>')
            footer.append('</div>')
            return text + '\n\n' + '\n'.join(footer)
        else:
            return text
1410 | ||
    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    # A '&' that is not already the start of an HTML entity.
    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
    # A '<' not followed by chars that could open a tag/comment/decl.
    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
    # A '>' not preceded by a char that could end a tag.
    _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1416 | ||
1417 | def _encode_amps_and_angles(self, text): | |
1418 | # Smart processing for ampersands and angle brackets that need | |
1419 | # to be encoded. | |
1420 | text = self._ampersand_re.sub('&', text) | |
1421 | ||
1422 | # Encode naked <'s | |
1423 | text = self._naked_lt_re.sub('<', text) | |
1424 | ||
1425 | # Encode naked >'s | |
1426 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP | |
1427 | # Markdown) don't do this. | |
1428 | text = self._naked_gt_re.sub('>', text) | |
1429 | return text | |
1430 | ||
1431 | def _encode_backslash_escapes(self, text): | |
1432 | for ch, escape in g_escape_table.items(): | |
1433 | text = text.replace("\\"+ch, escape) | |
1434 | return text | |
1435 | ||
1436 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I) | |
1437 | def _auto_link_sub(self, match): | |
1438 | g1 = match.group(1) | |
1439 | return '<a href="%s">%s</a>' % (g1, g1) | |
1440 | ||
1441 | _auto_email_link_re = re.compile(r""" | |
1442 | < | |
1443 | (?:mailto:)? | |
1444 | ( | |
1445 | [-.\w]+ | |
1446 | \@ | |
1447 | [-\w]+(\.[-\w]+)*\.[a-z]+ | |
1448 | ) | |
1449 | > | |
1450 | """, re.I | re.X | re.U) | |
1451 | def _auto_email_link_sub(self, match): | |
1452 | return self._encode_email_address( | |
1453 | self._unescape_special_chars(match.group(1))) | |
1454 | ||
1455 | def _do_auto_links(self, text): | |
1456 | text = self._auto_link_re.sub(self._auto_link_sub, text) | |
1457 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text) | |
1458 | return text | |
1459 | ||
1460 | def _encode_email_address(self, addr): | |
1461 | # Input: an email address, e.g. "foo@example.com" | |
1462 | # | |
1463 | # Output: the email address as a mailto link, with each character | |
1464 | # of the address encoded as either a decimal or hex entity, in | |
1465 | # the hopes of foiling most address harvesting spam bots. E.g.: | |
1466 | # | |
1467 | # <a href="mailto:foo@e | |
1468 | # xample.com">foo | |
1469 | # @example.com</a> | |
1470 | # | |
1471 | # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk | |
1472 | # mailing list: <http://tinyurl.com/yu7ue> | |
1473 | chars = [_xml_encode_email_char_at_random(ch) | |
1474 | for ch in "mailto:" + addr] | |
1475 | # Strip the mailto: from the visible part. | |
1476 | addr = '<a href="%s">%s</a>' \ | |
1477 | % (''.join(chars), ''.join(chars[7:])) | |
1478 | return addr | |
1479 | ||
1480 | def _do_link_patterns(self, text): | |
1481 | """Caveat emptor: there isn't much guarding against link | |
1482 | patterns being formed inside other standard Markdown links, e.g. | |
1483 | inside a [link def][like this]. | |
1484 | ||
1485 | Dev Notes: *Could* consider prefixing regexes with a negative | |
1486 | lookbehind assertion to attempt to guard against this. | |
1487 | """ | |
1488 | link_from_hash = {} | |
1489 | for regex, repl in self.link_patterns: | |
1490 | replacements = [] | |
1491 | for match in regex.finditer(text): | |
1492 | if hasattr(repl, "__call__"): | |
1493 | href = repl(match) | |
1494 | else: | |
1495 | href = match.expand(repl) | |
1496 | replacements.append((match.span(), href)) | |
1497 | for (start, end), href in reversed(replacements): | |
1498 | escaped_href = ( | |
1499 | href.replace('"', '"') # b/c of attr quote | |
1500 | # To avoid markdown <em> and <strong>: | |
1501 | .replace('*', g_escape_table['*']) | |
1502 | .replace('_', g_escape_table['_'])) | |
1503 | link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) | |
1504 | hash = md5(link).hexdigest() | |
1505 | link_from_hash[hash] = link | |
1506 | text = text[:start] + hash + text[end:] | |
1507 | for hash, link in link_from_hash.items(): | |
1508 | text = text.replace(hash, link) | |
1509 | return text | |
1510 | ||
1511 | def _unescape_special_chars(self, text): | |
1512 | # Swap back in all the special characters we've hidden. | |
1513 | for ch, hash in g_escape_table.items(): | |
1514 | text = text.replace(hash, ch) | |
1515 | return text | |
1516 | ||
    def _outdent(self, text):
        """Remove one level of line-leading indent (a tab or a
        tab-width of spaces, per `self._outdent_re`)."""
        # Remove one level of line-leading tabs or spaces
        return self._outdent_re.sub('', text)
1520 | ||
1521 | ||
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Class-level default; presumably merged with/overridden by extras
    # passed to the constructor -- TODO confirm against Markdown base.
    extras = ["footnotes", "code-color"]
1535 | ||
1536 | ||
1537 | #---- internal support functions | |
1538 | ||
1539 | # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 | |
1540 | def _curry(*args, **kwargs): | |
1541 | function, args = args[0], args[1:] | |
1542 | def result(*rest, **kwrest): | |
1543 | combined = kwargs.copy() | |
1544 | combined.update(kwrest) | |
1545 | return function(*args + rest, **combined) | |
1546 | return result | |
1547 | ||
1548 | # Recipe: regex_from_encoded_pattern (1.0) | |
1549 | def _regex_from_encoded_pattern(s): | |
1550 | """'foo' -> re.compile(re.escape('foo')) | |
1551 | '/foo/' -> re.compile('foo') | |
1552 | '/foo/i' -> re.compile('foo', re.I) | |
1553 | """ | |
1554 | if s.startswith('/') and s.rfind('/') != 0: | |
1555 | # Parse it: /PATTERN/FLAGS | |
1556 | idx = s.rfind('/') | |
1557 | pattern, flags_str = s[1:idx], s[idx+1:] | |
1558 | flag_from_char = { | |
1559 | "i": re.IGNORECASE, | |
1560 | "l": re.LOCALE, | |
1561 | "s": re.DOTALL, | |
1562 | "m": re.MULTILINE, | |
1563 | "u": re.UNICODE, | |
1564 | } | |
1565 | flags = 0 | |
1566 | for char in flags_str: | |
1567 | try: | |
1568 | flags |= flag_from_char[char] | |
1569 | except KeyError: | |
1570 | raise ValueError("unsupported regex flag: '%s' in '%s' " | |
1571 | "(must be one of '%s')" | |
1572 | % (char, s, ''.join(flag_from_char.keys()))) | |
1573 | return re.compile(s[1:idx], flags) | |
1574 | else: # not an encoded regex | |
1575 | return re.compile(re.escape(s)) | |
1576 | ||
1577 | # Recipe: dedent (0.1.2) | |
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

    "lines" is a list of lines to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
        be skipped for calculating the indent width and for dedenting.
        This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line)
    indents = []
    margin = None
    # First pass: compute the margin -- the smallest indent width (with
    # tabs expanded) over all non-whitespace-only lines.
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print "dedent: indent=%d: %r" % (indent, line)
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print "dedent: margin=%r" % margin

    # Second pass: strip `margin` columns of leading whitespace from
    # each line, expanding tabs as needed.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print "dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin)
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab expanded past the margin: keep the excess
                    # columns as literal spaces.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
1647 | ||
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

    "text" is the text to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
        be skipped for calculating the indent width and for dedenting.
        This is sometimes useful for docstrings and similar.

    Like textwrap.dedent(s), but tabs are NOT expanded to spaces.
    """
    # Keep line endings (splitlines(True)) so the joined result is
    # byte-compatible with the input apart from the removed margin.
    line_list = text.splitlines(True)
    _dedentlines(line_list, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(line_list)
1662 | ||
1663 | ||
1664 | class _memoized(object): | |
1665 | """Decorator that caches a function's return value each time it is called. | |
1666 | If called later with the same arguments, the cached value is returned, and | |
1667 | not re-evaluated. | |
1668 | ||
1669 | http://wiki.python.org/moin/PythonDecoratorLibrary | |
1670 | """ | |
1671 | def __init__(self, func): | |
1672 | self.func = func | |
1673 | self.cache = {} | |
1674 | def __call__(self, *args): | |
1675 | try: | |
1676 | return self.cache[args] | |
1677 | except KeyError: | |
1678 | self.cache[args] = value = self.func(*args) | |
1679 | return value | |
1680 | except TypeError: | |
1681 | # uncachable -- for instance, passing a list as an argument. | |
1682 | # Better to not cache than to blow up entirely. | |
1683 | return self.func(*args) | |
1684 | def __repr__(self): | |
1685 | """Return the function's docstring.""" | |
1686 | return self.func.__doc__ | |
1687 | ||
1688 | ||
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex.

    Matches a one-liner XML processing instruction or namespaced
    single tag sitting on its own "paragraph" (preceded by a blank
    line or start of document, followed by a blank line or EOF),
    indented less than `tab_width` spaces.  Memoized on `tab_width`
    by the reassignment below.
    """
    # Build the verbose-mode pattern first, then compile; re.X means the
    # literal whitespace/comments inside the pattern are insignificant.
    pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)           # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(pattern, re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
1709 | ||
def _hr_tag_re_from_tab_width(tab_width):
    """Regex for a standalone <hr> tag on its own "paragraph".

    The tag must be preceded by a blank line or start of document,
    followed by a blank line or EOF, and indented less than
    `tab_width` spaces.  Memoized on `tab_width` by the reassignment
    below.
    """
    # re.X verbose mode: layout whitespace/comments in the pattern are
    # insignificant; the significant bits are the classes and escapes.
    hr_pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           # attributes, if any (no nested angle brackets)
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(hr_pattern, re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
1728 | ||
1729 | ||
1730 | def _xml_encode_email_char_at_random(ch): | |
1731 | r = random() | |
1732 | # Roughly 10% raw, 45% hex, 45% dec. | |
1733 | # '@' *must* be encoded. I [John Gruber] insist. | |
1734 | # Issue 26: '_' must be encoded. | |
1735 | if r > 0.9 and ch not in "@_": | |
1736 | return ch | |
1737 | elif r < 0.45: | |
1738 | # The [1:] is to drop leading '0': 0x63 -> x63 | |
1739 | return '&#%s;' % hex(ord(ch))[1:] | |
1740 | else: | |
1741 | return '&#%s;' % ord(ch) | |
1742 | ||
1743 | def _hash_text(text): | |
1744 | return 'md5:'+md5(text.encode("utf-8")).hexdigest() | |
1745 | ||
1746 | ||
1747 | #---- mainline | |
1748 | ||
1749 | class _NoReflowFormatter(optparse.IndentedHelpFormatter): | |
1750 | """An optparse formatter that does NOT reflow the description.""" | |
1751 | def format_description(self, description): | |
1752 | return description or "" | |
1753 | ||
1754 | def _test(): | |
1755 | import doctest | |
1756 | doctest.testmod() | |
1757 | ||
def main(argv=None):
    # Command-line driver: convert each Markdown file in PATHS to HTML on
    # stdout, optionally comparing the output against Markdown.pl.
    # NOTE(review): `argv` is accepted but parser.parse_args() below is
    # called with no arguments, so options are always read from sys.argv;
    # confirm whether parse_args(argv[1:]) was intended.
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
        help="Turn on specific extra features (not part of "
             "the core Markdown spec). Supported values: "
             "'code-friendly' disables _/__ for emphasis; "
             "'code-color' adds code-block syntax coloring; "
             "'link-patterns' adds auto-linking based on patterns; "
             "'footnotes' adds the footnotes syntax;"
             "'xml' passes one-liner processing instructions and namespaced XML tags;"
             "'pyshell' to put unindented Python interactive shell sessions in a <code> block.")
    # NOTE(review): no action= is given here, so optparse's default
    # "store" action makes --use-file-vars take a required value
    # argument; action="store_true" may have been intended (the
    # set_defaults() call below assumes a boolean use_file_vars=False).
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<http://code.google.com/p/python-markdown2/wiki/Extras>.")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    if opts.extras:
        # Each -x/--extras value may list several extras separated by any
        # of [,;: ]. A "name=arg" form carries an argument, coerced to int
        # when possible; a bare name maps to None.
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    if opts.link_patterns_file:
        # Load (compiled-regex, href-template) pairs, one per line; blank
        # lines and '#'-comment lines are skipped. Each line is
        # "<encoded-pattern> <href>" split on the LAST run of whitespace.
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    # Markdown.pl is expected in the sibling "test" directory (only used
    # with --compare).
    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    for path in paths:
        if opts.compare:
            print "==== Markdown.pl ===="
            perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
            o = os.popen(perl_cmd)
            perl_html = o.read()
            o.close()
            sys.stdout.write(perl_html)
            print "==== markdown2.py ===="
        html = markdown_path(path, encoding=opts.encoding,
                             html4tags=opts.html4tags,
                             safe_mode=opts.safe_mode,
                             extras=extras, link_patterns=link_patterns,
                             use_file_vars=opts.use_file_vars)
        # Encode for the console; characters the console encoding cannot
        # represent become &#...; character references.
        sys.stdout.write(
            html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if opts.compare:
            # Normalize both outputs with the test suite's helper when it
            # is available, so insignificant formatting differences don't
            # cause a spurious mismatch.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print "==== match? %r ====" % (norm_perl_html == norm_html)
1873 | ||
1874 | ||
if __name__ == "__main__":
    # Script entry point: exit status is main()'s return value.
    sys.exit( main(sys.argv) )
1877 |