]> git.ipfire.org Git - thirdparty/rsync.git/blame - md-convert
More tweaks for Actions.
[thirdparty/rsync.git] / md-convert
CommitLineData
27e88dec 1#!/usr/bin/env python3
53fae556 2
a2b630c0
WD
3# This script transforms markdown files into html and (optionally) nroff. The
4# output files are written into the current directory named for the input file
5# without the .md suffix and either the .html suffix or no suffix.
53fae556 6#
a2b630c0
WD
7# If the input .md file has a section number at the end of the name (e.g.,
8# rsync.1.md) a nroff file is also output (PROJ.NUM.md -> PROJ.NUM).
ec8a05f6 9#
a2b630c0
WD
10# The markdown input format has one extra extension: if a numbered list starts
11# at 0, it is turned into a description list. The dl's dt tag is taken from the
12# contents of the first tag inside the li, which is usually a p, code, or
13# strong tag.
14#
15# The cmarkgfm or commonmark lib is used to transform the input file into
16# html. Then, the html.parser is used as a state machine that lets us tweak
17# the html and (optionally) output nroff data based on the html tags.
18#
19# If the string @USE_GFM_PARSER@ exists in the file, the string is removed and
20# a github-flavored-markup parser is used to parse the file.
21#
22# The man-page .md files also get the vars @VERSION@, @BINDIR@, and @LIBDIR@
23# substituted. Some of these values depend on the Makefile $(prefix) (see the
24# generated Makefile). If the maintainer wants to build files for /usr/local
25# while creating release-ready man-page files for /usr, use the environment to
26# set RSYNC_OVERRIDE_PREFIX=/usr.
27
28# Copyright (C) 2020 - 2021 Wayne Davison
53fae556
WD
29#
30# This program is freely redistributable.
31
a2b630c0 32import os, sys, re, argparse, subprocess, time
53fae556
WD
33from html.parser import HTMLParser
34
d07272d6
WD
# NOTE(review): the leading whitespace inside the templates below is reproduced
# as it appears in a whitespace-collapsed listing -- confirm it against the
# pristine file before relying on the exact indentation of the CSS rules.

# Page names that a link may refer to (checked when validating <a href> values).
VALID_PAGES = 'README INSTALL COPYING rsync.1 rrsync.1 rsync-ssl.1 rsyncd.conf.5'.split()

# Closing one of these tags takes the accumulated text and resets it.
CONSUMES_TXT = set('h1 h2 h3 p li pre'.split())

# Start of every html output file; %TITLE% is replaced with the page title.
HTML_START = """\
<html><head>
<title>%TITLE%</title>
<meta charset="UTF-8"/>
<link href="https://fonts.googleapis.com/css2?family=Roboto&family=Roboto+Mono&display=swap" rel="stylesheet">
<style>
body {
 max-width: 50em;
 margin: auto;
}
body, b, strong, u {
 font-family: 'Roboto', sans-serif;
}
a.tgt { font-face: symbol; font-weight: 400; font-size: 70%; visibility: hidden; text-decoration: none; color: #ddd; padding: 0 4px; border: 0; }
a.tgt:after { content: '🔗'; }
a.tgt:hover { color: #444; background-color: #eaeaea; }
h1:hover > a.tgt, h2:hover > a.tgt, h3:hover > a.tgt, dt:hover > a.tgt { visibility: visible; }
code {
 font-family: 'Roboto Mono', monospace;
 font-weight: bold;
 white-space: pre;
}
pre code {
 display: block;
 font-weight: normal;
}
blockquote pre code {
 background: #f1f1f1;
}
dd p:first-of-type {
 margin-block-start: 0em;
}
</style>
</head><body>
"""

# Extra CSS that is spliced in before </style> when the html contains a table.
TABLE_STYLE = """\
table {
 border-color: grey;
 border-spacing: 0;
}
tr {
 border-top: 1px solid grey;
}
tr:nth-child(2n) {
 background-color: #f6f8fa;
}
th, td {
 border: 1px solid #dfe2e5;
 text-align: center;
 padding-left: 1em;
 padding-right: 1em;
}
"""

# Footer added to man-page html output only; the %s is filled with the date.
MAN_HTML_END = """\
<div style="float: right"><p><i>%s</i></p></div>
"""

# End of every html output file.
HTML_END = """\
</body></html>
"""

# Start of the nroff output; filled with the 5-item man_headings tuple
# (prog, section, date, version string, prefix).
MAN_START = r"""
.TH "%s" "%s" "%s" "%s" "User Commands"
.\" prefix=%s
""".lstrip()

# End of the nroff output (currently empty).
MAN_END = """\
"""
109
# Each pair is (marker character used in the accumulated text, nroff sequence
# that manify() substitutes for that marker).
NORM_FONT = ('\1', r"\fP")
BOLD_FONT = ('\2', r"\fB")
UNDR_FONT = ('\3', r"\fI")
NBR_DASH = ('\4', r"\-")
NBR_SPACE = ('\xa0', r"\ ")

# Splits an input path into fn, srcdir, name, prog, and (optional) sect groups.
FILENAME_RE = re.compile(r'^(?P<fn>(?P<srcdir>.+/)?(?P<name>(?P<prog>[^/]+?)(\.(?P<sect>\d+))?)\.md)$')
# A "var=value" line (used on the Makefile).
ASSIGNMENT_RE = re.compile(r'^(\w+)=(.+)')
# The version string and maintainer timezone offset defines in version.h.
VER_RE = re.compile(r'^#define\s+RSYNC_VERSION\s+"(\d.+?)"', re.M)
TZ_RE = re.compile(r'^#define\s+MAINTAINER_TZ_OFFSET\s+(-?\d+(\.\d+)?)', re.M)
# A ${var} reference inside a Makefile value.
VAR_REF_RE = re.compile(r'\$\{(\w+)\}')
# A version number inside a "NEWS for ..." heading.
VERSION_RE = re.compile(r' (\d[.\d]+)[, ]')
# Runs of the low control characters that include the font/dash markers above.
BIN_CHARS_RE = re.compile(r'[\1-\7]+')
SPACE_DOUBLE_DASH_RE = re.compile(r'\s--(\s)')
NON_SPACE_SINGLE_DASH_RE = re.compile(r'(^|\W)-')
WHITESPACE_RE = re.compile(r'\s')
# Text between a bold-font marker and the next "=" or normal-font marker.
CODE_BLOCK_RE = re.compile(r'[%s]([^=%s]+)[=%s]' % (BOLD_FONT[0], NORM_FONT[0], NORM_FONT[0]))
NBR_DASH_RE = re.compile(r'[%s]' % NBR_DASH[0])
# Characters that txt2target() replaces when building a link target.
INVALID_TARGET_CHARS_RE = re.compile(r'[^-A-Za-z0-9._]')
INVALID_START_CHAR_RE = re.compile(r'^([^A-Za-z0-9])')
# A line-leading ' or . which manify() prefixes with \& in nroff output.
MANIFY_LINESTART_RE = re.compile(r"^(['.])", flags=re.M)

# The markdown -> html function; assigned in the __main__ section.
md_parser = None
# Substitution values filled in by find_man_substitutions().
env_subs = { }

# Incremented by warn(); a non-zero count makes the script exit with status 1.
warning_count = 0
136
def main():
    """Convert each markdown file named on the command line.

    The file list and the --test flag are read from the module-level ``args``.
    In test mode a confirmation line is printed after all files are parsed.
    """
    md_paths = args.mdfiles
    for md_path in md_paths:
        parse_md_file(md_path)

    if not args.test:
        return
    print("The test was successful.")
143
144
def parse_md_file(mdfn):
    """Convert one markdown file into html and (for man pages) nroff output.

    mdfn must match FILENAME_RE (i.e. end in ".md"); otherwise the script
    dies.  A name with a trailing section number (e.g. rsync.1.md) is treated
    as a man page: the @VERSION@, @BINDIR@, and @LIBDIR@ vars are substituted
    and a nroff file is produced in addition to the html file.

    Output files are named after the input file (minus its directory and .md
    suffix) and are written into args.dest, or the current directory when
    args.dest is unset.  Nothing is written when args.test is set.
    """
    m = FILENAME_RE.match(mdfn)
    if not m:
        die('Failed to parse a md input file name:', mdfn)
    fi = argparse.Namespace(**m.groupdict())
    fi.want_manpage = bool(fi.sect)
    if fi.want_manpage:
        fi.title = fi.prog + '(' + fi.sect + ') manpage'
    else:
        fi.title = fi.prog + ' for rsync'

    if fi.want_manpage:
        # The substitutions are the same for every man page, so only compute them once.
        if not env_subs:
            find_man_substitutions()
        prog_ver = 'rsync ' + env_subs['VERSION']
        if fi.prog != 'rsync':
            prog_ver = fi.prog + ' from ' + prog_ver
        fi.man_headings = (fi.prog, fi.sect, env_subs['date'], prog_ver, env_subs['prefix'])

    with open(mdfn, 'r', encoding='utf-8') as fh:
        txt = fh.read()

    use_gfm_parser = '@USE_GFM_PARSER@' in txt
    if use_gfm_parser:
        txt = txt.replace('@USE_GFM_PARSER@', '')

    if fi.want_manpage:
        txt = (txt.replace('@VERSION@', env_subs['VERSION'])
                  .replace('@BINDIR@', env_subs['bindir'])
                  .replace('@LIBDIR@', env_subs['libdir']))

    if use_gfm_parser:
        if not gfm_parser:
            die('Input file requires cmarkgfm parser:', mdfn)
        fi.html_in = gfm_parser(txt)
    else:
        fi.html_in = md_parser(txt)
    txt = None

    # The constructor does all the work and stores html_out & man_out on fi.
    TransformHtml(fi)

    if args.test:
        return

    output_list = [ (fi.name + '.html', fi.html_out) ]
    if fi.want_manpage:
        output_list += [ (fi.name, fi.man_out) ]
    for fn, txt in output_list:
        if args.dest and args.dest != '.':
            fn = os.path.join(args.dest, fn)
        # Remove any existing entry first so that a symlink gets replaced, not followed.
        if os.path.lexists(fn):
            os.unlink(fn)
        with open(fn, 'w', encoding='utf-8') as fh:
            fh.write(txt)
        # Only claim success once the data has actually been written.
        print("Wrote:", fn)
6dc94e39 200
53fae556 201
a2b630c0
WD
def find_man_substitutions():
    """Populate the module-level env_subs dict with the man-page substitutions.

    Sets env_subs['prefix'] (from RSYNC_OVERRIDE_PREFIX when that is in the
    environment), 'VERSION', 'bindir', 'libdir', and 'date'.  In test mode
    fixed values are used; otherwise the version and timezone offset come from
    version.h in the script's directory and the other values come from the
    var=value lines of the Makefile in the current directory.

    The date is derived from the latest git commit time when the script's
    directory has a .git entry, otherwise from the mtime of version.h.

    Dies if version.h or the Makefile is missing or if version.h lacks the
    expected defines.
    """
    # dirname() yields '' when the script is run via a bare name (e.g.
    # "python3 md-convert"); without the fallback we would look in "/".
    srcdir = (os.path.dirname(sys.argv[0]) or '.') + '/'
    mtime = 0

    git_dir = srcdir + '.git'
    if os.path.lexists(git_dir):
        mtime = int(subprocess.check_output(['git', '--git-dir', git_dir, 'log', '-1', '--format=%at']))

    # Allow "prefix" to be overridden via the environment:
    env_subs['prefix'] = os.environ.get('RSYNC_OVERRIDE_PREFIX', None)

    if args.test:
        env_subs['VERSION'] = '1.0.0'
        env_subs['bindir'] = '/usr/bin'
        env_subs['libdir'] = '/usr/lib/rsync'
        tz_offset = 0
    else:
        for fn in (srcdir + 'version.h', 'Makefile'):
            try:
                st = os.lstat(fn)
            except OSError:
                # fn is already the full path that we tried to stat.
                die('Failed to find', fn)
            if not mtime:
                mtime = st.st_mtime

        with open(srcdir + 'version.h', 'r', encoding='utf-8') as fh:
            txt = fh.read()
        m = VER_RE.search(txt)
        if not m:
            die('Failed to find RSYNC_VERSION in', srcdir + 'version.h')
        env_subs['VERSION'] = m.group(1)
        m = TZ_RE.search(txt) # the tzdata lib may not be installed, so we use a simple hour offset
        if not m:
            die('Failed to find MAINTAINER_TZ_OFFSET in', srcdir + 'version.h')
        tz_offset = float(m.group(1)) * 60 * 60

        with open('Makefile', 'r', encoding='utf-8') as fh:
            for line in fh:
                m = ASSIGNMENT_RE.match(line)
                if not m:
                    continue
                var, val = (m.group(1), m.group(2))
                # An environment-supplied prefix wins over the Makefile's value.
                if var == 'prefix' and env_subs[var] is not None:
                    continue
                # Expand ${var} references (repeatedly, since a value can refer to other vars).
                while VAR_REF_RE.search(val):
                    val = VAR_REF_RE.sub(lambda m: env_subs[m.group(1)], val)
                env_subs[var] = val
                # Nothing that we need is defined after srcdir.
                if var == 'srcdir':
                    break

    env_subs['date'] = time.strftime('%d %b %Y', time.gmtime(mtime + tz_offset)).lstrip('0')
53fae556 249
ae82762c 250
03fc62ad
WD
def html_via_commonmark(txt):
    """Return the html rendering of the markdown in txt via the commonmark lib."""
    renderer = commonmark.HtmlRenderer()
    ast = commonmark.Parser().parse(txt)
    return renderer.render(ast)
253
6dc94e39 254
class TransformHtml(HTMLParser):
    """Tweak the html for one page and (optionally) build its nroff version.

    The html.parser callbacks are used as a state machine: each start tag, end
    tag, and text chunk is re-emitted into an html output list (with numbered
    lists that start at 0 turned into description lists and link targets
    added to headings) while the matching nroff text is appended to a man
    output list.

    All the work happens in the constructor, which stores the joined results
    in fi.html_out and fi.man_out.

    NOTE(review): the statement nesting in this class was reconstructed from a
    whitespace-collapsed listing -- confirm it against the pristine file.
    """

    def __init__(self, fi):
        """Transform fi.html_in, storing the results in fi.html_out & fi.man_out.

        fi is the Namespace built by parse_md_file(); its fn, title,
        want_manpage, html_in, and (for man pages) man_headings attributes are
        read here.  fi.html_in is set to None once it has been parsed.
        Warnings are issued for any link problems found in the page.
        """
        super().__init__(convert_charrefs=True)

        self.fn = fi.fn

        st = self.state = argparse.Namespace(
            list_state = [ ], # one item per open list: 'dl', 'o' (a ul), or the next ol number
            p_macro = ".P\n", # the nroff macro that starts a paragraph
            at_first_tag_in_li = False,
            at_first_tag_in_dd = False,
            dt_from = None, # the tag that is supplying the current dt text
            in_pre = False,
            in_code = False,
            html_out = [ HTML_START.replace('%TITLE%', fi.title) ],
            man_out = [ ],
            txt = '', # text accumulated since it was last consumed
            want_manpage = fi.want_manpage,
            created_hashtags = set(),
            derived_hashtags = set(),
            referenced_hashtags = set(),
            bad_hashtags = set(),
            latest_targets = [ ],
            opt_prefix = 'opt',
            a_href = None,
            a_href_external = False,
            a_txt_start = None,
            after_a_tag = False,
            target_suf = '',
        )

        if st.want_manpage:
            st.man_out.append(MAN_START % fi.man_headings)

        if '</table>' in fi.html_in:
            st.html_out[0] = st.html_out[0].replace('</style>', TABLE_STYLE + '</style>')

        self.feed(fi.html_in)
        fi.html_in = None

        if st.want_manpage:
            st.html_out.append(MAN_HTML_END % env_subs['date'])
        st.html_out.append(HTML_END)
        st.man_out.append(MAN_END)

        fi.html_out = ''.join(st.html_out)
        st.html_out = None

        fi.man_out = ''.join(st.man_out)
        st.man_out = None

        # Now that every target is known, complain about any links that don't resolve.
        for tgt, txt in st.derived_hashtags:
            derived = txt2target(txt, tgt)
            if derived not in st.created_hashtags:
                txt = BIN_CHARS_RE.sub('', txt.replace(NBR_DASH[0], '-').replace(NBR_SPACE[0], ' '))
                warn('Unknown derived hashtag link in', self.fn, 'based on:', (tgt, txt))

        for bad in st.bad_hashtags:
            if bad in st.created_hashtags:
                warn('Missing "#" in hashtag link in', self.fn + ':', bad)
            else:
                warn('Unknown non-hashtag link in', self.fn + ':', bad)

        for bad in st.referenced_hashtags - st.created_hashtags:
            warn('Unknown hashtag link in', self.fn + ':', '#' + bad)

    def handle_UE(self):
        """Finish off the .UE macro that ended the most recent external link.

        If the pending text starts with a punctuation character, that char is
        moved onto the .UE line (replacing the last man_out item).
        """
        st = self.state
        if st.txt.startswith(('.', ',', '!', '?', ';', ':')):
            st.man_out[-1] = ".UE " + st.txt[0] + "\n"
            st.txt = st.txt[1:]
        st.after_a_tag = False

    def handle_starttag(self, tag, attrs_list):
        """Handle a start tag: update the state and emit the html & nroff for it.

        tag is the tag name and attrs_list is the list of (name, value) pairs
        supplied by HTMLParser.  The tag that is output can differ from the
        tag that was input (e.g. an ol that starts at 0 is output as a dl).
        """
        st = self.state
        if args.debug:
            self.output_debug('START', (tag, attrs_list))
        if st.at_first_tag_in_li:
            if st.list_state[-1] == 'dl':
                # The first tag inside the li supplies the dt's contents.
                st.dt_from = tag
                if tag == 'p':
                    tag = 'dt'
                else:
                    st.html_out.append('<dt>')
            elif tag == 'p':
                st.at_first_tag_in_dd = True # Kluge to suppress a .P at the start of an li.
            st.at_first_tag_in_li = False
        if tag == 'p':
            if not st.at_first_tag_in_dd:
                st.man_out.append(st.p_macro)
        elif tag == 'li':
            st.at_first_tag_in_li = True
            lstate = st.list_state[-1]
            if lstate == 'dl':
                return
            if lstate == 'o':
                st.man_out.append(".IP o\n")
            else:
                st.man_out.append(".IP " + str(lstate) + ".\n")
                st.list_state[-1] += 1
        elif tag == 'blockquote':
            st.man_out.append(".RS 4\n")
        elif tag == 'pre':
            st.in_pre = True
            st.man_out.append(st.p_macro + ".nf\n")
        elif tag == 'code' and not st.in_pre:
            st.in_code = True
            st.txt += BOLD_FONT[0]
        elif tag == 'strong' or tag == 'b':
            st.txt += BOLD_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the manpage
            st.txt += UNDR_FONT[0]
        elif tag == 'ol':
            start = 1
            for var, val in attrs_list:
                if var == 'start':
                    start = int(val) # We only support integers.
                    break
            if st.list_state:
                st.man_out.append(".RS\n")
            if start == 0:
                # A numbered list that starts at 0 becomes a description list.
                tag = 'dl'
                attrs_list = [ ]
                st.list_state.append('dl')
            else:
                st.list_state.append(start)
            st.man_out.append(st.p_macro)
            st.p_macro = ".IP\n"
        elif tag == 'ul':
            st.man_out.append(st.p_macro)
            # NOTE(review): p_macro is assumed to be set only for a nested ul -- confirm.
            if st.list_state:
                st.man_out.append(".RS\n")
                st.p_macro = ".IP\n"
            st.list_state.append('o')
        elif tag == 'hr':
            st.man_out.append(".l\n")
            st.html_out.append("<hr />")
            return
        elif tag == 'a':
            st.a_href = None
            for var, val in attrs_list:
                if var == 'href':
                    if val.startswith(('https://', 'http://', 'mailto:', 'ftp:')):
                        # An external link: flush the pending text and start a .UR macro.
                        if st.after_a_tag:
                            self.handle_UE()
                        st.man_out.append(manify(st.txt.strip()) + "\n")
                        st.man_out.append(".UR " + val + "\n")
                        st.txt = ''
                        st.a_href = val
                        st.a_href_external = True
                    elif '#' in val:
                        pg, tgt = val.split('#', 1)
                        if pg and pg not in VALID_PAGES or '#' in tgt:
                            st.bad_hashtags.add(val)
                        elif tgt in ('', 'opt', 'dopt'):
                            # The real target gets derived from the link text in handle_endtag().
                            st.a_href = val
                            st.a_href_external = False
                        elif pg == '':
                            st.referenced_hashtags.add(tgt)
                            if tgt in st.latest_targets:
                                warn('Found link to the current section in', self.fn + ':', val)
                    elif val not in VALID_PAGES:
                        st.bad_hashtags.add(val)
            st.a_txt_start = len(st.txt)
        st.html_out.append('<' + tag + ''.join(' ' + var + '="' + htmlify(val) + '"' for var, val in attrs_list) + '>')
        st.at_first_tag_in_dd = False

    def handle_endtag(self, tag):
        """Handle an end tag: emit the nroff for the finished element & close the html tag.

        Tags in CONSUMES_TXT (and the tag that is supplying a dt's text) take
        the accumulated text; the font tags just add a normal-font marker.
        """
        st = self.state
        if args.debug:
            self.output_debug('END', (tag,))
        if st.after_a_tag:
            self.handle_UE()
        if tag in CONSUMES_TXT or st.dt_from == tag:
            txt = st.txt.strip()
            st.txt = ''
        else:
            txt = None
        add_to_txt = None
        if tag == 'h1':
            tgt = txt
            # Reset the suffix so that one set by an earlier "NEWS for" heading
            # cannot leak into the targets of the sections under this heading.
            st.target_suf = ''
            if tgt.startswith('NEWS for '):
                m = VERSION_RE.search(tgt)
                if m:
                    tgt = m.group(1)
                    st.target_suf = '-' + tgt
            self.add_targets(tag, tgt)
        elif tag == 'h2':
            st.man_out.append(st.p_macro + '.SH "' + manify(txt) + '"\n')
            self.add_targets(tag, txt, st.target_suf)
            st.opt_prefix = 'dopt' if txt == 'DAEMON OPTIONS' else 'opt'
        elif tag == 'h3':
            st.man_out.append(st.p_macro + '.SS "' + manify(txt) + '"\n')
            self.add_targets(tag, txt, st.target_suf)
        elif tag == 'p':
            if st.dt_from == 'p':
                # This p was output as a dt by handle_starttag().
                tag = 'dt'
                st.man_out.append('.IP "' + manify(txt) + '"\n')
                if txt.startswith(BOLD_FONT[0]):
                    self.add_targets(tag, txt)
                st.dt_from = None
            elif txt != '':
                st.man_out.append(manify(txt) + "\n")
        elif tag == 'li':
            if st.list_state[-1] == 'dl':
                if st.at_first_tag_in_li:
                    die("Invalid 0. -> td translation")
                tag = 'dd'
            if txt != '':
                st.man_out.append(manify(txt) + "\n")
            st.at_first_tag_in_li = False
        elif tag == 'blockquote':
            st.man_out.append(".RE\n")
        elif tag == 'pre':
            st.in_pre = False
            st.man_out.append(manify(txt) + "\n.fi\n")
        elif (tag == 'code' and not st.in_pre):
            st.in_code = False
            add_to_txt = NORM_FONT[0]
        elif tag == 'strong' or tag == 'b':
            add_to_txt = NORM_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the manpage
            add_to_txt = NORM_FONT[0]
        elif tag == 'ol' or tag == 'ul':
            if st.list_state.pop() == 'dl':
                tag = 'dl'
            if st.list_state:
                st.man_out.append(".RE\n")
            else:
                st.p_macro = ".P\n"
            st.at_first_tag_in_dd = False
        elif tag == 'hr':
            return
        elif tag == 'a':
            if st.a_href_external:
                st.txt = st.txt.strip()
                if args.force_link_text or st.a_href != st.txt:
                    st.man_out.append(manify(st.txt) + "\n")
                st.man_out.append(".UE\n") # This might get replaced with a punctuation version in handle_UE()
                st.after_a_tag = True
                st.a_href_external = False
                st.txt = ''
            elif st.a_href:
                # Rewrite the already-output href to use a target derived from the link text.
                atxt = st.txt[st.a_txt_start:]
                find = 'href="' + st.a_href + '"'
                # Search backward for the <a> tag; item 0 (the html header) is never checked.
                for j in range(len(st.html_out)-1, 0, -1):
                    if find in st.html_out[j]:
                        pg, tgt = st.a_href.split('#', 1)
                        derived = txt2target(atxt, tgt)
                        if pg == '':
                            if derived in st.latest_targets:
                                warn('Found link to the current section in', self.fn + ':', st.a_href)
                            st.derived_hashtags.add((tgt, atxt))
                        st.html_out[j] = st.html_out[j].replace(find, 'href="' + pg + '#' + derived + '"')
                        break
                else:
                    die('INTERNAL ERROR: failed to find href in html data:', find)
        st.html_out.append('</' + tag + '>')
        if add_to_txt:
            if txt is None:
                st.txt += add_to_txt
            else:
                txt += add_to_txt
        if st.dt_from == tag:
            # A non-p tag supplied the dt text, so close the dt that we opened for it.
            st.man_out.append('.IP "' + manify(txt) + '"\n')
            st.html_out.append('</dt><dd>')
            st.at_first_tag_in_dd = True
            st.dt_from = None
        elif tag == 'dt':
            st.html_out.append('<dd>')
            st.at_first_tag_in_dd = True

    def handle_data(self, txt):
        """Handle a chunk of text: append its html form and accumulate it for nroff.

        Outside of a pre block, dashes are changed into non-breaking dash
        markers, and inside code all whitespace becomes non-breaking spaces.
        A warning is issued for text that looks like a malformed markdown link.
        """
        st = self.state
        if '](' in txt:
            warn('Malformed link in', self.fn + ':', txt)
        if args.debug:
            self.output_debug('DATA', (txt,))
        if st.in_pre:
            html = htmlify(txt)
        else:
            txt = SPACE_DOUBLE_DASH_RE.sub(NBR_SPACE[0] + r'--\1', txt).replace('--', NBR_DASH[0]*2)
            txt = NON_SPACE_SINGLE_DASH_RE.sub(r'\1' + NBR_DASH[0], txt)
            html = htmlify(txt)
            if st.in_code:
                txt = WHITESPACE_RE.sub(NBR_SPACE[0], txt)
                html = html.replace(NBR_DASH[0], '-').replace(NBR_SPACE[0], ' ') # <code> is non-breaking in CSS
        st.html_out.append(html.replace(NBR_SPACE[0], '&nbsp;').replace(NBR_DASH[0], '-&#8288;'))
        st.txt += txt

    def add_targets(self, tag, txt, suf=None):
        """Add link targets (id attributes) for a heading or dt that is being closed.

        tag is the name of the still-open tag, txt is its text, and suf is an
        optional suffix for each target.  Every bold/code item in the text
        yields a target (or the whole text does if there are none).  The first
        target becomes the id of the opening tag, which also gets a hover
        link appended; any others are added as empty spans before the tag.
        """
        st = self.state
        tag = '<' + tag + '>'
        targets = CODE_BLOCK_RE.findall(txt)
        if not targets:
            targets = [ txt ]
        tag_pos = 0
        for txt in targets:
            txt = txt2target(txt, st.opt_prefix)
            if not txt:
                continue
            if suf:
                txt += suf
            if txt in st.created_hashtags:
                # Add a numeric suffix to make a duplicate target unique.
                for j in range(2, 1000):
                    chk = txt + '-' + str(j)
                    if chk not in st.created_hashtags:
                        print('Made link target unique:', chk)
                        txt = chk
                        break
            if tag_pos == 0:
                # Search backward through the html output for the opening tag.
                tag_pos -= 1
                while st.html_out[tag_pos] != tag:
                    tag_pos -= 1
                st.html_out[tag_pos] = tag[:-1] + ' id="' + txt + '">'
                st.html_out.append('<a href="#' + txt + '" class="tgt"></a>')
                tag_pos -= 1 # take into account the append
            else:
                st.html_out[tag_pos] = '<span id="' + txt + '"></span>' + st.html_out[tag_pos]
            st.created_hashtags.add(txt)
        st.latest_targets = targets

    def output_debug(self, event, extra):
        """Print a parsing event and a dump of the current state.

        With a debug level below 2, the dump shows just the last 2 items of
        the html & man output lists.
        """
        import pprint
        st = self.state
        if args.debug < 2:
            # Trim a shallow copy so that the real output lists are left alone.
            st = argparse.Namespace(**vars(st))
            if len(st.html_out) > 2:
                st.html_out = ['...'] + st.html_out[-2:]
            if len(st.man_out) > 2:
                st.man_out = ['...'] + st.man_out[-2:]
        print(event, extra)
        pprint.PrettyPrinter(indent=2).pprint(vars(st))
597
598
def txt2target(txt, opt_prefix):
    """Turn heading or link text into a string usable as an html link target.

    If the text contains a bold/code item, only that item's text is used.  The
    marker characters are removed, each invalid character becomes "_", and a
    result that starts with a dash gets opt_prefix put in front of it (any
    other non-alphanumeric first character gets a "t" put in front of it).
    """
    target = txt.strip().rstrip(':')
    code_item = CODE_BLOCK_RE.search(target)
    if code_item is not None:
        target = code_item.group(1)

    cleanups = (
        (NBR_DASH_RE, '-'),
        (BIN_CHARS_RE, ''),
        (INVALID_TARGET_CHARS_RE, '_'),
    )
    for pattern, replacement in cleanups:
        target = pattern.sub(replacement, target)

    if opt_prefix and target.startswith('-'):
        return opt_prefix + target
    return INVALID_START_CHAR_RE.sub(r't\1', target)
612
613
def manify(txt):
    """Return txt converted into its nroff form.

    Backslashes are doubled, each internal marker character is changed into
    its nroff sequence, and a line-leading ' or . is prefixed with \\&.
    """
    roff = txt.replace('\\', '\\\\')
    for marker, sequence in (NBR_SPACE, NBR_DASH, NORM_FONT, BOLD_FONT, UNDR_FONT):
        roff = roff.replace(marker, sequence)
    return MANIFY_LINESTART_RE.sub(r'\&\1', roff)
53fae556
WD
621
622
def htmlify(txt):
    """Return txt with the &, <, >, and double-quote characters html-escaped."""
    entity_for = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
    return txt.translate(str.maketrans(entity_for))
53fae556
WD
625
626
def warn(*msg):
    """Print the message items on stderr and bump the module's warning count."""
    global warning_count
    print(*msg, file=sys.stderr)
    warning_count = warning_count + 1
53fae556
WD
631
632
def die(*msg):
    """Output the message items as a warning and then exit with status 1."""
    warn(*msg)
    raise SystemExit(1)
636
637
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Convert markdown into html and (optionally) nroff. Each input filename must have a .md suffix, which is changed to .html for the output filename. If the input filename ends with .num.md (e.g. foo.1.md) then a nroff file is also output with the input filename's .md suffix removed (e.g. foo.1).", add_help=False)
    parser.add_argument('--test', action='store_true', help="Just test the parsing without outputting any files.")
    parser.add_argument('--dest', metavar='DIR', help="Create files in DIR instead of the current directory.")
    parser.add_argument('--force-link-text', action='store_true', help="Don't remove the link text if it matches the link href. Useful when nroff doesn't understand .UR and .UE.")
    parser.add_argument('--debug', '-D', action='count', default=0, help='Output copious info on the html parsing. Repeat for even more.')
    parser.add_argument("--help", "-h", action="help", help="Output this help message and exit.")
    parser.add_argument("mdfiles", metavar='FILE.md', nargs='+', help="One or more .md files to convert.")
    args = parser.parse_args()

    # Prefer cmarkgfm (which also supplies the github-flavored parser) and fall
    # back to commonmark.  Only import-related failures trigger the fallback so
    # that a KeyboardInterrupt or SystemExit is never swallowed.
    try:
        import cmarkgfm
        md_parser = cmarkgfm.markdown_to_html
        gfm_parser = cmarkgfm.github_flavored_markdown_to_html
    except (ImportError, AttributeError):
        try:
            import commonmark
            md_parser = html_via_commonmark
        except ImportError:
            die("Failed to find cmarkgfm or commonmark for python3.")
        gfm_parser = None

    main()
    # Any warning makes the run a failure, even though all the files were processed.
    if warning_count:
        sys.exit(1)