]>
Commit | Line | Data |
---|---|---|
27e88dec | 1 | #!/usr/bin/env python3 |
53fae556 | 2 | |
a2b630c0 WD |
3 | # This script transforms markdown files into html and (optionally) nroff. The |
4 | # output files are written into the current directory named for the input file | |
5 | # without the .md suffix and either the .html suffix or no suffix. | |
53fae556 | 6 | # |
a2b630c0 WD |
7 | # If the input .md file has a section number at the end of the name (e.g., |
8 | # rsync.1.md) a nroff file is also output (PROJ.NUM.md -> PROJ.NUM). | |
ec8a05f6 | 9 | # |
a2b630c0 WD |
10 | # The markdown input format has one extra extension: if a numbered list starts |
11 | # at 0, it is turned into a description list. The dl's dt tag is taken from the | |
12 | # contents of the first tag inside the li, which is usually a p, code, or | |
13 | # strong tag. | |
14 | # | |
15 | # The cmarkgfm or commonmark lib is used to transform the input file into | |
16 | # html. Then, the html.parser is used as a state machine that lets us tweak | |
17 | # the html and (optionally) output nroff data based on the html tags. | |
18 | # | |
19 | # If the string @USE_GFM_PARSER@ exists in the file, the string is removed and | |
20 | # a GitHub-flavored-markdown parser is used to parse the file. | |
21 | # | |
22 | # The man-page .md files also get the vars @VERSION@, @BINDIR@, and @LIBDIR@ | |
23 | # substituted. Some of these values depend on the Makefile $(prefix) (see the | |
24 | # generated Makefile). If the maintainer wants to build files for /usr/local | |
25 | # while creating release-ready man-page files for /usr, use the environment to | |
26 | # set RSYNC_OVERRIDE_PREFIX=/usr. | |
27 | ||
28 | # Copyright (C) 2020 - 2021 Wayne Davison | |
53fae556 WD |
29 | # |
30 | # This program is freely redistributable. | |
31 | ||
a2b630c0 | 32 | import os, sys, re, argparse, subprocess, time |
53fae556 WD |
33 | from html.parser import HTMLParser |
34 | ||
d07272d6 WD |
# Page names that a link href is allowed to name (either as the whole href or
# as the part before a '#'); anything else is recorded as a bad link.
VALID_PAGES = 'README INSTALL COPYING rsync.1 rrsync.1 rsync-ssl.1 rsyncd.conf.5'.split()

# Tags whose end-tag handler takes the accumulated text (state.txt) and resets it.
CONSUMES_TXT = set('h1 h2 h3 p li pre'.split())
53fae556 WD |
38 | |
# Start of every generated html page.  %TITLE% is replaced with the page title,
# and TABLE_STYLE is spliced in before </style> when the page contains a table.
HTML_START = """\
<html><head>
<title>%TITLE%</title>
<meta charset="UTF-8"/>
<link href="https://fonts.googleapis.com/css2?family=Roboto&family=Roboto+Mono&display=swap" rel="stylesheet">
<style>
body {
max-width: 50em;
margin: auto;
}
body, b, strong, u {
font-family: 'Roboto', sans-serif;
}
a.tgt { font-face: symbol; font-weight: 400; font-size: 70%; visibility: hidden; text-decoration: none; color: #ddd; padding: 0 4px; border: 0; }
a.tgt:after { content: '🔗'; }
a.tgt:hover { color: #444; background-color: #eaeaea; }
h1:hover > a.tgt, h2:hover > a.tgt, h3:hover > a.tgt, dt:hover > a.tgt { visibility: visible; }
code {
font-family: 'Roboto Mono', monospace;
font-weight: bold;
white-space: pre;
}
pre code {
display: block;
font-weight: normal;
}
blockquote pre code {
background: #f1f1f1;
}
dd p:first-of-type {
margin-block-start: 0em;
}
</style>
</head><body>
"""

# Extra CSS rules that are only added to pages whose html contains a </table>.
TABLE_STYLE = """\
table {
border-color: grey;
border-spacing: 0;
}
tr {
border-top: 1px solid grey;
}
tr:nth-child(2n) {
background-color: #f6f8fa;
}
th, td {
border: 1px solid #dfe2e5;
text-align: center;
padding-left: 1em;
padding-right: 1em;
}
"""

# Footer added to manpage html output; the %s receives the computed date.
MAN_HTML_END = """\
<div style="float: right"><p><i>%s</i></p></div>
"""

# End of every generated html page.
HTML_END = """\
</body></html>
"""

# Header of the nroff output.  The five %s values come from fi.man_headings:
# (prog, section, date, program-version text, prefix).
MAN_START = r"""
.TH "%s" "%s" "%s" "%s" "User Commands"
.\" prefix=%s
""".lstrip()

# Trailer of the nroff output (currently empty).
MAN_END = """\
"""
109 | ||
# Each pair is (marker character stored in the accumulated text, nroff sequence
# that manify() substitutes for that marker).
NORM_FONT = ('\1', r"\fP")
BOLD_FONT = ('\2', r"\fB")
UNDR_FONT = ('\3', r"\fI")
NBR_DASH = ('\4', r"\-")
NBR_SPACE = ('\xa0', r"\ ")

# Splits an input path into named groups: fn (whole path), srcdir (optional
# leading dir), name (basename without .md), prog, and sect (optional digits).
FILENAME_RE = re.compile(r'^(?P<fn>(?P<srcdir>.+/)?(?P<name>(?P<prog>[^/]+?)(\.(?P<sect>\d+))?)\.md)$')
# A "var=value" line, as looked for in the Makefile.
ASSIGNMENT_RE = re.compile(r'^(\w+)=(.+)')
# The RSYNC_VERSION and MAINTAINER_TZ_OFFSET defines in version.h.
VER_RE = re.compile(r'^#define\s+RSYNC_VERSION\s+"(\d.+?)"', re.M)
TZ_RE = re.compile(r'^#define\s+MAINTAINER_TZ_OFFSET\s+(-?\d+(\.\d+)?)', re.M)
# A ${var} reference inside a Makefile value.
VAR_REF_RE = re.compile(r'\$\{(\w+)\}')
# A version number (digits and dots) preceded by a space and followed by ", " or " ".
VERSION_RE = re.compile(r' (\d[.\d]+)[, ]')
# Runs of the control characters \1 through \7 (which include the font/dash markers).
BIN_CHARS_RE = re.compile(r'[\1-\7]+')
# A "--" that has whitespace on both sides.
SPACE_DOUBLE_DASH_RE = re.compile(r'\s--(\s)')
# A "-" at the start of the text or right after a non-word character.
NON_SPACE_SINGLE_DASH_RE = re.compile(r'(^|\W)-')
WHITESPACE_RE = re.compile(r'\s')
# Text between a bold-font marker and the next '=' or normal-font marker.
CODE_BLOCK_RE = re.compile(r'[%s]([^=%s]+)[=%s]' % (BOLD_FONT[0], NORM_FONT[0], NORM_FONT[0]))
NBR_DASH_RE = re.compile(r'[%s]' % NBR_DASH[0])
# Characters not allowed in a link target, and a disallowed leading character.
INVALID_TARGET_CHARS_RE = re.compile(r'[^-A-Za-z0-9._]')
INVALID_START_CHAR_RE = re.compile(r'^([^A-Za-z0-9])')
# A line that starts with ' or . (manify() prefixes these with \&).
MANIFY_LINESTART_RE = re.compile(r"^(['.])", flags=re.M)

# The markdown -> html function; assigned in the __main__ section.
md_parser = None
# Substitution values filled in by find_man_substitutions().
env_subs = { }

# Number of warn() calls so far; a non-zero count makes the script exit with status 1.
warning_count = 0
136 | ||
def main():
    """Convert each markdown file named on the command line.

    Reads the module-level ``args`` namespace; when ``--test`` was given a
    success message is printed after all files have been parsed.
    """
    for md_path in args.mdfiles:
        parse_md_file(md_path)

    if not args.test:
        return
    print("The test was successful.")
143 | ||
144 | ||
def parse_md_file(mdfn):
    """Convert one markdown file into html and, for PROG.NUM.md names, nroff.

    mdfn is the path of the input file; it must match FILENAME_RE or the
    script dies.  Unless --test is active, NAME.html (and NAME for a manpage)
    are written into args.dest or the current directory.
    """
    fi = FILENAME_RE.match(mdfn)
    if not fi:
        die('Failed to parse a md input file name:', mdfn)
    fi = argparse.Namespace(**fi.groupdict())
    # A section number in the filename means that we also output a manpage.
    fi.want_manpage = bool(fi.sect)
    if fi.want_manpage:
        fi.title = fi.prog + '(' + fi.sect + ') manpage'
    else:
        fi.title = fi.prog + ' for rsync'

    if fi.want_manpage:
        if not env_subs:
            find_man_substitutions()
        prog_ver = 'rsync ' + env_subs['VERSION']
        if fi.prog != 'rsync':
            prog_ver = fi.prog + ' from ' + prog_ver
        fi.man_headings = (fi.prog, fi.sect, env_subs['date'], prog_ver, env_subs['prefix'])

    with open(mdfn, 'r', encoding='utf-8') as fh:
        txt = fh.read()

    # The marker both selects the gfm parser and is removed from the text.
    use_gfm_parser = '@USE_GFM_PARSER@' in txt
    if use_gfm_parser:
        txt = txt.replace('@USE_GFM_PARSER@', '')

    if fi.want_manpage:
        txt = (txt.replace('@VERSION@', env_subs['VERSION'])
                  .replace('@BINDIR@', env_subs['bindir'])
                  .replace('@LIBDIR@', env_subs['libdir']))

    if use_gfm_parser:
        if not gfm_parser:
            die('Input file requires cmarkgfm parser:', mdfn)
        fi.html_in = gfm_parser(txt)
    else:
        fi.html_in = md_parser(txt)
    txt = None

    # The constructor does all the work and stores html_out/man_out on fi.
    TransformHtml(fi)

    if args.test:
        return

    output_list = [ (fi.name + '.html', fi.html_out) ]
    if fi.want_manpage:
        output_list += [ (fi.name, fi.man_out) ]
    for fn, txt in output_list:
        if args.dest and args.dest != '.':
            fn = os.path.join(args.dest, fn)
        # Remove any existing entry first so that we never write through a symlink.
        if os.path.lexists(fn):
            os.unlink(fn)
        with open(fn, 'w', encoding='utf-8') as fh:
            fh.write(txt)
        # Only report the file once it has actually been written.
        print("Wrote:", fn)
6dc94e39 | 200 | |
53fae556 | 201 | |
a2b630c0 WD |
def find_man_substitutions():
    """Fill the global env_subs dict with the values needed for manpages.

    Sets 'prefix' (from RSYNC_OVERRIDE_PREFIX, if set), 'VERSION', 'bindir',
    'libdir', and 'date'.  In --test mode fixed values are used; otherwise
    version.h (next to this script) and ./Makefile are read.  The date comes
    from the latest git commit time when a .git entry exists next to the
    script, else from the mtime of the first file checked.  Dies with a
    message when a needed file or define is missing.
    """
    # dirname() is '' when the script is run by its bare name; use '.' in that
    # case so that we don't end up looking in the filesystem root.
    srcdir = (os.path.dirname(sys.argv[0]) or '.') + '/'
    mtime = 0

    git_dir = srcdir + '.git'
    if os.path.lexists(git_dir):
        mtime = int(subprocess.check_output(['git', '--git-dir', git_dir, 'log', '-1', '--format=%at']))

    # Allow "prefix" to be overridden via the environment:
    env_subs['prefix'] = os.environ.get('RSYNC_OVERRIDE_PREFIX', None)

    if args.test:
        env_subs['VERSION'] = '1.0.0'
        env_subs['bindir'] = '/usr/bin'
        env_subs['libdir'] = '/usr/lib/rsync'
        tz_offset = 0
    else:
        version_fn = srcdir + 'version.h'
        for fn in (version_fn, 'Makefile'):
            try:
                st = os.lstat(fn)
            except OSError:
                # fn is already the full path that was checked.
                die('Failed to find', fn)
            if not mtime:
                mtime = st.st_mtime

        with open(version_fn, 'r', encoding='utf-8') as fh:
            txt = fh.read()
        m = VER_RE.search(txt)
        if not m:
            die('Failed to find RSYNC_VERSION in', version_fn)
        env_subs['VERSION'] = m.group(1)
        m = TZ_RE.search(txt) # the tzdata lib may not be installed, so we use a simple hour offset
        if not m:
            die('Failed to find MAINTAINER_TZ_OFFSET in', version_fn)
        tz_offset = float(m.group(1)) * 60 * 60

        with open('Makefile', 'r', encoding='utf-8') as fh:
            for line in fh:
                m = ASSIGNMENT_RE.match(line)
                if not m:
                    continue
                var, val = m.group(1), m.group(2)
                # An environment-supplied prefix wins over the Makefile's value.
                if var == 'prefix' and env_subs[var] is not None:
                    continue
                # Expand ${var} references using the values gathered so far.
                while VAR_REF_RE.search(val):
                    val = VAR_REF_RE.sub(lambda ref: env_subs[ref.group(1)], val)
                env_subs[var] = val
                # Nothing after the srcdir assignment is read.
                if var == 'srcdir':
                    break

    env_subs['date'] = time.strftime('%d %b %Y', time.gmtime(mtime + tz_offset)).lstrip('0')
53fae556 | 249 | |
ae82762c | 250 | |
03fc62ad WD |
def html_via_commonmark(txt):
    """Return the html rendering of markdown text using the commonmark lib."""
    renderer = commonmark.HtmlRenderer()
    ast = commonmark.Parser().parse(txt)
    return renderer.render(ast)
253 | ||
6dc94e39 | 254 | |
class TransformHtml(HTMLParser):
    """State machine that rewrites the html and builds the matching nroff.

    Constructing an instance does all the work: the html in fi.html_in is fed
    through the parser, and the results are stored in fi.html_out and
    fi.man_out.  Link problems noticed along the way are reported via warn().
    """

    def __init__(self, fi):
        """Transform fi.html_in, setting fi.html_out and fi.man_out.

        fi is the namespace built by parse_md_file(); this reads its fn,
        title, want_manpage, html_in, and (for manpages) man_headings attrs.
        """
        HTMLParser.__init__(self, convert_charrefs=True)

        self.fn = fi.fn

        st = self.state = argparse.Namespace(
            list_state = [ ],            # one entry per open list: 'dl', 'o' (ul), or the next ol number
            p_macro = ".P\n",            # the nroff macro that starts a paragraph at this nesting
            at_first_tag_in_li = False,
            at_first_tag_in_dd = False,
            dt_from = None,              # the tag whose contents become the current dt
            in_pre = False,
            in_code = False,
            html_out = [ HTML_START.replace('%TITLE%', fi.title) ],
            man_out = [ ],
            txt = '',                    # text accumulated since it was last consumed
            want_manpage = fi.want_manpage,
            created_hashtags = set(),
            derived_hashtags = set(),
            referenced_hashtags = set(),
            bad_hashtags = set(),
            latest_targets = [ ],
            opt_prefix = 'opt',
            a_href = None,
            a_href_external = False,
            a_txt_start = None,          # offset into txt where the current link's text starts
            after_a_tag = False,
            target_suf = '',
            )

        if st.want_manpage:
            st.man_out.append(MAN_START % fi.man_headings)

        if '</table>' in fi.html_in:
            st.html_out[0] = st.html_out[0].replace('</style>', TABLE_STYLE + '</style>')

        # This drives all the handle_*() callbacks below.
        self.feed(fi.html_in)
        fi.html_in = None

        if st.want_manpage:
            st.html_out.append(MAN_HTML_END % env_subs['date'])
        st.html_out.append(HTML_END)
        st.man_out.append(MAN_END)

        fi.html_out = ''.join(st.html_out)
        st.html_out = None

        fi.man_out = ''.join(st.man_out)
        st.man_out = None

        # The sets are sorted so that the warnings come out in a stable order.
        for tgt, txt in sorted(st.derived_hashtags):
            derived = txt2target(txt, tgt)
            if derived not in st.created_hashtags:
                txt = BIN_CHARS_RE.sub('', txt.replace(NBR_DASH[0], '-').replace(NBR_SPACE[0], ' '))
                warn('Unknown derived hashtag link in', self.fn, 'based on:', (tgt, txt))

        for bad in sorted(st.bad_hashtags):
            if bad in st.created_hashtags:
                warn('Missing "#" in hashtag link in', self.fn + ':', bad)
            else:
                warn('Unknown non-hashtag link in', self.fn + ':', bad)

        for bad in sorted(st.referenced_hashtags - st.created_hashtags):
            warn('Unknown hashtag link in', self.fn + ':', '#' + bad)

    def handle_UE(self):
        """Finish the handling of a just-closed external link.

        If the text after the link starts with a punctuation char, that char
        is moved onto the .UE line that was the last nroff output.
        """
        st = self.state
        if st.txt.startswith(('.', ',', '!', '?', ';', ':')):
            st.man_out[-1] = ".UE " + st.txt[0] + "\n"
            st.txt = st.txt[1:]
        st.after_a_tag = False

    def handle_starttag(self, tag, attrs_list):
        """Handle an opening tag: update the state and output the html/nroff.

        tag is the tag name and attrs_list its (name, value) pairs, as
        supplied by HTMLParser.
        """
        st = self.state
        if args.debug:
            self.output_debug('START', (tag, attrs_list))
        if st.at_first_tag_in_li:
            if st.list_state[-1] == 'dl':
                # The first tag in the li supplies the dt; a p is turned into
                # the dt itself while anything else gets wrapped in one.
                st.dt_from = tag
                if tag == 'p':
                    tag = 'dt'
                else:
                    st.html_out.append('<dt>')
            elif tag == 'p':
                st.at_first_tag_in_dd = True # Kluge to suppress a .P at the start of an li.
            st.at_first_tag_in_li = False
        if tag == 'p':
            if not st.at_first_tag_in_dd:
                st.man_out.append(st.p_macro)
        elif tag == 'li':
            st.at_first_tag_in_li = True
            lstate = st.list_state[-1]
            if lstate == 'dl':
                # No <li> is output for a description list item.
                return
            if lstate == 'o':
                st.man_out.append(".IP o\n")
            else:
                st.man_out.append(".IP " + str(lstate) + ".\n")
                st.list_state[-1] += 1
        elif tag == 'blockquote':
            st.man_out.append(".RS 4\n")
        elif tag == 'pre':
            st.in_pre = True
            st.man_out.append(st.p_macro + ".nf\n")
        elif tag == 'code' and not st.in_pre:
            st.in_code = True
            st.txt += BOLD_FONT[0]
        elif tag == 'strong' or tag == 'b':
            st.txt += BOLD_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the manpage
            st.txt += UNDR_FONT[0]
        elif tag == 'ol':
            start = 1
            for var, val in attrs_list:
                if var == 'start':
                    start = int(val) # We only support integers.
                    break
            if st.list_state:
                st.man_out.append(".RS\n")
            if start == 0:
                # A numbered list that starts at 0 becomes a description list.
                tag = 'dl'
                attrs_list = [ ]
                st.list_state.append('dl')
            else:
                st.list_state.append(start)
            st.man_out.append(st.p_macro)
            st.p_macro = ".IP\n"
        elif tag == 'ul':
            st.man_out.append(st.p_macro)
            if st.list_state:
                st.man_out.append(".RS\n")
                # NOTE(review): p_macro is only changed for a nested ul -- confirm
                # this nesting against the upstream file.
                st.p_macro = ".IP\n"
            st.list_state.append('o')
        elif tag == 'hr':
            st.man_out.append(".l\n")
            st.html_out.append("<hr />")
            return
        elif tag == 'a':
            st.a_href = None
            for var, val in attrs_list:
                if var == 'href':
                    if val.startswith(('https://', 'http://', 'mailto:', 'ftp:')):
                        # An external link: flush the pending text and start a .UR.
                        if st.after_a_tag:
                            self.handle_UE()
                        st.man_out.append(manify(st.txt.strip()) + "\n")
                        st.man_out.append(".UR " + val + "\n")
                        st.txt = ''
                        st.a_href = val
                        st.a_href_external = True
                    elif '#' in val:
                        pg, tgt = val.split('#', 1)
                        if pg and pg not in VALID_PAGES or '#' in tgt:
                            st.bad_hashtags.add(val)
                        elif tgt in ('', 'opt', 'dopt'):
                            # The real target gets derived from the link text
                            # when the closing tag is seen.
                            st.a_href = val
                            st.a_href_external = False
                        elif pg == '':
                            st.referenced_hashtags.add(tgt)
                            if tgt in st.latest_targets:
                                warn('Found link to the current section in', self.fn + ':', val)
                    elif val not in VALID_PAGES:
                        st.bad_hashtags.add(val)
            st.a_txt_start = len(st.txt)
        st.html_out.append('<' + tag + ''.join(' ' + var + '="' + htmlify(val) + '"' for var, val in attrs_list) + '>')
        st.at_first_tag_in_dd = False

    def handle_endtag(self, tag):
        """Handle a closing tag: consume the text and output the html/nroff.

        tag is the tag name, as supplied by HTMLParser.
        """
        st = self.state
        if args.debug:
            self.output_debug('END', (tag,))
        if st.after_a_tag:
            self.handle_UE()
        if tag in CONSUMES_TXT or st.dt_from == tag:
            txt = st.txt.strip()
            st.txt = ''
        else:
            txt = None
        add_to_txt = None
        if tag == 'h1':
            tgt = txt
            if tgt.startswith('NEWS for '):
                m = VERSION_RE.search(tgt)
                if m:
                    # The version is the h1's target and also the suffix for
                    # the h2/h3 targets that follow it.
                    tgt = m.group(1)
                    st.target_suf = '-' + tgt
            self.add_targets(tag, tgt)
        elif tag == 'h2':
            st.man_out.append(st.p_macro + '.SH "' + manify(txt) + '"\n')
            self.add_targets(tag, txt, st.target_suf)
            st.opt_prefix = 'dopt' if txt == 'DAEMON OPTIONS' else 'opt'
        elif tag == 'h3':
            st.man_out.append(st.p_macro + '.SS "' + manify(txt) + '"\n')
            self.add_targets(tag, txt, st.target_suf)
        elif tag == 'p':
            if st.dt_from == 'p':
                tag = 'dt'
                st.man_out.append('.IP "' + manify(txt) + '"\n')
                if txt.startswith(BOLD_FONT[0]):
                    self.add_targets(tag, txt)
                st.dt_from = None
            elif txt != '':
                st.man_out.append(manify(txt) + "\n")
        elif tag == 'li':
            if st.list_state[-1] == 'dl':
                if st.at_first_tag_in_li:
                    die("Invalid 0. -> td translation")
                tag = 'dd'
            if txt != '':
                st.man_out.append(manify(txt) + "\n")
            st.at_first_tag_in_li = False
        elif tag == 'blockquote':
            st.man_out.append(".RE\n")
        elif tag == 'pre':
            st.in_pre = False
            st.man_out.append(manify(txt) + "\n.fi\n")
        elif (tag == 'code' and not st.in_pre):
            st.in_code = False
            add_to_txt = NORM_FONT[0]
        elif tag == 'strong' or tag == 'b':
            add_to_txt = NORM_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the manpage
            add_to_txt = NORM_FONT[0]
        elif tag == 'ol' or tag == 'ul':
            if st.list_state.pop() == 'dl':
                tag = 'dl'
            if st.list_state:
                st.man_out.append(".RE\n")
            else:
                st.p_macro = ".P\n"
            st.at_first_tag_in_dd = False
        elif tag == 'hr':
            return
        elif tag == 'a':
            if st.a_href_external:
                st.txt = st.txt.strip()
                if args.force_link_text or st.a_href != st.txt:
                    st.man_out.append(manify(st.txt) + "\n")
                st.man_out.append(".UE\n") # This might get replaced with a punctuation version in handle_UE()
                st.after_a_tag = True
                st.a_href_external = False
                st.txt = ''
            elif st.a_href:
                # Derive the target from the link's text and patch it into the
                # most recent html output that contains the original href.
                atxt = st.txt[st.a_txt_start:]
                find = 'href="' + st.a_href + '"'
                for j in range(len(st.html_out)-1, 0, -1):
                    if find in st.html_out[j]:
                        pg, tgt = st.a_href.split('#', 1)
                        derived = txt2target(atxt, tgt)
                        if pg == '':
                            if derived in st.latest_targets:
                                warn('Found link to the current section in', self.fn + ':', st.a_href)
                            st.derived_hashtags.add((tgt, atxt))
                        st.html_out[j] = st.html_out[j].replace(find, 'href="' + pg + '#' + derived + '"')
                        break
                else:
                    die('INTERNAL ERROR: failed to find href in html data:', find)
        st.html_out.append('</' + tag + '>')
        if add_to_txt:
            if txt is None:
                st.txt += add_to_txt
            else:
                txt += add_to_txt
        if st.dt_from == tag:
            # The tag that supplied the dt just closed, so the dd starts here.
            st.man_out.append('.IP "' + manify(txt) + '"\n')
            st.html_out.append('</dt><dd>')
            st.at_first_tag_in_dd = True
            st.dt_from = None
        elif tag == 'dt':
            st.html_out.append('<dd>')
            st.at_first_tag_in_dd = True

    def handle_data(self, txt):
        """Handle text: output its html form and add it to the pending text.

        Outside of a pre, dashes (and the space before a spaced "--") are
        changed into the non-breaking marker chars.
        """
        st = self.state
        if '](' in txt:
            warn('Malformed link in', self.fn + ':', txt)
        if args.debug:
            self.output_debug('DATA', (txt,))
        if st.in_pre:
            html = htmlify(txt)
        else:
            txt = SPACE_DOUBLE_DASH_RE.sub(NBR_SPACE[0] + r'--\1', txt).replace('--', NBR_DASH[0]*2)
            txt = NON_SPACE_SINGLE_DASH_RE.sub(r'\1' + NBR_DASH[0], txt)
            html = htmlify(txt)
            if st.in_code:
                txt = WHITESPACE_RE.sub(NBR_SPACE[0], txt)
                html = html.replace(NBR_DASH[0], '-').replace(NBR_SPACE[0], ' ') # <code> is non-breaking in CSS
        # The html gets explicit character references (a no-break space and a
        # dash followed by a word joiner) rather than invisible literal chars.
        st.html_out.append(html.replace(NBR_SPACE[0], '&nbsp;').replace(NBR_DASH[0], '-&#8288;'))
        st.txt += txt

    def add_targets(self, tag, txt, suf=None):
        """Add link-target ids for the element that is about to be closed.

        tag is the element's tag name, txt its text (each bold chunk in it
        becomes a target, else the whole text does), and suf an optional
        suffix for each target.  The first target becomes the id of the
        element's opening tag and also gets a "tgt" anchor; any others are
        put in empty spans just before the opening tag.
        """
        st = self.state
        tag = '<' + tag + '>'
        targets = CODE_BLOCK_RE.findall(txt)
        if not targets:
            targets = [ txt ]
        tag_pos = 0
        for txt in targets:
            txt = txt2target(txt, st.opt_prefix)
            if not txt:
                continue
            if suf:
                txt += suf
            if txt in st.created_hashtags:
                for j in range(2, 1000):
                    chk = txt + '-' + str(j)
                    if chk not in st.created_hashtags:
                        print('Made link target unique:', chk)
                        txt = chk
                        break
            if tag_pos == 0:
                # Search backward for the (attribute-free) opening tag.
                tag_pos -= 1
                while st.html_out[tag_pos] != tag:
                    tag_pos -= 1
                st.html_out[tag_pos] = tag[:-1] + ' id="' + txt + '">'
                st.html_out.append('<a href="#' + txt + '" class="tgt"></a>')
                tag_pos -= 1 # take into account the append
            else:
                st.html_out[tag_pos] = '<span id="' + txt + '"></span>' + st.html_out[tag_pos]
            st.created_hashtags.add(txt)
        st.latest_targets = targets

    def output_debug(self, event, extra):
        """Print a parsing event and the current state (for --debug).

        With a single --debug the html_out and man_out lists are shown in an
        abbreviated form (the last 2 items) via a shallow copy of the state.
        """
        import pprint
        st = self.state
        if args.debug < 2:
            st = argparse.Namespace(**vars(st))
            if len(st.html_out) > 2:
                st.html_out = ['...'] + st.html_out[-2:]
            if len(st.man_out) > 2:
                st.man_out = ['...'] + st.man_out[-2:]
        print(event, extra)
        pprint.PrettyPrinter(indent=2).pprint(vars(st))
597 | ||
598 | ||
def txt2target(txt, opt_prefix):
    """Return the link-target name derived from some (marked-up) text.

    The text is trimmed (including trailing colons), reduced to its first
    bold chunk if it has one, and cleaned of chars that are not valid in a
    target.  A result that starts with a dash gets opt_prefix put in front
    (when opt_prefix is non-empty); otherwise an invalid first char gets a
    't' put in front of it.
    """
    target = txt.strip().rstrip(':')
    code_match = CODE_BLOCK_RE.search(target)
    if code_match is not None:
        target = code_match.group(1)
    cleanups = (
        (NBR_DASH_RE, '-'),
        (BIN_CHARS_RE, ''),
        (INVALID_TARGET_CHARS_RE, '_'),
    )
    for pattern, replacement in cleanups:
        target = pattern.sub(replacement, target)
    if opt_prefix and target.startswith('-'):
        return opt_prefix + target
    return INVALID_START_CHAR_RE.sub(r't\1', target)
612 | ||
613 | ||
def manify(txt):
    """Return txt converted into its nroff form.

    Backslashes are doubled, each marker char is changed into its nroff
    sequence, and any line starting with ' or . gets a \\& put in front.
    """
    man = txt.replace('\\', '\\\\')
    for marker, nroff_seq in (NBR_SPACE, NBR_DASH, NORM_FONT, BOLD_FONT, UNDR_FONT):
        man = man.replace(marker, nroff_seq)
    return MANIFY_LINESTART_RE.sub(r'\&\1', man)
53fae556 WD |
621 | |
622 | ||
def htmlify(txt):
    """Return txt with the html-special chars & < > " changed into entities.

    The ampersand is handled first so that the entities added for the other
    chars are not escaped a second time.
    """
    return txt.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
53fae556 WD |
625 | |
626 | ||
def warn(*msg):
    """Print a message on stderr and bump the global warning count."""
    global warning_count
    print(*msg, file=sys.stderr)
    warning_count = warning_count + 1
53fae556 WD |
631 | |
632 | ||
def die(*msg):
    """Output a message via warn() and then exit with status 1."""
    warn(*msg)
    raise SystemExit(1)
636 | ||
637 | ||
638 | if __name__ == '__main__': | |
0d8cc260 | 639 | parser = argparse.ArgumentParser(description="Convert markdown into html and (optionally) nroff. Each input filename must have a .md suffix, which is changed to .html for the output filename. If the input filename ends with .num.md (e.g. foo.1.md) then a nroff file is also output with the input filename's .md suffix removed (e.g. foo.1).", add_help=False) |
a2b630c0 | 640 | parser.add_argument('--test', action='store_true', help="Just test the parsing without outputting any files.") |
0d8cc260 | 641 | parser.add_argument('--dest', metavar='DIR', help="Create files in DIR instead of the current directory.") |
6ae7f408 | 642 | parser.add_argument('--force-link-text', action='store_true', help="Don't remove the link text if it matches the link href. Useful when nroff doesn't understand .UR and .UE.") |
ae82762c | 643 | parser.add_argument('--debug', '-D', action='count', default=0, help='Output copious info on the html parsing. Repeat for even more.') |
53fae556 | 644 | parser.add_argument("--help", "-h", action="help", help="Output this help message and exit.") |
0d8cc260 | 645 | parser.add_argument("mdfiles", metavar='FILE.md', nargs='+', help="One or more .md files to convert.") |
53fae556 WD |
646 | args = parser.parse_args() |
647 | ||
648 | try: | |
649 | import cmarkgfm | |
491ddb08 | 650 | md_parser = cmarkgfm.markdown_to_html |
a2b630c0 | 651 | gfm_parser = cmarkgfm.github_flavored_markdown_to_html |
53fae556 | 652 | except: |
03fc62ad WD |
653 | try: |
654 | import commonmark | |
655 | md_parser = html_via_commonmark | |
656 | except: | |
657 | die("Failed to find cmarkgfm or commonmark for python3.") | |
a2b630c0 | 658 | gfm_parser = None |
53fae556 WD |
659 | |
660 | main() | |
995ce719 WD |
661 | if warning_count: |
662 | sys.exit(1) |