docs: Add HTML reference checker

author Peter Krempa <pkrempa@redhat.com>

Tue, 31 May 2022 13:15:57 +0000 (15:15 +0200)

committer Peter Krempa <pkrempa@redhat.com>

Wed, 1 Jun 2022 10:27:10 +0000 (12:27 +0200)
author Peter Krempa <pkrempa@redhat.com>
Tue, 31 May 2022 13:15:57 +0000 (15:15 +0200)
committer Peter Krempa <pkrempa@redhat.com>
Wed, 1 Jun 2022 10:27:10 +0000 (12:27 +0200)
diff --git a/docs/meson.build b/docs/meson.build

index d71f6006dd662c6279025cddef10a32d2a0a6063..cb70ef608496459ecb3c28ce293306ac57925596 100644 (file)
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -350,3 +350,14 @@ run_target(
    ],
    depends: install_web_deps,
  )
+
+# Register the 'check-html-references' test: python3_prog runs the script
+# behind check_html_references_prog with '--prefix' set to the 'docs'
+# directory of the build root.
+test(
+  'check-html-references',
+  python3_prog,
+  args: [
+    check_html_references_prog.path(),
+    '--prefix',
+    meson.build_root() / 'docs'
+  ],
+  env: runutf8,
+)
diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py

new file mode 100755 (executable)

index 0000000..95a61a6
--- /dev/null
+++ b/scripts/check-html-references.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library.  If not, see
+# <http://www.gnu.org/licenses/>.
+#
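+# Check that references from one documentation HTML file to another (links to
+# other files and to anchors inside them) are not broken. With --external the
+# links pointing outside of the documentation are printed instead.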
+
+import sys
+import os
+import argparse
+import re
+import xml.etree.ElementTree as ET
+
+ns = {'html': 'http://www.w3.org/1999/xhtml'}
+externallinks = []
+
+
+def get_file_list(prefix):
+    """Collect the HTML files found below 'prefix'.
+
+    Walks 'prefix' recursively and returns a list of
+    (fullfilename, relfilename) tuples, one for each file whose name ends
+    in '.html'. 'fullfilename' is the path as reported by os.walk();
+    'relfilename' is the same path with the parent directory of 'prefix'
+    stripped from its front. Files with '404.html' in their name are
+    skipped.
+    """
+    filelist = []
+
+    # Invariant for the whole walk: the parent directory of 'prefix' is what
+    # gets stripped from every directory name to form the relative name.
+    prefixbase = os.path.dirname(prefix)
+
+    for root, _dirs, files in os.walk(prefix):
+        if root.startswith(prefixbase):
+            relroot = root[len(prefixbase):]
+        else:
+            relroot = root
+
+        for name in files:
+            if not name.endswith('.html'):
+                continue
+
+            # the 404 page doesn't play well
+            if '404.html' in name:
+                continue
+
+            filelist.append((os.path.join(root, name),
+                             os.path.join(relroot, name)))
+
+    return filelist
+
+
+def process_file(filetuple):
+    """Parse one XHTML file and gather its anchors and local link targets.
+
+    'filetuple' is a (filename, relfilename) pair. Returns a tuple
+    (anchors, targets): 'anchors' lists 'relfilename' itself followed by
+    'relfilename#id' for every <a>, <div class='section'> and <section>
+    element carrying an id; 'targets' lists a
+    (normalized target, filename, original href) tuple for every link to
+    another local file. Links containing '://' are appended to the global
+    'externallinks' list instead.
+    """
+    filename, relfilename = filetuple
+    root = ET.parse(filename).getroot()
+    reldir = os.path.dirname(relfilename)
+
+    # the file itself is always a valid link target
+    anchors = [relfilename]
+    targets = []
+
+    def add_anchors(xpath):
+        # record every element matched by 'xpath' that has a non-empty id
+        for elem in root.findall(xpath, ns):
+            ident = elem.get('id')
+            if ident:
+                anchors.append(relfilename + '#' + ident)
+
+    for link in root.findall('.//html:a', ns):
+        href = link.get('href')
+        ident = link.get('id')
+
+        if ident:
+            anchors.append(relfilename + '#' + ident)
+
+        if not href:
+            continue
+
+        if '://' in href:
+            externallinks.append(href)
+            continue
+
+        # references within the same file and e-mail links are not checked
+        if href.startswith('#') or 'mailto:' in href:
+            continue
+
+        # resolve the link relative to the directory of the linking file
+        resolved = os.path.normpath(os.path.join(reldir, href))
+        targets.append((resolved, filename, href))
+
+    # older docutils generate "<div class='section'"
+    add_anchors('.//html:div/[@class=\'section\']')
+
+    # modern docutils generate a <section element
+    add_anchors('.//html:section')
+
+    return (anchors, targets)
+
+
+def process_all(filelist):
+    """Run process_file() on every entry of 'filelist' and merge the results.
+
+    'filelist' is an iterable of (filename, relfilename) tuples. Returns a
+    (targets, anchors) tuple holding the concatenated per-file lists. Note
+    that the order is reversed compared to what process_file() returns.
+    """
+    anchors = []
+    targets = []
+
+    for filetuple in filelist:
+        fileanchors, filetargets = process_file(filetuple)
+
+        # Extend in place: building new lists with '+' for every file would
+        # copy everything collected so far each time.
+        anchors.extend(fileanchors)
+        targets.extend(filetargets)
+
+    return (targets, anchors)
+
+
+def check_targets(targets, anchors):
+    """Report link targets that do not correspond to any known anchor.
+
+    'targets' is an iterable of (target, targetfrom, targetorig) tuples and
+    'anchors' an iterable of anchor strings. For every 'target' missing
+    from 'anchors' a line naming 'targetfrom' and 'targetorig' is printed,
+    sorted, after a 'broken link targets:' header.
+
+    Returns True if at least one broken target was found, False otherwise.
+    """
+    # Look anchors up in a set; scanning a list for every single target
+    # makes the check quadratic.
+    known = set(anchors)
+
+    errors = sorted((targetfrom, targetorig)
+                    for target, targetfrom, targetorig in targets
+                    if target not in known)
+
+    if not errors:
+        return False
+
+    print('broken link targets:')
+
+    for file, target in errors:
+        print(file + " broken link: " + target)
+
+    return True
+
+
+# Command line: '--prefix' selects the tree to scan, '--external' switches
+# from checking local references to listing the external links.
+parser = argparse.ArgumentParser(description='HTML reference checker')
+parser.add_argument(
+    '--prefix', default='.', help='build tree prefix')
+parser.add_argument(
+    '--external', action="store_true",
+    help='print external references instead')
+
+args = parser.parse_args()
+
+files = get_file_list(args.prefix)
+
+# processing the files also fills 'externallinks' as a side effect
+targets, anchors = process_all(files)
+
+if not args.external:
+    # exit status 1 signals that at least one broken reference was found
+    sys.exit(1 if check_targets(targets, anchors) else 0)
+
+# print every external link once, in sorted order
+externallinks.sort()
+for ext in dict.fromkeys(externallinks):
+    print(ext)
diff --git a/scripts/meson.build b/scripts/meson.build

index 421e3d2acd095cbcffcfc4caea234b3c5cd56667..05b71184f117247198305791f100854897c0f93a 100644 (file)
--- a/scripts/meson.build
+++ b/scripts/meson.build
@@ -6,6 +6,7 @@ scripts = [
    'check-driverimpls.py',
    'check-drivername.py',
    'check-file-access.py',
+  'check-html-references.py',
    'check-remote-protocol.py',
    'check-symfile.py',
    'check-symsorting.py',
author	Peter Krempa <pkrempa@redhat.com>
	Tue, 31 May 2022 13:15:57 +0000 (15:15 +0200)
committer	Peter Krempa <pkrempa@redhat.com>
	Wed, 1 Jun 2022 10:27:10 +0000 (12:27 +0200)
docs/meson.build		patch \| blob \| blame \| history
scripts/check-html-references.py	[new file with mode: 0755]	patch \| blob
scripts/meson.build		patch \| blob \| blame \| history