git.ipfire.org Git - thirdparty/google/fonts.git/commitdiff
run black 10541/head
author Emma Marichal <emma.marichal@hotmail.fr>
Fri, 22 May 2026 08:16:01 +0000 (10:16 +0200)
committer Emma Marichal <emma.marichal@hotmail.fr>
Fri, 22 May 2026 08:16:01 +0000 (10:16 +0200)
.github/workflows/knowledge_graph.py

index 34b4717e98cddf4c11af12a3a3eb81b774f71e10..8c93910bf8f2ad8456d43dae5cf36ff0a0e98a10 100644 (file)
@@ -5,12 +5,22 @@ from absl import flags
 from gftools import knowledge_pb2
 from google.protobuf import text_format
 import itertools
-import mistune # markdown => ast
+import mistune  # markdown => ast
 from xml.dom import minidom
 from pathlib import Path
 import re
 import sys
-from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
+from typing import (
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    NamedTuple,
+    Optional,
+    Tuple,
+    Set,
+    Union,
+)
 import requests
 from functools import lru_cache
 from urllib.parse import urlparse
@@ -18,20 +28,25 @@ from urllib.parse import urlparse
 MAX_RASTER_IMAGE_SIZE_KB = 800
 MAX_VECTOR_IMAGE_SIZE_KB = 1750
 
+
 def _topic_target_to_path(_: Set[str], target: str) -> str:
     # TODO sanity check if this is the only valid update
     return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
 
+
 def _module_target_to_path(_: Set[str], target: str) -> str:
     return Path(target.replace("/module/", "modules/")) / "module.textproto"
 
+
 def _content_md(path: str) -> Path:
     return Path(path) / "content.md"
 
+
 def _glossary_target_to_path(_: Set[str], target: str) -> str:
     # TODO sanity check if this is the only valid update
     return _content_md(target.replace("/glossary/", "glossary/terms/"))
 
+
 def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str:
     # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
     parts = target[1:].split("/")
@@ -46,15 +61,17 @@ def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str:
     else:
         return _content_md(target)
 
+
 def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> str:
     return _content_md(names.get(target, target))
 
+
 _LINK_TO_PATH = [
     (re.compile("^/glossary/"), _glossary_target_to_path),
     (re.compile("^/topic/"), _topic_target_to_path),
     (re.compile("^/lesson/"), _lesson_target_to_path),
     (re.compile("^/module/"), _module_target_to_path),
-    (re.compile("[^/]+"), _any_unique_name_to_path)
+    (re.compile("[^/]+"), _any_unique_name_to_path),
 ]
 
 FLAGS = flags.FLAGS
@@ -63,6 +80,7 @@ flags.DEFINE_bool("check_outbound_links", False, "Check outbound urls")
 
 MdValue = Union[Mapping[str, "MdValue"]]
 
+
 class KnowledgeContent(NamedTuple):
     repo_root: Path
     knowledge_dir: Path
@@ -71,16 +89,27 @@ class KnowledgeContent(NamedTuple):
     unambiguous_names: Mapping[str, Path]
 
     def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
-        return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"
+        return (
+            self.knowledge_dir
+            / "modules"
+            / name.lower().replace(" ", "_")
+            / "module.textproto"
+        )
 
     def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
-        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)
+        return self.knowledge_dir / _link_target_to_path(
+            self.unambiguous_names, "/lesson/" + target
+        )
 
     def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
-        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)
+        return self.knowledge_dir / _link_target_to_path(
+            self.unambiguous_names, "/glossary/" + target
+        )
 
     def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
-        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)
+        return self.knowledge_dir / _link_target_to_path(
+            self.unambiguous_names, "/topic/" + target
+        )
 
     def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
         return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)
@@ -99,19 +128,30 @@ class KnowledgeContent(NamedTuple):
             else:
                 pass
         unambiguous_names = {}
-        for name, entries in itertools.groupby(sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name):
+        for name, entries in itertools.groupby(
+            sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name
+        ):
             entries = list(entries)
             if len(entries) != 1:
                 print(name, "is ambiguous")
                 continue
             unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)
-        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
+        return cls(
+            repo_root,
+            knowledge_dir,
+            tuple(md_files),
+            tuple(textproto_files),
+            unambiguous_names,
+        )
 
 
 def _markdown_ast(md_file: Path) -> List[MdValue]:
-    return mistune.create_markdown(renderer='ast')(md_file.read_text())
+    return mistune.create_markdown(renderer="ast")(md_file.read_text())
 
-def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
+
+def _ast_iter(
+    root: List[MdValue], filter_fn: Callable[[MdValue], bool]
+) -> Iterable[MdValue]:
     frontier = list(root)
     while frontier:
         current = frontier.pop(0)
@@ -122,19 +162,24 @@ def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iter
             if isinstance(entry, list):
                 frontier.extend(entry)
 
+
 def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
     for matcher, link_to_path_fn in _LINK_TO_PATH:
         if matcher.search(target):
             return link_to_path_fn(names, target)
     raise ValueError(f"Unrecognized target {target}")
 
+
 def _safe_relative_to(parent: Path, child: Path) -> Path:
     try:
         return child.relative_to(parent)
     except ValueError:
         return child
 
-def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
+
+def _maybe_print_check(
+    result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]
+) -> bool:
     if FLAGS.print_valid or not result:
         message = "valid "
         if not result:
@@ -142,42 +187,50 @@ def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str,
         suffix = ""
         if target is not None:
             suffix = " => " + str(_safe_relative_to(repo_root, target))
-        print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
+        print(message, _safe_relative_to(repo_root, referrer), f'"{ref}"{suffix}')
     return result
 
-def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
+
+def _check_file_present(
+    repo_root: Path, referrer: Path, ref: str, target: Path
+) -> bool:
     return _maybe_print_check(target.is_file(), repo_root, referrer, ref, target)
 
-def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
+
+def _check_contributor(
+    repo_root: Path, referrer: Path, ref: str, contributors: Set[str]
+) -> bool:
     return _maybe_print_check(ref in contributors, repo_root, referrer, ref, None)
 
-_MD_LINK    = re.compile(r'(?<!!)\[[^\]]+\]\([^)]+\)')
-_MD_BOLD    = re.compile(r'\*\*[^*\s][^*\n]*\*\*')
-_MD_ITALIC  = re.compile(r'(?<!\*)\*(?!\*)[^*\n]+\*(?!\*)')
-_MD_CODE    = re.compile(r'`[^`\n]+`')
-_MD_HEADING = re.compile(r'(?m)^#{1,6}\s')
+
+_MD_LINK = re.compile(r"(?<!!)\[[^\]]+\]\([^)]+\)")
+_MD_BOLD = re.compile(r"\*\*[^*\s][^*\n]*\*\*")
+_MD_ITALIC = re.compile(r"(?<!\*)\*(?!\*)[^*\n]+\*(?!\*)")
+_MD_CODE = re.compile(r"`[^`\n]+`")
+_MD_HEADING = re.compile(r"(?m)^#{1,6}\s")
 
 _MD_PATTERNS = [
-    (_MD_LINK,    "markdown link [text](url)"),
-    (_MD_BOLD,    "markdown bold **text**"),
-    (_MD_ITALIC,  "markdown italic *text*"),
-    (_MD_CODE,    "markdown code `backtick`"),
+    (_MD_LINK, "markdown link [text](url)"),
+    (_MD_BOLD, "markdown bold **text**"),
+    (_MD_ITALIC, "markdown italic *text*"),
+    (_MD_CODE, "markdown code `backtick`"),
     (_MD_HEADING, "markdown heading #"),
 ]
 
 # Single-line inline tags: <figcaption>...</figcaption> all on one line
 _INLINE_TAG_RE = re.compile(
-    r'<(figcaption|p|li|dt|dd|td|th)[^>]*>[^\n]+</\1>',
+    r"<(figcaption|p|li|dt|dd|td|th)[^>]*>[^\n]+</\1>",
     re.IGNORECASE,
 )
 
 # Block tags where content immediately follows the opening tag (no blank line)
 # e.g. <figure>\n*"italic caption"*\n</figure>
 _ADJACENT_BLOCK_RE = re.compile(
-    r'<(figure|aside|div|section|blockquote)[^>]*>\n([^\n].*?)\n</\1>',
+    r"<(figure|aside|div|section|blockquote)[^>]*>\n([^\n].*?)\n</\1>",
     re.DOTALL | re.IGNORECASE,
 )
 
+
 def _check_markdown_in_html(repo_root: Path, md_file: Path, content: str) -> bool:
     result = True
     rel = _safe_relative_to(repo_root, md_file)
@@ -198,11 +251,13 @@ def _check_markdown_in_html(repo_root: Path, md_file: Path, content: str) -> boo
         if not inner.strip():
             continue
         # Markdown images ![alt](url) are expected and render correctly — skip them
-        inner_no_images = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', inner)
+        inner_no_images = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", inner)
         for pattern, description in _MD_PATTERNS:
             if pattern.search(inner_no_images):
-                line_num = content[:match.start()].count('\n') + 1
-                print(f"INVALID  {rel}:{line_num}: {description} adjacent to HTML block (no blank line):")
+                line_num = content[: match.start()].count("\n") + 1
+                print(
+                    f"INVALID  {rel}:{line_num}: {description} adjacent to HTML block (no blank line):"
+                )
                 print(f"  {match.group(0)[:200].strip()!r}")
                 result = False
                 break
@@ -214,13 +269,22 @@ def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue])
     for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
         text = el.get("text", "")
         if re.search(' id="[^"]+"', text):
-            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
+            print(
+                "INVALID ",
+                _safe_relative_to(repo_root, md_file),
+                "attr.id not allowed:",
+                text,
+            )
             return False
 
     content = md_file.read_text()
 
-    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
-        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
+    if re.search("</figcaption>(?!.*</figure>)", content, re.MULTILINE | re.DOTALL):
+        print(
+            "INVALID ",
+            _safe_relative_to(repo_root, md_file),
+            "Cannot have a <figcaption> outside of a <figure>",
+        )
         return False
 
     if not _check_markdown_in_html(repo_root, md_file, content):
@@ -232,47 +296,53 @@ def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue])
 @lru_cache()
 def _check_outbound_link(url: str):
     # Following urls work correctly on a web browser but raise a 400 code when using python requests
-    whitelist = frozenset([
-        'circuitousroot.com',
-        'codepen.io',
-        'colourblindawareness.org',
-        'cortezlawfirmpllc.com',
-        'doi.org',
-        'figma.com',
-        'freepik.com',
-        'gigapress.net',
-        'help.figma.com',
-        'kupferschrift.de',
-        'languagegeek.com',
-        'layoutgridcalculator.com',
-        'medium.com',
-        'medium.engineering',
-        'nedwin.medium.com',
-        'nytimes.com',
-        'paulshawletterdesign.com',
-        'psycnet.apa.org',
-        'researchgate.net',
-        'sciencedirect.com',
-        'support.google.com',
-        'twitter.com',
-        'typetura.com',
-        'webmd.com',
-        "jessicahische.is",
-        "type.method.ac",
-        "dev.epicgames.com", # Returns a 403 response when using requests
-    ])
+    whitelist = frozenset(
+        [
+            "circuitousroot.com",
+            "codepen.io",
+            "colourblindawareness.org",
+            "cortezlawfirmpllc.com",
+            "doi.org",
+            "figma.com",
+            "freepik.com",
+            "gigapress.net",
+            "help.figma.com",
+            "kupferschrift.de",
+            "languagegeek.com",
+            "layoutgridcalculator.com",
+            "medium.com",
+            "medium.engineering",
+            "nedwin.medium.com",
+            "nytimes.com",
+            "paulshawletterdesign.com",
+            "psycnet.apa.org",
+            "researchgate.net",
+            "sciencedirect.com",
+            "support.google.com",
+            "twitter.com",
+            "typetura.com",
+            "webmd.com",
+            "jessicahische.is",
+            "type.method.ac",
+            "dev.epicgames.com",  # Returns a 403 response when using requests
+        ]
+    )
     # Following urls will be fixed at a later date. If the CI is failing and a suitable
     # replacement url cannot be found, please add them to this set.
-    to_fix = frozenset([
-        # bad SSL cert
-        "clagnut.com",
-        "xinreality.com"
-    ])
+    to_fix = frozenset(
+        [
+            # bad SSL cert
+            "clagnut.com",
+            "xinreality.com",
+        ]
+    )
     if urlparse(url).netloc.replace("www.", "") in whitelist | to_fix:
         return True
     response = requests.head(url, allow_redirects=True, timeout=30)
     if not response.ok:
-        print(f"INVALID url {url}' returned response status code '{response.status_code}'")
+        print(
+            f"INVALID url {url}' returned response status code '{response.status_code}'"
+        )
     return response.ok
 
 
@@ -291,13 +361,18 @@ def _check_md_files(knowledge: KnowledgeContent) -> bool:
             if "(" in target:
                 target += ")"
             if not target:
-                continue # TODO: are empty links bad
+                continue  # TODO: are empty links bad
             if re.search("^http(s)?://", target.lower()):
                 if FLAGS.check_outbound_links:
                     result = _check_outbound_link(target) and result
             else:
                 target_path = knowledge.link_target_to_path(target)
-                result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result
+                result = (
+                    _check_file_present(
+                        knowledge.repo_root, md_file, target, target_path
+                    )
+                    and result
+                )
 
     return result
 
@@ -307,40 +382,81 @@ def _check_proto_files(knowledge: KnowledgeContent) -> bool:
     # The set of valid contributors is useful in upcoming validations
     contributors_file = knowledge.knowledge_dir / "contributors.textproto"
     assert contributors_file.is_file(), contributors_file
-    contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}
+    contributors = {
+        c.name
+        for c in text_format.Parse(
+            contributors_file.read_text(), knowledge_pb2.ContributorsProto()
+        ).contributors
+    }
 
     result = True
     for textproto_file in knowledge.textproto_files:
         expected_files = set()
         if textproto_file.stem == "contributors":
-            pass # handled above
+            pass  # handled above
         elif textproto_file.stem == "knowledge":
-            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
-            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
+            proto = text_format.Parse(
+                textproto_file.read_text(), knowledge_pb2.KnowledgeProto()
+            )
+            expected_files |= {
+                (m, knowledge.module_name_to_path(m)) for m in proto.modules
+            }
         elif textproto_file.stem == "term":
-            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
-            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
+            proto = text_format.Parse(
+                textproto_file.read_text(), knowledge_pb2.TermProto()
+            )
+            expected_files |= {
+                (n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons
+            }
         elif textproto_file.stem == "lesson":
-            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
+            proto = text_format.Parse(
+                textproto_file.read_text(), knowledge_pb2.LessonProto()
+            )
             for author in set(proto.authors) | set(proto.reviewers):
-                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
-            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
-            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
-            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
-            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
+                result = (
+                    _check_contributor(
+                        knowledge.repo_root, textproto_file, author, contributors
+                    )
+                    and result
+                )
+            expected_files |= {
+                (n, knowledge.topic_target_to_path(n)) for n in proto.topics
+            }
+            expected_files |= {
+                (n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons
+            }
+            expected_files |= {
+                (n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons
+            }
+            expected_files |= {
+                (n, knowledge.term_target_to_path(n)) for n in proto.related_terms
+            }
             # thumbnail is mandatory
-            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
+            expected_files.add(
+                ("thumbnail", textproto_file.parent / "images" / "thumbnail.svg")
+            )
         elif textproto_file.stem == "module":
-            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
-            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
+            proto = text_format.Parse(
+                textproto_file.read_text(), knowledge_pb2.ModuleProto()
+            )
+            expected_files |= {
+                (n, knowledge.lesson_target_to_path(n)) for n in proto.lessons
+            }
         elif textproto_file.stem == "topic":
             # The Topic parses. And that's enough.
             text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
         else:
-            raise ValueError("No handler for " + textproto_file.relative_to(knowledge.repo_root))
+            raise ValueError(
+                "No handler for " + textproto_file.relative_to(knowledge.repo_root)
+            )
 
         for ref, expected_file in expected_files:
-            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
+            result = (
+                _check_file_present(
+                    knowledge.repo_root, textproto_file, ref, expected_file
+                )
+                and result
+            )
 
     return result
 
@@ -348,6 +464,7 @@ def _check_proto_files(knowledge: KnowledgeContent) -> bool:
 def _is_svg(image_file: Path) -> bool:
     return image_file.suffix == ".svg"
 
+
 def _check_image_files(knowledge: KnowledgeContent) -> bool:
     result = True
     image_files = list(knowledge.knowledge_dir.glob("**/images/*"))
@@ -355,24 +472,43 @@ def _check_image_files(knowledge: KnowledgeContent) -> bool:
         st_size = image_file.stat().st_size
         if _is_svg(image_file):
             if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
-                print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
+                print(
+                    "File exceeds max size of %s KB (%s KB):"
+                    % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024),
+                    image_file.relative_to(knowledge.knowledge_dir),
+                )
                 result = False
             root = minidom.parseString(image_file.read_text()).documentElement
             if root.tagName != "svg":
-                print("Root element must be <svg>:", image_file.relative_to(knowledge.repo_root))
+                print(
+                    "Root element must be <svg>:",
+                    image_file.relative_to(knowledge.repo_root),
+                )
                 result = False
             has_view_box = "viewBox" in root.attributes
-            has_width_and_height = "width" in root.attributes and "height" in root.attributes
+            has_width_and_height = (
+                "width" in root.attributes and "height" in root.attributes
+            )
             if not has_view_box and not has_width_and_height:
-                print("Must specify viewBox and/or width+height on <svg>:", image_file.relative_to(knowledge.knowledge_dir))
+                print(
+                    "Must specify viewBox and/or width+height on <svg>:",
+                    image_file.relative_to(knowledge.knowledge_dir),
+                )
                 result = False
             for stopEl in root.getElementsByTagName("stop"):
                 if "offset" not in stopEl.attributes:
-                    print("Must specify offset on <stop>:", image_file.relative_to(knowledge.knowledge_dir))
+                    print(
+                        "Must specify offset on <stop>:",
+                        image_file.relative_to(knowledge.knowledge_dir),
+                    )
                     result = False
         else:
             if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
-                print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
+                print(
+                    "File exceeds max size of %s KB (%s KB):"
+                    % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024),
+                    image_file.relative_to(knowledge.knowledge_dir),
+                )
                 result = False
     return result
 
@@ -380,9 +516,11 @@ def _check_image_files(knowledge: KnowledgeContent) -> bool:
 def main(_):
     knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
     return_code = 1
-    if (_check_md_files(knowledge)
-            and _check_proto_files(knowledge)
-            and _check_image_files(knowledge)):
+    if (
+        _check_md_files(knowledge)
+        and _check_proto_files(knowledge)
+        and _check_image_files(knowledge)
+    ):
         return_code = 0
     sys.exit(return_code)