from gftools import knowledge_pb2
from google.protobuf import text_format
import itertools
-import mistune # markdown => ast
+import mistune # markdown => ast
from xml.dom import minidom
from pathlib import Path
import re
import sys
-from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
+from typing import (
+ Callable,
+ Iterable,
+ List,
+ Mapping,
+ NamedTuple,
+ Optional,
+ Tuple,
+ Set,
+ Union,
+)
import requests
from functools import lru_cache
from urllib.parse import urlparse
# Size limit, in KB, applied to every non-SVG file found under an images/
# directory (compared against the file size in bytes after multiplying by 1024).
MAX_RASTER_IMAGE_SIZE_KB = 800
# Size limit, in KB, applied to files whose suffix is ".svg"
# (compared against the file size in bytes after multiplying by 1024).
MAX_VECTOR_IMAGE_SIZE_KB = 1750
+
def _topic_target_to_path(_: Set[str], target: str) -> str:
# TODO sanity check if this is the only valid update
return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
+
def _module_target_to_path(_: Set[str], target: str) -> str:
return Path(target.replace("/module/", "modules/")) / "module.textproto"
+
def _content_md(path: str) -> Path:
return Path(path) / "content.md"
+
def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/...`` link target to the term's ``content.md`` path.

    Args:
        _: The unambiguous-name table. Unused; accepted only so this function
            has the same ``(names, target)`` call shape as the other handlers
            registered in ``_LINK_TO_PATH``.
        target: Link target, e.g. ``/glossary/kerning``.

    Returns:
        The ``content.md`` path inside ``target`` after ``/glossary/`` has been
        rewritten to ``glossary/terms/``.
    """
    # TODO sanity check if this is the only valid update
    return _content_md(target.replace("/glossary/", "glossary/terms/"))
+
def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str:
# /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
parts = target[1:].split("/")
else:
return _content_md(target)
+
def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name through ``names`` and return its ``content.md`` path.

    Args:
        names: Lookup table from a name to a directory path.
        target: The link target to resolve.

    Returns:
        ``content.md`` inside ``names[target]`` when ``target`` is a key of
        ``names``; otherwise ``content.md`` inside ``target`` itself.
    """
    return _content_md(names.get(target, target))
+
# Ordered dispatch table of (compiled pattern, handler) pairs. The lookup in
# _link_target_to_path walks this list in order and uses the first entry whose
# pattern .search() matches, so the prefix-specific entries must stay ahead of
# the final, unanchored one.
_LINK_TO_PATH = [
    (re.compile(pattern), handler)
    for pattern, handler in (
        ("^/glossary/", _glossary_target_to_path),
        ("^/topic/", _topic_target_to_path),
        ("^/lesson/", _lesson_target_to_path),
        ("^/module/", _module_target_to_path),
        ("[^/]+", _any_unique_name_to_path),
    )
]
# Parsed command-line flags. This module reads FLAGS.print_valid and
# FLAGS.check_outbound_links.
# NOTE(review): `flags` is not imported in the visible part of this file;
# presumably `from absl import flags` — confirm.
FLAGS = flags.FLAGS
# Recursive alias used to annotate markdown AST values: a mapping from string
# keys to further MdValue entries. A Union with a single member is equivalent
# to that member.
MdValue = Union[Mapping[str, "MdValue"]]
+
class KnowledgeContent(NamedTuple):
repo_root: Path
knowledge_dir: Path
unambiguous_names: Mapping[str, Path]
def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
- return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"
+ return (
+ self.knowledge_dir
+ / "modules"
+ / name.lower().replace(" ", "_")
+ / "module.textproto"
+ )
def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
- return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)
+ return self.knowledge_dir / _link_target_to_path(
+ self.unambiguous_names, "/lesson/" + target
+ )
def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
- return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)
+ return self.knowledge_dir / _link_target_to_path(
+ self.unambiguous_names, "/glossary/" + target
+ )
def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
- return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)
+ return self.knowledge_dir / _link_target_to_path(
+ self.unambiguous_names, "/topic/" + target
+ )
def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)
else:
pass
unambiguous_names = {}
- for name, entries in itertools.groupby(sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name):
+ for name, entries in itertools.groupby(
+ sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name
+ ):
entries = list(entries)
if len(entries) != 1:
print(name, "is ambiguous")
continue
unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)
- return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
+ return cls(
+ repo_root,
+ knowledge_dir,
+ tuple(md_files),
+ tuple(textproto_files),
+ unambiguous_names,
+ )
def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Parse ``md_file`` with mistune's ``ast`` renderer and return the result.

    Args:
        md_file: Path of the markdown file to parse.

    Returns:
        Whatever the mistune markdown callable built with ``renderer="ast"``
        returns for the file's text.
    """
    parse = mistune.create_markdown(renderer="ast")
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding of the machine running the check.
    return parse(md_file.read_text(encoding="utf-8"))
-def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
+
+def _ast_iter(
+ root: List[MdValue], filter_fn: Callable[[MdValue], bool]
+) -> Iterable[MdValue]:
frontier = list(root)
while frontier:
current = frontier.pop(0)
if isinstance(entry, list):
frontier.extend(entry)
+
def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
    """Convert a link target to a path using the first matching handler.

    Walks ``_LINK_TO_PATH`` in order and delegates to the handler paired with
    the first pattern that ``.search()``-matches ``target``.

    Raises:
        ValueError: If no pattern in ``_LINK_TO_PATH`` matches ``target``.
    """
    handler = next(
        (fn for pattern, fn in _LINK_TO_PATH if pattern.search(target)),
        None,
    )
    if handler is None:
        raise ValueError(f"Unrecognized target {target}")
    return handler(names, target)
+
def _safe_relative_to(parent: Path, child: Path) -> Path:
try:
return child.relative_to(parent)
except ValueError:
return child
-def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
+
+def _maybe_print_check(
+ result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]
+) -> bool:
if FLAGS.print_valid or not result:
message = "valid "
if not result:
suffix = ""
if target is not None:
suffix = " => " + str(_safe_relative_to(repo_root, target))
- print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
+ print(message, _safe_relative_to(repo_root, referrer), f'"{ref}"{suffix}')
return result
def _check_file_present(
    repo_root: Path, referrer: Path, ref: str, target: Path
) -> bool:
    """Check that ``target`` is an existing regular file and report the outcome.

    The outcome is passed through ``_maybe_print_check`` together with the
    referencing file and the raw reference, and its return value is returned.
    """
    exists = target.is_file()
    return _maybe_print_check(exists, repo_root, referrer, ref, target)
def _check_contributor(
    repo_root: Path, referrer: Path, ref: str, contributors: Set[str]
) -> bool:
    """Check that ``ref`` is a member of ``contributors`` and report the outcome.

    The outcome is passed through ``_maybe_print_check`` with no target path,
    and its return value is returned.
    """
    is_known = ref in contributors
    return _maybe_print_check(is_known, repo_root, referrer, ref, None)
-_MD_LINK = re.compile(r'(?<!!)\[[^\]]+\]\([^)]+\)')
-_MD_BOLD = re.compile(r'\*\*[^*\s][^*\n]*\*\*')
-_MD_ITALIC = re.compile(r'(?<!\*)\*(?!\*)[^*\n]+\*(?!\*)')
-_MD_CODE = re.compile(r'`[^`\n]+`')
-_MD_HEADING = re.compile(r'(?m)^#{1,6}\s')
+
+_MD_LINK = re.compile(r"(?<!!)\[[^\]]+\]\([^)]+\)")
+_MD_BOLD = re.compile(r"\*\*[^*\s][^*\n]*\*\*")
+_MD_ITALIC = re.compile(r"(?<!\*)\*(?!\*)[^*\n]+\*(?!\*)")
+_MD_CODE = re.compile(r"`[^`\n]+`")
+_MD_HEADING = re.compile(r"(?m)^#{1,6}\s")
_MD_PATTERNS = [
- (_MD_LINK, "markdown link [text](url)"),
- (_MD_BOLD, "markdown bold **text**"),
- (_MD_ITALIC, "markdown italic *text*"),
- (_MD_CODE, "markdown code `backtick`"),
+ (_MD_LINK, "markdown link [text](url)"),
+ (_MD_BOLD, "markdown bold **text**"),
+ (_MD_ITALIC, "markdown italic *text*"),
+ (_MD_CODE, "markdown code `backtick`"),
(_MD_HEADING, "markdown heading #"),
]
# Single-line inline tags: <figcaption>...</figcaption> all on one line
_INLINE_TAG_RE = re.compile(
- r'<(figcaption|p|li|dt|dd|td|th)[^>]*>[^\n]+</\1>',
+ r"<(figcaption|p|li|dt|dd|td|th)[^>]*>[^\n]+</\1>",
re.IGNORECASE,
)
# Block tags where content immediately follows the opening tag (no blank line)
# e.g. <figure>\n*"italic caption"*\n</figure>
_ADJACENT_BLOCK_RE = re.compile(
- r'<(figure|aside|div|section|blockquote)[^>]*>\n([^\n].*?)\n</\1>',
+ r"<(figure|aside|div|section|blockquote)[^>]*>\n([^\n].*?)\n</\1>",
re.DOTALL | re.IGNORECASE,
)
+
def _check_markdown_in_html(repo_root: Path, md_file: Path, content: str) -> bool:
result = True
rel = _safe_relative_to(repo_root, md_file)
if not inner.strip():
continue
# Markdown images ![alt](src) are expected and render correctly — skip them
- inner_no_images = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', inner)
+ inner_no_images = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", inner)
for pattern, description in _MD_PATTERNS:
if pattern.search(inner_no_images):
- line_num = content[:match.start()].count('\n') + 1
- print(f"INVALID {rel}:{line_num}: {description} adjacent to HTML block (no blank line):")
+ line_num = content[: match.start()].count("\n") + 1
+ print(
+ f"INVALID {rel}:{line_num}: {description} adjacent to HTML block (no blank line):"
+ )
print(f" {match.group(0)[:200].strip()!r}")
result = False
break
for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
text = el.get("text", "")
if re.search(' id="[^"]+"', text):
- print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
+ print(
+ "INVALID ",
+ _safe_relative_to(repo_root, md_file),
+ "attr.id not allowed:",
+ text,
+ )
return False
content = md_file.read_text()
- if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
- print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
+ if re.search("</figcaption>(?!.*</figure>)", content, re.MULTILINE | re.DOTALL):
+ print(
+ "INVALID ",
+ _safe_relative_to(repo_root, md_file),
+ "Cannot have a <figcaption> outside of a <figure>",
+ )
return False
if not _check_markdown_in_html(repo_root, md_file, content):
@lru_cache()
def _check_outbound_link(url: str):
# Following urls work correctly on a web browser but raise a 400 code when using python requests
- whitelist = frozenset([
- 'circuitousroot.com',
- 'codepen.io',
- 'colourblindawareness.org',
- 'cortezlawfirmpllc.com',
- 'doi.org',
- 'figma.com',
- 'freepik.com',
- 'gigapress.net',
- 'help.figma.com',
- 'kupferschrift.de',
- 'languagegeek.com',
- 'layoutgridcalculator.com',
- 'medium.com',
- 'medium.engineering',
- 'nedwin.medium.com',
- 'nytimes.com',
- 'paulshawletterdesign.com',
- 'psycnet.apa.org',
- 'researchgate.net',
- 'sciencedirect.com',
- 'support.google.com',
- 'twitter.com',
- 'typetura.com',
- 'webmd.com',
- "jessicahische.is",
- "type.method.ac",
- "dev.epicgames.com", # Returns a 403 response when using requests
- ])
+ whitelist = frozenset(
+ [
+ "circuitousroot.com",
+ "codepen.io",
+ "colourblindawareness.org",
+ "cortezlawfirmpllc.com",
+ "doi.org",
+ "figma.com",
+ "freepik.com",
+ "gigapress.net",
+ "help.figma.com",
+ "kupferschrift.de",
+ "languagegeek.com",
+ "layoutgridcalculator.com",
+ "medium.com",
+ "medium.engineering",
+ "nedwin.medium.com",
+ "nytimes.com",
+ "paulshawletterdesign.com",
+ "psycnet.apa.org",
+ "researchgate.net",
+ "sciencedirect.com",
+ "support.google.com",
+ "twitter.com",
+ "typetura.com",
+ "webmd.com",
+ "jessicahische.is",
+ "type.method.ac",
+ "dev.epicgames.com", # Returns a 403 response when using requests
+ ]
+ )
# Following urls will be fixed at a later date. If the CI is failing and a suitable
# replacement url cannot be found, please add them to this set.
- to_fix = frozenset([
- # bad SSL cert
- "clagnut.com",
- "xinreality.com"
- ])
+ to_fix = frozenset(
+ [
+ # bad SSL cert
+ "clagnut.com",
+ "xinreality.com",
+ ]
+ )
if urlparse(url).netloc.replace("www.", "") in whitelist | to_fix:
return True
response = requests.head(url, allow_redirects=True, timeout=30)
if not response.ok:
- print(f"INVALID url {url}' returned response status code '{response.status_code}'")
+ print(
+ f"INVALID url {url}' returned response status code '{response.status_code}'"
+ )
return response.ok
if "(" in target:
target += ")"
if not target:
- continue # TODO: are empty links bad
+ continue # TODO: are empty links bad
if re.search("^http(s)?://", target.lower()):
if FLAGS.check_outbound_links:
result = _check_outbound_link(target) and result
else:
target_path = knowledge.link_target_to_path(target)
- result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result
+ result = (
+ _check_file_present(
+ knowledge.repo_root, md_file, target, target_path
+ )
+ and result
+ )
return result
# The set of valid contributors is useful in upcoming validations
contributors_file = knowledge.knowledge_dir / "contributors.textproto"
assert contributors_file.is_file(), contributors_file
- contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}
+ contributors = {
+ c.name
+ for c in text_format.Parse(
+ contributors_file.read_text(), knowledge_pb2.ContributorsProto()
+ ).contributors
+ }
result = True
for textproto_file in knowledge.textproto_files:
expected_files = set()
if textproto_file.stem == "contributors":
- pass # handled above
+ pass # handled above
elif textproto_file.stem == "knowledge":
- proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
- expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
+ proto = text_format.Parse(
+ textproto_file.read_text(), knowledge_pb2.KnowledgeProto()
+ )
+ expected_files |= {
+ (m, knowledge.module_name_to_path(m)) for m in proto.modules
+ }
elif textproto_file.stem == "term":
- proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
- expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
+ proto = text_format.Parse(
+ textproto_file.read_text(), knowledge_pb2.TermProto()
+ )
+ expected_files |= {
+ (n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons
+ }
elif textproto_file.stem == "lesson":
- proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
+ proto = text_format.Parse(
+ textproto_file.read_text(), knowledge_pb2.LessonProto()
+ )
for author in set(proto.authors) | set(proto.reviewers):
- result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
- expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
- expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
- expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
- expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
+ result = (
+ _check_contributor(
+ knowledge.repo_root, textproto_file, author, contributors
+ )
+ and result
+ )
+ expected_files |= {
+ (n, knowledge.topic_target_to_path(n)) for n in proto.topics
+ }
+ expected_files |= {
+ (n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons
+ }
+ expected_files |= {
+ (n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons
+ }
+ expected_files |= {
+ (n, knowledge.term_target_to_path(n)) for n in proto.related_terms
+ }
# thumbnail is mandatory
- expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
+ expected_files.add(
+ ("thumbnail", textproto_file.parent / "images" / "thumbnail.svg")
+ )
elif textproto_file.stem == "module":
- proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
- expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
+ proto = text_format.Parse(
+ textproto_file.read_text(), knowledge_pb2.ModuleProto()
+ )
+ expected_files |= {
+ (n, knowledge.lesson_target_to_path(n)) for n in proto.lessons
+ }
elif textproto_file.stem == "topic":
# The Topic parses. And that's enough.
text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
else:
- raise ValueError("No handler for " + textproto_file.relative_to(knowledge.repo_root))
+ raise ValueError(
+ "No handler for " + textproto_file.relative_to(knowledge.repo_root)
+ )
for ref, expected_file in expected_files:
- result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
+ result = (
+ _check_file_present(
+ knowledge.repo_root, textproto_file, ref, expected_file
+ )
+ and result
+ )
return result
def _is_svg(image_file: Path) -> bool:
return image_file.suffix == ".svg"
+
def _check_image_files(knowledge: KnowledgeContent) -> bool:
result = True
image_files = list(knowledge.knowledge_dir.glob("**/images/*"))
st_size = image_file.stat().st_size
if _is_svg(image_file):
if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
- print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
+ print(
+ "File exceeds max size of %s KB (%s KB):"
+ % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024),
+ image_file.relative_to(knowledge.knowledge_dir),
+ )
result = False
root = minidom.parseString(image_file.read_text()).documentElement
if root.tagName != "svg":
- print("Root element must be <svg>:", image_file.relative_to(knowledge.repo_root))
+ print(
+ "Root element must be <svg>:",
+ image_file.relative_to(knowledge.repo_root),
+ )
result = False
has_view_box = "viewBox" in root.attributes
- has_width_and_height = "width" in root.attributes and "height" in root.attributes
+ has_width_and_height = (
+ "width" in root.attributes and "height" in root.attributes
+ )
if not has_view_box and not has_width_and_height:
- print("Must specify viewBox and/or width+height on <svg>:", image_file.relative_to(knowledge.knowledge_dir))
+ print(
+ "Must specify viewBox and/or width+height on <svg>:",
+ image_file.relative_to(knowledge.knowledge_dir),
+ )
result = False
for stopEl in root.getElementsByTagName("stop"):
if "offset" not in stopEl.attributes:
- print("Must specify offset on <stop>:", image_file.relative_to(knowledge.knowledge_dir))
+ print(
+ "Must specify offset on <stop>:",
+ image_file.relative_to(knowledge.knowledge_dir),
+ )
result = False
else:
if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
- print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
+ print(
+ "File exceeds max size of %s KB (%s KB):"
+ % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024),
+ image_file.relative_to(knowledge.knowledge_dir),
+ )
result = False
return result
def main(_):
    """Run the markdown, proto and image checks and exit with a status code.

    The knowledge content is loaded from the directory three levels above this
    file. The checks run in the order markdown, proto, image, and a later check
    is skipped as soon as an earlier one fails. The process exits with 0 when
    every check passes and 1 otherwise.

    The single positional argument is ignored.
    """
    knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
    all_passed = (
        _check_md_files(knowledge)
        and _check_proto_files(knowledge)
        and _check_image_files(knowledge)
    )
    sys.exit(0 if all_passed else 1)