1
0
mirror of synced 2025-12-30 03:03:46 -05:00
Files
fonts/.github/workflows/knowledge_graph.py

311 lines
12 KiB
Python

import collections
import itertools
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
from xml.dom import minidom
from xml.parsers import expat

from absl import app
from absl import flags
from gftools import knowledge_pb2
from google.protobuf import text_format
import mistune # markdown => ast
# Size limit for non-SVG image files, in units of 1024 bytes.
MAX_RASTER_IMAGE_SIZE_KB = 800
# Size limit for SVG image files, in units of 1024 bytes.
MAX_VECTOR_IMAGE_SIZE_KB = 1750
def _topic_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/topic/...`` link target to the path of its ``topic.textproto``.

    Args:
        _: Unused; accepted so that all link handlers share one signature.
        target: Link target that begins with ``/topic/``.

    Returns:
        The textproto path, relative to the knowledge directory.
    """
    # TODO sanity check if this is the only valid update
    # Only the leading prefix is rewritten; a later "/topic/" segment inside
    # the target is kept verbatim.
    return Path(target.replace("/topic/", "topics/", 1)) / "topic.textproto"
def _module_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/module/...`` link target to the path of its ``module.textproto``.

    Args:
        _: Unused; accepted so that all link handlers share one signature.
        target: Link target that begins with ``/module/``.

    Returns:
        The textproto path, relative to the knowledge directory.
    """
    # Only the leading prefix is rewritten; a later "/module/" segment inside
    # the target is kept verbatim.
    return Path(target.replace("/module/", "modules/", 1)) / "module.textproto"
def _content_md(path: str) -> Path:
    """Return the location of ``content.md`` inside the directory *path*."""
    return Path(path, "content.md")
def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/...`` link target to the term's ``content.md`` path.

    Args:
        _: Unused; accepted so that all link handlers share one signature.
        target: Link target that begins with ``/glossary/``.

    Returns:
        Path of ``content.md`` under ``glossary/terms/``, relative to the
        knowledge directory.
    """
    # TODO sanity check if this is the only valid update
    # Only the leading prefix is rewritten; a later "/glossary/" segment
    # inside the target is kept verbatim.
    return _content_md(target.replace("/glossary/", "glossary/terms/", 1))
def _lesson_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Map a ``/lesson/...`` link target to the lesson's ``content.md`` path.

    Two target shapes are resolved:

    * ``/lesson/<name>``: *name* is looked up in *names*; the mapped directory
      is used only when it starts with ``modules/``.
    * ``/lesson/<module>/<lesson>`` becomes
      ``modules/<module>/lessons/<lesson>``, e.g.
      ``/lesson/choosing_type/choosing_reliable_typefaces`` =>
      ``modules/choosing_type/lessons/choosing_reliable_typefaces/``.

    Any other shape, or a name that does not resolve to a module directory,
    is passed to ``_content_md`` unchanged (presumably so that the later
    file-existence check flags it; confirm with the callers).

    Args:
        names: Unique directory name -> directory path relative to the
            knowledge directory.
        target: Link target that begins with ``/lesson/``.

    Returns:
        Path of the lesson's ``content.md``.
    """
    # Drop the leading "/" so that parts[0] is the link kind.
    parts = target[1:].split("/")
    assert parts[0] == "lesson"

    if len(parts) == 2:
        path = names.get(parts[1], "")
        if not path.startswith("modules/"):
            return _content_md(target)
        return _content_md(path)
    if len(parts) == 3:
        return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")
    return _content_md(target)
def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name to the ``content.md`` of the directory it names.

    Args:
        names: Unique directory name -> directory path relative to the
            knowledge directory.
        target: Link target; used as the directory itself when it is not a
            key of *names*.

    Returns:
        Path of ``content.md`` inside the resolved directory.
    """
    directory = names.get(target, target)
    return _content_md(directory)
# Ordered (pattern, handler) pairs used to turn a link target into a path.
# _link_target_to_path tries the patterns top to bottom with `search` and uses
# the first one that matches, so the order of the entries matters.
_LINK_TO_PATH = [
    (re.compile("^/glossary/"), _glossary_target_to_path),
    (re.compile("^/topic/"), _topic_target_to_path),
    (re.compile("^/lesson/"), _lesson_target_to_path),
    (re.compile("^/module/"), _module_target_to_path),
    # NOTE(review): this pattern is not anchored, so with `search` it matches
    # any target containing at least one character other than "/". It acts as
    # the catch-all for everything the prefixes above did not claim.
    (re.compile("[^/]+"), _any_unique_name_to_path)
]
# absl flag registry; FLAGS.print_valid is read by _maybe_print_check.
FLAGS = flags.FLAGS
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")
# Type alias for one markdown AST node: a mapping with string keys.
# _ast_iter asserts that each node is a dict and descends into list values.
MdValue = Union[Mapping[str, "MdValue"]]
class KnowledgeContent(NamedTuple):
    """Index of the markdown and textproto files of the Knowledge content.

    Build instances with :meth:`load`; the ``*_to_path`` methods turn names
    and link targets into paths below ``knowledge_dir``.
    """

    # Root of the repository checkout.
    repo_root: Path
    # repo_root / "cc-by-sa" / "knowledge".
    knowledge_dir: Path
    # Every file below knowledge_dir whose suffix is ".md" (any case), sorted.
    md_files: Tuple[Path, ...]
    # Every file below knowledge_dir whose suffix is ".textproto" (any case),
    # sorted.
    textproto_files: Tuple[Path, ...]
    # Directory name -> that directory's "/"-separated path relative to
    # knowledge_dir, for directory names that hold exactly one markdown file
    # across the whole tree.
    unambiguous_names: Mapping[str, str]

    def module_name_to_path(self, name: str) -> Path:
        """Return the ``module.textproto`` path for a module display name.

        The name is lower-cased and its spaces become underscores to form the
        module's directory name.
        """
        return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"

    def lesson_target_to_path(self, target: str) -> Path:
        """Resolve a lesson reference given without its ``/lesson/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)

    def term_target_to_path(self, target: str) -> Path:
        """Resolve a term reference given without its ``/glossary/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)

    def topic_target_to_path(self, target: str) -> Path:
        """Resolve a topic reference given without its ``/topic/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)

    def link_target_to_path(self, target: str) -> Path:
        """Resolve a link target exactly as written to a path."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)

    @classmethod
    def load(cls, repo_root: Path) -> "KnowledgeContent":
        """Scan ``<repo_root>/cc-by-sa/knowledge`` and index its files.

        Prints ``<name> is ambiguous`` for every directory name that holds a
        markdown file in more than one place; such names are left out of
        ``unambiguous_names``.

        Args:
            repo_root: Root of the repository checkout.

        Returns:
            The populated ``KnowledgeContent``.

        Raises:
            AssertionError: If the knowledge directory does not exist.
        """
        knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
        assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"

        md_files = []
        textproto_files = []
        # rglob order depends on the filesystem; sorting keeps every report
        # that iterates these tuples in the same order from run to run.
        for file in sorted(knowledge_dir.rglob("*")):
            suffix = file.suffix.lower()
            if suffix == ".md":
                md_files.append(file)
            elif suffix == ".textproto":
                textproto_files.append(file)

        # Group markdown files by the name of the directory that holds them.
        by_dir_name = {}
        for md_file in md_files:
            by_dir_name.setdefault(md_file.parent.name, []).append(md_file)

        unambiguous_names = {}
        for name in sorted(by_dir_name):
            entries = by_dir_name[name]
            if len(entries) != 1:
                print(name, "is ambiguous")
                continue
            # as_posix() keeps "/" separators on every platform; consumers
            # test these values with startswith("modules/").
            unambiguous_names[name] = entries[0].relative_to(knowledge_dir).parent.as_posix()

        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Parse *md_file* with mistune's ``ast`` renderer and return the result."""
    parse = mistune.create_markdown(renderer='ast')
    source = md_file.read_text()
    return parse(source)
def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
    """Yield, breadth-first, every AST node for which *filter_fn* is true.

    Every list found among a node's values is treated as a list of child
    nodes and queued for visiting.

    Args:
        root: Top-level AST nodes.
        filter_fn: Predicate that selects the nodes to yield.

    Yields:
        The matching nodes, in breadth-first order.

    Raises:
        AssertionError: If a visited node is not a dict.
    """
    # A deque pops from the front in O(1); list.pop(0) shifts the whole list
    # on every pop. The traversal order is the same.
    frontier = collections.deque(root)
    while frontier:
        current = frontier.popleft()
        assert isinstance(current, dict), f"What is {current}"
        if filter_fn(current):
            yield current
        for entry in current.values():
            if isinstance(entry, list):
                frontier.extend(entry)
def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
    """Convert *target* using the first ``_LINK_TO_PATH`` entry that matches.

    Raises:
        ValueError: If no pattern in ``_LINK_TO_PATH`` matches *target*.
    """
    handler = next(
        (to_path for pattern, to_path in _LINK_TO_PATH if pattern.search(target)),
        None,
    )
    if handler is None:
        raise ValueError(f"Unrecognized target {target}")
    return handler(names, target)
def _safe_relative_to(parent: Path, child: Path) -> Path:
    """Return *child* relative to *parent*, or *child* itself if it is not below *parent*."""
    try:
        relative = child.relative_to(parent)
    except ValueError:
        # relative_to raises when child does not sit below parent.
        relative = child
    return relative
def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
    """Print the outcome of one reference check and pass *result* through.

    A failed check is always printed; a passing one only when the
    ``print_valid`` flag is set.

    Args:
        result: Whether the check passed.
        repo_root: Base used to shorten the printed paths.
        referrer: File that contains the reference.
        ref: The reference as written in *referrer*.
        target: File the reference resolved to, or None if there is none.

    Returns:
        *result*, unchanged.
    """
    if not FLAGS.print_valid and result:
        return result

    message = "INVALID " if not result else "valid "
    suffix = "" if target is None else " => " + str(_safe_relative_to(repo_root, target))
    print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
    return result
def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
    """Check, and report, that *target* (referenced as *ref* from *referrer*) is an existing file."""
    exists = target.is_file()
    return _maybe_print_check(exists, repo_root, referrer, ref, target)
def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
    """Check, and report, that the name *ref* used in *referrer* is in *contributors*."""
    is_known = ref in contributors
    return _maybe_print_check(is_known, repo_root, referrer, ref, None)
def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
    """Check one markdown file for disallowed HTML.

    Two rules are enforced, and the first violation found is printed:

    * no ``inline_html`` node may carry an ``id="..."`` attribute;
    * no ``</figcaption>`` may appear without a ``</figure>`` somewhere
      after it in the file.

    Args:
        repo_root: Base used to shorten the printed path.
        md_file: The markdown file being checked.
        ast: The AST previously parsed from *md_file*.

    Returns:
        True if the file passes both rules, False otherwise.
    """
    for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
        text = el.get("text", "")
        if re.search(' id="[^"]+"', text):
            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
            return False

    # The with-block closes the file on every path, including the early
    # return below.
    with open(md_file, "r") as f:
        content = f.read()

    # DOTALL lets ".*" span newlines, so the lookahead scans the rest of the
    # file for a closing </figure>.
    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
        return False
    return True
def _check_md_files(knowledge: KnowledgeContent) -> bool:
    """Validate every markdown file: its HTML content and its internal links.

    All files and links are examined even after a failure, so that every
    problem gets reported.

    Returns:
        True only if every check passed.
    """
    all_ok = True
    for md_file in knowledge.md_files:
        ast = _markdown_ast(md_file)
        if not _check_md_file_contents(knowledge.repo_root, md_file, ast):
            all_ok = False

        links = _ast_iter(ast, lambda v: v.get("type", None) == "link")
        for link in links:
            target = link.get("link", "")
            if not target:
                continue  # TODO: are empty links bad
            is_outbound = re.search("^http(s)?://", target.lower())
            if is_outbound:
                continue  # we aren't in the business of validating outbound links
            target_path = knowledge.link_target_to_path(target)
            if not _check_file_present(knowledge.repo_root, md_file, target, target_path):
                all_ok = False
    return all_ok
def _check_proto_files(knowledge: KnowledgeContent) -> bool:
    """Parse every textproto file and verify the files and people it references.

    The file stem selects the proto message type:

    * ``contributors``: parsed once up front to get the valid contributor names.
    * ``knowledge``: each listed module must have a ``module.textproto``.
    * ``term``: each related lesson must exist.
    * ``lesson``: authors and reviewers must be contributors; topics,
      previous/next lessons and related terms must exist, as must
      ``images/thumbnail.svg`` next to the textproto.
    * ``module``: each listed lesson must exist.
    * ``topic``: only has to parse.

    Returns:
        True only if every reference checked out.

    Raises:
        AssertionError: If ``contributors.textproto`` is missing.
        ValueError: If a textproto file has a stem with no handler.
    """
    # TODO support alt_ids, many Knowledge constructs have them

    # The set of valid contributors is useful in upcoming validations
    contributors_file = knowledge.knowledge_dir / "contributors.textproto"
    assert contributors_file.is_file(), contributors_file
    contributors_proto = text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto())
    contributors = {c.name for c in contributors_proto.contributors}

    result = True
    for textproto_file in knowledge.textproto_files:
        expected_files = set()
        stem = textproto_file.stem
        if stem == "contributors":
            pass  # handled above
        elif stem == "knowledge":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
        elif stem == "term":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
        elif stem == "lesson":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
            # Sorted so that the report order does not depend on set ordering.
            for author in sorted(set(proto.authors) | set(proto.reviewers)):
                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
            # thumbnail is mandatory
            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
        elif stem == "module":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
        elif stem == "topic":
            # The Topic parses. And that's enough.
            text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
        else:
            # Formatted with an f-string: concatenating str + Path raised
            # TypeError here instead of the intended ValueError.
            raise ValueError(f"No handler for {textproto_file.relative_to(knowledge.repo_root)}")

        # Sorted so that the report order does not depend on set ordering.
        for ref, expected_file in sorted(expected_files):
            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
    return result
def _is_svg(image_file: Path) -> bool:
    """Return True if the suffix of *image_file* is exactly ``.svg`` (case-sensitive)."""
    return image_file.suffix == ".svg"
def _check_svg_markup(image_file: Path, display_path: Path) -> bool:
    """Check the XML structure of one SVG file, printing every problem found."""
    try:
        root = minidom.parseString(image_file.read_text()).documentElement
    except expat.ExpatError as err:
        # Report the broken file and let the caller go on to the next image
        # instead of aborting the whole run with a traceback.
        print("Not well-formed XML (%s):" % err, display_path)
        return False

    ok = True
    if root.tagName != "svg":
        print("Root element must be <svg>:", display_path)
        ok = False
    has_view_box = "viewBox" in root.attributes
    has_width_and_height = "width" in root.attributes and "height" in root.attributes
    if not has_view_box and not has_width_and_height:
        print("Must specify viewBox and/or width+height on <svg>:", display_path)
        ok = False
    for stop_el in root.getElementsByTagName("stop"):
        if "offset" not in stop_el.attributes:
            print("Must specify offset on <stop>:", display_path)
            ok = False
    return ok


def _check_image_files(knowledge: KnowledgeContent) -> bool:
    """Validate every file that sits directly inside an ``images`` directory.

    SVG files must not exceed ``MAX_VECTOR_IMAGE_SIZE_KB``, must have an
    ``<svg>`` root that carries ``viewBox`` and/or both ``width`` and
    ``height``, and every ``<stop>`` element must have an ``offset``.
    All other files must not exceed ``MAX_RASTER_IMAGE_SIZE_KB``.

    Every problem is printed with the file's path relative to the knowledge
    directory.

    Returns:
        True only if no problem was found.
    """
    result = True
    for image_file in knowledge.knowledge_dir.glob("**/images/*"):
        if not image_file.is_file():
            continue  # the glob also matches sub-directories of images/
        display_path = image_file.relative_to(knowledge.knowledge_dir)
        st_size = image_file.stat().st_size
        if _is_svg(image_file):
            max_size_kb = MAX_VECTOR_IMAGE_SIZE_KB
        else:
            max_size_kb = MAX_RASTER_IMAGE_SIZE_KB
        if st_size > max_size_kb * 1024:
            print("File exceeds max size of %s KB (%s KB):" % (max_size_kb, st_size // 1024), display_path)
            result = False
        if _is_svg(image_file) and not _check_svg_markup(image_file, display_path):
            result = False
    return result
def main(_):
    """Run the markdown, textproto and image checks, then exit.

    The checks run in that order and stop at the first one that fails.
    The process exits with status 0 if all of them pass and 1 otherwise.
    """
    repo_root = Path(__file__).parent.parent.parent
    knowledge = KnowledgeContent.load(repo_root)
    checks = (_check_md_files, _check_proto_files, _check_image_files)
    # all() stops at the first failing check, like a chained `and`.
    passed = all(check(knowledge) for check in checks)
    sys.exit(0 if passed else 1)
# Run main() through absl when this file is executed as a script.
if __name__ == "__main__":
    app.run(main)