# fonts/.github/workflows/knowledge_graph.py

from pprint import pprint
from absl import app
from absl import flags
from gftools import knowledge_pb2
from google.protobuf import text_format
import itertools
import mistune  # markdown => ast
from xml.dom import minidom
from pathlib import Path
import re
import sys
from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union


# Size limits applied by _check_image_files, in KB (compared against st_size as KB * 1024 bytes).
# Limit for every file in an images/ directory that is not an .svg.
MAX_RASTER_IMAGE_SIZE_KB = 800
# Limit for .svg files in an images/ directory.
MAX_VECTOR_IMAGE_SIZE_KB = 1750


def _topic_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/topic/<name>`` link target to the path of its topic.textproto.

    Args:
        _: unused; present so this function can be called as ``fn(names, target)``
            like the other link resolvers.
        target: link target such as ``/topic/foo``.

    Returns:
        The path ``topics/foo/topic.textproto`` for the example above.
    """
    # TODO sanity check if this is the only valid update
    topic_dir = target.replace("/topic/", "topics/")
    return Path(topic_dir) / "topic.textproto"


def _module_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/module/<name>`` link target to the path of its module.textproto.

    Args:
        _: unused; present so this function can be called as ``fn(names, target)``
            like the other link resolvers.
        target: link target such as ``/module/choosing_type``.

    Returns:
        The path ``modules/choosing_type/module.textproto`` for the example above.
    """
    module_dir = target.replace("/module/", "modules/")
    return Path(module_dir) / "module.textproto"


def _content_md(path: str) -> Path:
    """Return the path of the ``content.md`` file directly inside directory *path*."""
    directory = Path(path)
    return directory.joinpath("content.md")


def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/<term>`` link target to that term's content.md.

    Args:
        _: unused; present so this function can be called as ``fn(names, target)``
            like the other link resolvers.
        target: link target such as ``/glossary/kerning``.

    Returns:
        The path ``glossary/terms/kerning/content.md`` for the example above.
    """
    # TODO sanity check if this is the only valid update
    term_dir = target.replace("/glossary/", "glossary/terms/")
    return _content_md(term_dir)


def _lesson_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Map a ``/lesson/...`` link target to the lesson's content.md.

    Three target shapes are handled:

    * ``/lesson/<module>/<lesson>`` resolves to
      ``modules/<module>/lessons/<lesson>/content.md``.
    * ``/lesson/<lesson>`` is looked up in *names*; the result is used only when
      it is a path starting with ``modules/``.
    * Anything else (including a failed lookup) falls back to
      ``<target>/content.md``.

    Args:
        names: lesson directory name => directory path string.
        target: link target; its first path component must be ``lesson``.

    Returns:
        Path of the content.md the target refers to.
    """
    # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
    parts = target[1:].split("/")
    assert parts[0] == "lesson"

    if len(parts) == 3:
        _, module, lesson = parts
        return _content_md(f"modules/{module}/lessons/{lesson}")

    if len(parts) == 2:
        path = names.get(parts[1], "")
        if path.startswith("modules/"):
            return _content_md(path)

    # Unknown lesson name or unexpected number of components: use the target verbatim.
    return _content_md(target)


def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name to a content.md path.

    Args:
        names: unique directory name => directory path string.
        target: the link target; used as the directory itself when it is not a
            key of *names*.

    Returns:
        ``<directory>/content.md`` for the looked-up (or fallback) directory.
    """
    directory = names.get(target, target)
    return _content_md(directory)


# Ordered (pattern, resolver) table consulted by _link_target_to_path: entries are
# tried top to bottom with re.search, and the resolver of the FIRST matching
# pattern is called as resolver(names, target).
_LINK_TO_PATH = [
    (re.compile("^/glossary/"), _glossary_target_to_path),
    (re.compile("^/topic/"), _topic_target_to_path),
    (re.compile("^/lesson/"), _lesson_target_to_path),
    (re.compile("^/module/"), _module_target_to_path),
    # Unanchored: with re.search this matches any target that contains at least one
    # non-"/" character, so it acts as the fallback and must stay last.
    (re.compile("[^/]+"), _any_unique_name_to_path)
]


# absl's global flag container; flag values are read as FLAGS.<flag_name>.
FLAGS = flags.FLAGS


# --print_valid: _maybe_print_check reads this to decide whether passing checks
# are printed in addition to failing ones.
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")


# Type alias for a node of the markdown AST walked by _ast_iter (a dict-like mapping).
# NOTE(review): a Union with a single member is the same as that member.
MdValue = Union[Mapping[str, "MdValue"]]


class KnowledgeContent(NamedTuple):
    """Index of the files found under ``<repo_root>/cc-by-sa/knowledge``.

    Create instances with :meth:`load`. The ``*_to_path`` helpers turn names and
    link targets into paths below ``knowledge_dir``.
    """

    # Directory passed to load().
    repo_root: Path
    # repo_root / "cc-by-sa" / "knowledge"
    knowledge_dir: Path
    # Every path under knowledge_dir whose suffix is .md (case-insensitive), sorted.
    md_files: Tuple[Path, ...]
    # Every path under knowledge_dir whose suffix is .textproto (case-insensitive), sorted.
    textproto_files: Tuple[Path, ...]
    # Parent-directory name => that directory's path relative to knowledge_dir, as a
    # string. Only names that are the parent of exactly one markdown file are present.
    unambiguous_names: Mapping[str, str]

    def module_name_to_path(self, name: str) -> Path:
        """Return the module.textproto path for a module display name.

        The name is lower-cased and spaces become underscores to form the directory.
        """
        module_dir = name.lower().replace(" ", "_")
        return self.knowledge_dir / "modules" / module_dir / "module.textproto"

    def lesson_target_to_path(self, target: str) -> Path:
        """Resolve a lesson reference written without its ``/lesson/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)

    def term_target_to_path(self, target: str) -> Path:
        """Resolve a glossary term reference written without its ``/glossary/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)

    def topic_target_to_path(self, target: str) -> Path:
        """Resolve a topic reference written without its ``/topic/`` prefix."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)

    def link_target_to_path(self, target: str) -> Path:
        """Resolve a link target exactly as written in a markdown file."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)

    @classmethod
    def load(cls, repo_root: Path) -> "KnowledgeContent":
        """Scan ``repo_root/cc-by-sa/knowledge`` and build the index.

        Prints ``<name> is ambiguous`` for every parent-directory name shared by
        more than one markdown file; such names are left out of
        ``unambiguous_names``.

        Raises:
            AssertionError: if the knowledge directory does not exist.
        """
        knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
        assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"

        md_files = []
        textproto_files = []
        # rglob yields entries in filesystem order; sorting keeps the file tuples, and
        # therefore the order in which problems are reported, stable between runs.
        for path in sorted(knowledge_dir.rglob("*")):
            suffix = path.suffix.lower()
            if suffix == ".md":
                md_files.append(path)
            elif suffix == ".textproto":
                textproto_files.append(path)

        def parent_name(md_file: Path) -> str:
            return md_file.parent.name

        unambiguous_names = {}
        # groupby only merges adjacent items, hence the sort on the same key.
        for name, entries in itertools.groupby(sorted(md_files, key=parent_name), key=parent_name):
            entries = list(entries)
            if len(entries) != 1:
                print(name, "is ambiguous")
                continue
            unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)

        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)


def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Read *md_file* and return the AST produced by mistune's ``ast`` renderer."""
    to_ast = mistune.create_markdown(renderer='ast')
    source = md_file.read_text()
    return to_ast(source)


def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
    """Yield, in breadth-first order, every AST node accepted by *filter_fn*.

    Args:
        root: top-level nodes to start from; the list itself is not modified.
        filter_fn: predicate called once per visited node.

    Yields:
        Each visited node for which *filter_fn* returns a truthy value. Any
        list-valued entry of a node is treated as a list of child nodes.

    Raises:
        AssertionError: if a visited node is not a dict.
    """
    # Local import so the file's import block is untouched. A deque pops from the
    # front in O(1); list.pop(0) shifts the whole list on every step.
    from collections import deque

    frontier = deque(root)
    while frontier:
        current = frontier.popleft()
        assert isinstance(current, dict), f"What is {current}"
        if filter_fn(current):
            yield current

        for entry in current.values():
            if isinstance(entry, list):
                frontier.extend(entry)


def _link_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Convert a link target into a path using the first matching _LINK_TO_PATH entry.

    Args:
        names: unique directory name => directory path string, handed to the resolver.
        target: link target, e.g. ``/glossary/kerning`` or a bare name.

    Returns:
        Whatever the selected resolver returns for ``(names, target)``.

    Raises:
        ValueError: if no pattern in _LINK_TO_PATH matches *target*.
    """
    resolver = next(
        (link_to_path_fn for matcher, link_to_path_fn in _LINK_TO_PATH if matcher.search(target)),
        None,
    )
    if resolver is None:
        raise ValueError(f"Unrecognized target {target}")
    return resolver(names, target)


def _safe_relative_to(parent: Path, child: Path) -> Path:
    """Return *child* relative to *parent*, or *child* unchanged when it is not below *parent*."""
    try:
        relative = child.relative_to(parent)
    except ValueError:
        # relative_to refuses paths outside of parent; fall back to the input.
        relative = child
    return relative


def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
    """Report the outcome of one check and hand *result* back unchanged.

    A line is printed when the check failed, or for every check when the
    ``print_valid`` flag is set. The line names the referring file, the quoted
    reference and, when *target* is given, the file it resolved to; paths are
    shown relative to *repo_root* where possible.
    """
    should_print = FLAGS.print_valid or not result
    if not should_print:
        return result

    message = "valid   " if result else "INVALID "
    suffix = "" if target is None else " => " + str(_safe_relative_to(repo_root, target))
    print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
    return result


def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
    """Check that *target* is an existing file, reporting via _maybe_print_check."""
    exists = target.is_file()
    return _maybe_print_check(exists, repo_root, referrer, ref, target)


def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
    """Check that *ref* is one of *contributors*, reporting via _maybe_print_check."""
    is_known = ref in contributors
    return _maybe_print_check(is_known, repo_root, referrer, ref, None)


def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
    """Check one markdown file for disallowed HTML.

    Two rules are enforced, and the first violation found is printed:

    * no inline HTML node may carry an ``id="..."`` attribute;
    * no ``</figcaption>`` may appear without a ``</figure>`` somewhere after it.

    Args:
        repo_root: used only to shorten the path shown in messages.
        md_file: the markdown file; it is read again here for the raw-text rule.
        ast: the parsed AST of *md_file*.

    Returns:
        True when neither rule is violated, False otherwise.
    """
    for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
        text = el.get("text", "")
        if re.search(' id="[^"]+"', text):
            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
            return False

    # read_text opens and closes the file itself, so no handle is left open on
    # the early return below.
    content = md_file.read_text()
    # DOTALL lets ".*" span newlines: the lookahead fails only if no </figure>
    # follows the </figcaption> anywhere in the rest of the file.
    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
        return False
    return True


def _check_md_files(knowledge: KnowledgeContent) -> bool:
    """Validate the contents and internal links of every markdown file.

    Every file and every link is checked even after a failure, so all problems
    are reported. Returns True only when nothing failed.
    """
    def is_link(node):
        return node.get("type", None) == "link"

    all_valid = True
    for md_file in knowledge.md_files:
        ast = _markdown_ast(md_file)
        if not _check_md_file_contents(knowledge.repo_root, md_file, ast):
            all_valid = False

        for link in _ast_iter(ast, is_link):
            target = link.get("link", "")
            # Empty links are skipped (TODO: are empty links bad), and so are
            # http(s) links: we aren't in the business of validating outbound links.
            if not target or re.search("^http(s)?://", target.lower()):
                continue

            target_path = knowledge.link_target_to_path(target)
            if not _check_file_present(knowledge.repo_root, md_file, target, target_path):
                all_valid = False

    return all_valid


def _check_proto_files(knowledge: KnowledgeContent) -> bool:
    """Parse every textproto file and check the things it refers to.

    The file stem selects the message type and the checks:

    * ``contributors``: parsed once up front to collect valid contributor names.
    * ``knowledge``: each listed module must have a module.textproto.
    * ``term``: each related lesson must exist.
    * ``lesson``: authors and reviewers must be known contributors; topics,
      previous/next lessons and related terms must exist; ``images/thumbnail.svg``
      must exist next to the lesson file.
    * ``module``: each listed lesson must exist.
    * ``topic``: only has to parse.

    Returns:
        True when every check passed. Failures are printed as they are found
        and do not stop the remaining checks.

    Raises:
        AssertionError: if contributors.textproto is missing.
        ValueError: if a textproto file has a stem not listed above.
        Errors raised by text_format.Parse for malformed files propagate.
    """
    # TODO support alt_ids, many Knowledge constructs have them

    # The set of valid contributors is useful in upcoming validations
    contributors_file = knowledge.knowledge_dir / "contributors.textproto"
    assert contributors_file.is_file(), contributors_file
    contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}

    result = True
    for textproto_file in knowledge.textproto_files:
        # Pairs of (reference as written in the proto, file that must exist).
        expected_files = set()

        if textproto_file.stem == "contributors":
            pass  # handled above

        elif textproto_file.stem == "knowledge":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}

        elif textproto_file.stem == "term":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}

        elif textproto_file.stem == "lesson":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
            for author in set(proto.authors) | set(proto.reviewers):
                # The check is the left operand so it always runs and reports.
                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}

            # thumbnail is mandatory
            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))

        elif textproto_file.stem == "module":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}

        elif textproto_file.stem == "topic":
            # The Topic parses. And that's enough.
            text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())

        else:
            # Formatted rather than concatenated: "str" + Path raises TypeError
            # and would hide the intended ValueError.
            raise ValueError(f"No handler for {textproto_file.relative_to(knowledge.repo_root)}")

        for ref, expected_file in expected_files:
            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result

    return result


def _is_svg(image_file: Path) -> bool:
    """Return True when *image_file* has exactly the suffix ``.svg`` (case-sensitive)."""
    return image_file.suffix == ".svg"


def _check_image_files(knowledge: KnowledgeContent) -> bool:
    """Validate every entry matching ``**/images/*`` under the knowledge directory.

    All files must stay within the size limit for their kind
    (MAX_VECTOR_IMAGE_SIZE_KB for ``.svg``, MAX_RASTER_IMAGE_SIZE_KB otherwise).
    SVG files must additionally:

    * be well-formed XML whose root element is ``<svg>``;
    * specify ``viewBox`` and/or both ``width`` and ``height`` on the root;
    * specify ``offset`` on every ``<stop>`` element.

    Returns:
        True when every file passed. Each problem is printed with the file's
        path relative to the knowledge directory, and checking continues with
        the remaining files.
    """
    # Local import so the file's import block is untouched; ExpatError is what
    # minidom raises for XML that is not well-formed.
    from xml.parsers.expat import ExpatError

    result = True
    for image_file in knowledge.knowledge_dir.glob("**/images/*"):
        shown_path = image_file.relative_to(knowledge.knowledge_dir)
        st_size = image_file.stat().st_size

        if not _is_svg(image_file):
            if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
                print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), shown_path)
                result = False
            continue

        if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
            print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), shown_path)
            result = False

        try:
            # Bytes rather than text: the XML parser then honours the encoding
            # declared by the file instead of the locale's default encoding.
            root = minidom.parseString(image_file.read_bytes()).documentElement
        except ExpatError as parse_error:
            # Report and move on so one broken file does not hide other problems.
            print("Not well-formed XML (%s):" % parse_error, shown_path)
            result = False
            continue

        if root.tagName != "svg":
            print("Root element must be <svg>:", shown_path)
            result = False
        has_view_box = "viewBox" in root.attributes
        has_width_and_height = "width" in root.attributes and "height" in root.attributes
        if not has_view_box and not has_width_and_height:
            print("Must specify viewBox and/or width+height on <svg>:", shown_path)
            result = False
        for stop_el in root.getElementsByTagName("stop"):
            if "offset" not in stop_el.attributes:
                print("Must specify offset on <stop>:", shown_path)
                result = False
    return result


def main(_):
    """Run all knowledge checks and exit with 0 on success, 1 on any failure.

    The repository root is taken to be three directories above this file.
    The positional argument (supplied by ``app.run``) is ignored.
    """
    knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)

    # A list, not a chained "and": every check runs even after an earlier one
    # failed, so a single run reports all problems.
    results = [
        _check_md_files(knowledge),
        _check_proto_files(knowledge),
        _check_image_files(knowledge),
    ]

    sys.exit(0 if all(results) else 1)


# Script entry point: hand main to absl's app.run, which calls it.
if __name__ == "__main__":
    app.run(main)