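"""Validate the Knowledge content stored under cc-by-sa/knowledge.

Checks links and inline HTML in Markdown files, cross-references in
textproto files, and the size/structure of image files. Exits with status 0
when every check passes and 1 otherwise.
"""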
import itertools
import re
import sys
from collections import deque
from pathlib import Path
from pprint import pprint
from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
from xml.dom import minidom
from xml.parsers.expat import ExpatError

import mistune  # markdown => ast
from absl import app
from absl import flags
from gftools import knowledge_pb2
from google.protobuf import text_format
|
|
|
|
|
|
# Upper bound, in KiB, on the size of any non-SVG file found in an images/ directory.
MAX_RASTER_IMAGE_SIZE_KB = 800
|
|
# Upper bound, in KiB, on the size of any SVG file found in an images/ directory.
MAX_VECTOR_IMAGE_SIZE_KB = 1750
|
|
|
|
|
|
def _topic_target_to_path(_: Set[str], target: str) -> str:
|
|
# TODO sanity check if this is the only valid update
|
|
return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
|
|
|
|
|
|
def _module_target_to_path(_: Set[str], target: str) -> str:
|
|
return Path(target.replace("/module/", "modules/")) / "module.textproto"
|
|
|
|
|
|
def _content_md(path: str) -> Path:
|
|
return Path(path) / "content.md"
|
|
|
|
|
|
def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/...`` link target to the path of the term's content.md.

    Args:
        _: Unused name mapping; accepted so that every resolver listed in
            _LINK_TO_PATH can be called with the same arguments.
        target: Link target; the dispatcher only routes targets that start
            with ``/glossary/`` here.

    Returns:
        ``glossary/terms/<rest of target>/content.md`` as a Path.
    """
    # TODO sanity check if this is the only valid update
    # Rewrite only the first "/glossary/" so a later path segment that happens
    # to be named "glossary" is left untouched.
    return _content_md(target.replace("/glossary/", "glossary/terms/", 1))
|
|
|
|
|
|
def _lesson_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Map a ``/lesson/...`` link target to the path of the lesson's content.md.

    Example:
        /lesson/choosing_type/choosing_reliable_typefaces
        => modules/choosing_type/lessons/choosing_reliable_typefaces/content.md

    Args:
        names: Unique directory name => directory path (as a string).
        target: Link target beginning with ``/lesson/``.

    Returns:
        The content.md path for the lesson. When the target cannot be mapped
        onto a module's lesson directory, the unmodified target is used as
        the directory instead.

    Raises:
        AssertionError: If the first path component of target is not "lesson".
    """
    parts = target[1:].split("/")
    assert parts[0] == "lesson"

    if len(parts) == 3:
        # Fully qualified: /lesson/<module>/<lesson>
        return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")

    if len(parts) == 2:
        # Short form: /lesson/<lesson>; only usable if the lesson name is
        # known and lives under modules/.
        path = names.get(parts[1], "")
        if path.startswith("modules/"):
            return _content_md(path)

    # Anything else is passed through as-is.
    return _content_md(target)
|
|
|
|
|
|
def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name to the path of its content.md.

    Args:
        names: Unique directory name => directory path (as a string).
        target: Link target to look up in names.

    Returns:
        content.md inside the directory registered for target, or inside
        target itself when it is not a registered name.
    """
    return _content_md(names.get(target, target))
|
|
|
|
|
|
# Ordered dispatch table: the first pattern that matches a link target decides
# which resolver converts it to a path. The last entry matches any target
# containing at least one non-"/" character, so it must stay at the end.
_LINK_TO_PATH = [
    (re.compile(pattern), resolver)
    for pattern, resolver in (
        ("^/glossary/", _glossary_target_to_path),
        ("^/topic/", _topic_target_to_path),
        ("^/lesson/", _lesson_target_to_path),
        ("^/module/", _module_target_to_path),
        ("[^/]+", _any_unique_name_to_path),
    )
]
|
|
|
|
|
|
# Shorthand for absl's flag registry; read below as FLAGS.print_valid.
FLAGS = flags.FLAGS
|
|
|
|
|
|
# When set, checks that pass are printed too; failing checks are always printed.
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")
|
|
|
|
|
|
# Type of one node of the markdown AST handled in this file (a string-keyed mapping).
# NOTE(review): a single-member Union is just that member, so this is Mapping[str, "MdValue"].
MdValue = Union[Mapping[str, "MdValue"]]
|
|
|
|
|
|
class KnowledgeContent(NamedTuple):
    """Index of the Knowledge content found in a repository checkout.

    Build instances with KnowledgeContent.load().
    """

    # Root of the repository checkout.
    repo_root: Path
    # <repo_root>/cc-by-sa/knowledge
    knowledge_dir: Path
    # Every *.md file below knowledge_dir, in sorted order.
    md_files: Tuple[Path, ...]
    # Every *.textproto file below knowledge_dir, in sorted order.
    textproto_files: Tuple[Path, ...]
    # Directory name => directory path relative to knowledge_dir (as a string),
    # only for directory names that hold an .md file in exactly one place.
    unambiguous_names: Mapping[str, str]

    def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
        """Return the module.textproto path for a module's display name.

        The name is lower-cased and spaces become underscores to form the
        directory name.
        """
        return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"

    def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a lesson reference (without the ``/lesson/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)

    def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a glossary term reference (without the ``/glossary/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)

    def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a topic reference (without the ``/topic/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)

    def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a link target exactly as written in a Markdown file."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)

    @classmethod
    def load(cls, repo_root: Path) -> "KnowledgeContent":
        """Scan ``repo_root/cc-by-sa/knowledge`` and index its content files.

        Directory names that contain an .md file in more than one location
        are reported on stdout and left out of unambiguous_names.

        Args:
            repo_root: Root of the repository checkout.

        Returns:
            The populated KnowledgeContent.

        Raises:
            AssertionError: If the knowledge directory does not exist.
        """
        knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
        assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"

        md_files = []
        textproto_files = []
        # rglob yields entries in filesystem order; sort so that the index,
        # and therefore every report built from it, is reproducible.
        for file in sorted(knowledge_dir.rglob("*")):
            # A directory whose name ends in .md/.textproto is not content.
            if not file.is_file():
                continue
            suffix = file.suffix.lower()
            if suffix == ".md":
                md_files.append(file)
            elif suffix == ".textproto":
                textproto_files.append(file)

        # Group the markdown files by the name of the directory holding them.
        by_dir_name = {}
        for md_file in md_files:
            by_dir_name.setdefault(md_file.parent.name, []).append(md_file)

        unambiguous_names = {}
        for name in sorted(by_dir_name):
            entries = by_dir_name[name]
            if len(entries) != 1:
                print(name, "is ambiguous")
                continue
            unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)

        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
|
|
|
|
|
|
def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Parse a Markdown file with mistune's 'ast' renderer and return the result.

    The file is decoded as UTF-8 explicitly so that the outcome does not
    depend on the locale's default encoding.
    """
    return mistune.create_markdown(renderer='ast')(md_file.read_text(encoding="utf-8"))
|
|
|
|
|
|
def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
|
|
frontier = list(root)
|
|
while frontier:
|
|
current = frontier.pop(0)
|
|
assert isinstance(current, dict), f"What is {current}"
|
|
if filter_fn(current):
|
|
yield current
|
|
|
|
for entry in current.values():
|
|
if isinstance(entry, list):
|
|
frontier.extend(entry)
|
|
|
|
|
|
def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
    """Convert a link target to a path using the first matching _LINK_TO_PATH entry.

    Raises:
        ValueError: If no pattern in _LINK_TO_PATH matches target.
    """
    resolver = next(
        (fn for pattern, fn in _LINK_TO_PATH if pattern.search(target)),
        None,
    )
    if resolver is None:
        raise ValueError(f"Unrecognized target {target}")
    return resolver(names, target)
|
|
|
|
|
|
def _safe_relative_to(parent: Path, child: Path) -> Path:
|
|
try:
|
|
return child.relative_to(parent)
|
|
except ValueError:
|
|
return child
|
|
|
|
|
|
def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
    """Print the outcome of one check when appropriate and hand result back.

    Failures are always printed; successes only when --print_valid is set.
    Paths are shown relative to repo_root where possible.
    """
    if not (FLAGS.print_valid or not result):
        return result

    message = "valid " if result else "INVALID "
    suffix = "" if target is None else " => " + str(_safe_relative_to(repo_root, target))
    print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
    return result
|
|
|
|
|
|
def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
    """Check that target is an existing file, reporting via _maybe_print_check."""
    exists = target.is_file()
    return _maybe_print_check(exists, repo_root, referrer, ref, target)
|
|
|
|
|
|
def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
    """Check that ref is one of contributors, reporting via _maybe_print_check."""
    is_known = ref in contributors
    return _maybe_print_check(is_known, repo_root, referrer, ref, None)
|
|
|
|
|
|
def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
    """Check one Markdown file for disallowed HTML.

    Two rules are enforced, and the first violation found is printed:
      * inline HTML must not carry an ``id="..."`` attribute;
      * a ``</figcaption>`` must have a ``</figure>`` somewhere after it.

    Args:
        repo_root: Repository root, used to shorten printed paths.
        md_file: The Markdown file being checked.
        ast: Parsed AST of md_file.

    Returns:
        True if the file passes both rules, False otherwise.
    """
    for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
        text = el.get("text", "")
        if re.search(' id="[^"]+"', text):
            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
            return False

    # read_text opens and closes the file itself, so no handle is left open
    # when returning early below.
    content = md_file.read_text(encoding="utf-8")
    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
        return False

    return True
|
|
|
|
|
|
def _check_md_files(knowledge: KnowledgeContent) -> bool:
    """Validate the contents and internal links of every Markdown file.

    Empty links and http(s) links are skipped. Every other link target must
    resolve to an existing file; a target that cannot be resolved at all is
    reported as INVALID rather than aborting the run.

    Returns:
        True if every file passed every check, False otherwise.
    """
    result = True
    for md_file in knowledge.md_files:
        ast = _markdown_ast(md_file)
        result = _check_md_file_contents(knowledge.repo_root, md_file, ast) and result
        for link in _ast_iter(ast, lambda v: v.get("type", None) == "link"):
            target = link.get("link", "")
            if not target:
                continue  # TODO: are empty links bad
            if re.search("^http(s)?://", target.lower()):
                continue  # we aren't in the business of validating outbound links

            try:
                target_path = knowledge.link_target_to_path(target)
            except ValueError:
                # No resolver recognises this target (e.g. "/"); report it
                # like any other broken link and keep checking.
                _maybe_print_check(False, knowledge.repo_root, md_file, target, None)
                result = False
                continue
            result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result

    return result
|
|
|
|
|
|
def _check_proto_files(knowledge: KnowledgeContent) -> bool:
    """Parse every textproto file and verify the things it refers to exist.

    Each file is parsed with the message type selected by its file stem.
    Referenced modules, lessons, topics and terms must resolve to existing
    files, lesson authors and reviewers must be listed in
    contributors.textproto, and every lesson must have images/thumbnail.svg.

    Returns:
        True if every reference checked out, False otherwise.

    Raises:
        AssertionError: If contributors.textproto is missing.
        ValueError: If a textproto file has a stem with no handler.
    """
    # TODO support alt_ids, many Knowledge constructs have them

    # The set of valid contributors is useful in upcoming validations
    contributors_file = knowledge.knowledge_dir / "contributors.textproto"
    assert contributors_file.is_file(), contributors_file
    contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}

    result = True
    for textproto_file in knowledge.textproto_files:
        # (reference as written, file it must resolve to)
        expected_files = set()
        stem = textproto_file.stem

        if stem == "contributors":
            pass  # handled above

        elif stem == "knowledge":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}

        elif stem == "term":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}

        elif stem == "lesson":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
            for author in set(proto.authors) | set(proto.reviewers):
                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}

            # thumbnail is mandatory
            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))

        elif stem == "module":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}

        elif stem == "topic":
            # The Topic parses. And that's enough.
            text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())

        else:
            # Format the path into the message: str + Path raises TypeError
            # and would hide the intended ValueError.
            raise ValueError(f"No handler for {textproto_file.relative_to(knowledge.repo_root)}")

        for ref, expected_file in expected_files:
            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result

    return result
|
|
|
|
|
|
def _is_svg(image_file: Path) -> bool:
|
|
return image_file.suffix == ".svg"
|
|
|
|
|
|
def _is_svg(image_file: Path) -> bool:
|
|
return image_file.suffix == ".svg"
|
|
|
|
|
|
def _check_svg_structure(image_file: Path, shown_path: Path) -> bool:
    """Check the XML structure of one SVG file, printing each problem found.

    Args:
        image_file: The SVG file to parse.
        shown_path: Path to use in printed messages.

    Returns:
        True if no structural problem was found.
    """
    try:
        # Parse the raw bytes so the XML declaration's encoding is honoured.
        root = minidom.parseString(image_file.read_bytes()).documentElement
    except ExpatError as e:
        print("Not well-formed XML (%s):" % e, shown_path)
        return False

    ok = True
    if root.tagName != "svg":
        print("Root element must be <svg>:", shown_path)
        ok = False
    has_view_box = "viewBox" in root.attributes
    has_width_and_height = "width" in root.attributes and "height" in root.attributes
    if not has_view_box and not has_width_and_height:
        print("Must specify viewBox and/or width+height on <svg>:", shown_path)
        ok = False
    for stop_el in root.getElementsByTagName("stop"):
        if "offset" not in stop_el.attributes:
            print("Must specify offset on <stop>:", shown_path)
            ok = False
    return ok


def _check_image_files(knowledge: KnowledgeContent) -> bool:
    """Check size limits, and for SVGs basic structure, of every image file.

    Looks at every file directly inside any ``images`` directory below the
    knowledge directory. SVG files are limited to MAX_VECTOR_IMAGE_SIZE_KB and
    must be well-formed, have an <svg> root with viewBox and/or width+height,
    and give every <stop> an offset. All other files are limited to
    MAX_RASTER_IMAGE_SIZE_KB.

    Returns:
        True if every image passed, False otherwise.
    """
    result = True
    for image_file in knowledge.knowledge_dir.glob("**/images/*"):
        if not image_file.is_file():
            continue  # only files can be images
        shown_path = image_file.relative_to(knowledge.knowledge_dir)
        st_size = image_file.stat().st_size

        if _is_svg(image_file):
            max_size_kb = MAX_VECTOR_IMAGE_SIZE_KB
        else:
            max_size_kb = MAX_RASTER_IMAGE_SIZE_KB
        if st_size > max_size_kb * 1024:
            print("File exceeds max size of %s KB (%s KB):" % (max_size_kb, st_size // 1024), shown_path)
            result = False

        if _is_svg(image_file):
            result = _check_svg_structure(image_file, shown_path) and result

    return result
|
|
|
|
|
|
def main(_):
    """Run every validation against the repository and exit with the outcome.

    The repository root is taken to be three directory levels above this
    file. All checks are always run, so a failure in one does not hide the
    problems the others would report. Exits with status 0 if all checks
    pass and 1 otherwise.
    """
    knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)

    checks = (_check_md_files, _check_proto_files, _check_image_files)
    # Build the full list first: chaining with `and` would stop at the first
    # failing check.
    outcomes = [check(knowledge) for check in checks]

    return_code = 0 if all(outcomes) else 1
    sys.exit(return_code)


if __name__ == "__main__":
    app.run(main)
|