64 lines
2.3 KiB
Python
64 lines
2.3 KiB
Python
from collections import Counter
|
|
from google.protobuf import text_format
|
|
from gflanguages import languages_public_pb2
|
|
|
|
# Names of the ``exemplar_chars`` fields that main() scans for duplicates.
ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]
|
|
|
|
|
|
def main(args=None):
    """Remove duplicate entries from the exemplar lists of language files.

    Each path is read and parsed as a text-format ``LanguageProto``. For
    every attribute named in ``ATTRIBUTES``, the space-separated value of
    ``exemplar_chars.<attr>`` is de-duplicated: the first occurrence of each
    entry is kept and the original order is preserved. For every file in
    which at least one attribute changed, a summary is printed to stdout and
    the file is rewritten in place.

    Args:
        args: Iterable of file paths to process. When ``None``, the
            command-line arguments (``sys.argv[1:]``) are used.

    Raises:
        SystemExit: If a de-duplicated list does not contain exactly the same
            set of entries as the original list (defensive sanity check).
    """
    # Imported locally so that main() also works when this module is imported
    # and called directly, not only when it is run as a script.
    import sys

    if args is None:
        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        if not hasattr(language, "exemplar_chars"):
            # Nothing to clean in this file; carry on with the remaining
            # paths instead of silently terminating the whole run.
            continue

        # attr -> {"before": [...], "after": [...]} for every changed list.
        exemplar_values = {}
        for attr in ATTRIBUTES:
            if not hasattr(language.exemplar_chars, attr):
                continue

            values = getattr(language.exemplar_chars, attr).split(" ")
            # dict.fromkeys keeps only the first occurrence of each entry
            # while preserving insertion order.
            clean_values = list(dict.fromkeys(values))
            if clean_values == values:
                continue

            # Sanity check: dropping duplicates must never lose or invent
            # an entry.
            if set(values) != set(clean_values):
                print("before: " + " ".join(values))
                print("after: " + " ".join(clean_values))
                sys.exit("Failed fixing exemplar.")

            setattr(language.exemplar_chars, attr, " ".join(clean_values))
            exemplar_values[attr] = {
                "before": values,
                "after": clean_values,
            }

        if not exemplar_values:
            # No attribute changed, so the file is left untouched.
            continue

        for exemplar, values in exemplar_values.items():
            before = values["before"]
            after = values["after"]
            counter = Counter(before)
            # (entry, number of surplus copies) for each duplicated entry.
            duplicates = [
                (g, c - 1) for g, c in counter.most_common() if c > 1
            ]
            print(
                f"Changed {path} {exemplar} exemplar:\n"
                f"- from {len(before)} ({len(set(before))} as set) "
                f"to {len(after)} elements\n"
                f"- removing {len(before) - len(after)} duplicate(s):\n"
                f"  {duplicates}\n"
            )

        # The context manager closes the file; no explicit close() needed.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: every command-line argument after the program
    # name is handed to main() as a path to process.
    import sys

    main(sys.argv[1:])
|