57 lines
1.7 KiB
Python
57 lines
1.7 KiB
Python
from collections import Counter
|
|
import unicodedata
|
|
from google.protobuf import text_format
|
|
from gflanguages import languages_public_pb2
|
|
|
|
# Attribute names, spelled out as an explicit list.
# NOTE(review): nothing in the visible code reads this constant; presumably
# these are the field names of ``exemplar_chars`` -- confirm before relying on it.
ATTRIBUTES = [
    "base",
    "auxiliary",
    "marks",
    "punctuation",
    "index",
]
|
|
|
|
|
|
# U+25CC DOTTED CIRCLE, the placeholder prefixed to combining marks.
_DOTTED_CIRCLE = "\u25CC"


def _split_bases_and_marks(bases, marks):
    """Return ``(new_bases, new_marks)`` with combining marks moved out of bases.

    Every non-empty entry of *marks* is prefixed with a dotted circle if it
    does not already start with one, and duplicates are dropped (first
    occurrence wins).  Every non-empty entry of *bases* has one leading
    dotted circle stripped; if its first character is then a combining mark
    (Unicode general category ``Mn`` or ``Mc``) the entry is appended to the
    marks in dotted-circle form, otherwise it stays in the bases.  Base
    entries are not de-duplicated.
    """
    new_marks = []
    seen_marks = set()  # O(1) membership test; new_marks keeps the order

    def add_mark(entry):
        if not entry.startswith(_DOTTED_CIRCLE):
            entry = _DOTTED_CIRCLE + entry
        if entry not in seen_marks:
            seen_marks.add(entry)
            new_marks.append(entry)

    for entry in marks:
        if entry:
            add_mark(entry)

    new_bases = []
    for entry in bases:
        if not entry:
            continue
        # Strip the placeholder only when something follows it: a lone
        # dotted circle would otherwise become "" and break the category
        # lookup below.  Such an entry is kept as an ordinary base.
        if len(entry) > 1 and entry[0] == _DOTTED_CIRCLE:
            entry = entry[1:]
        if unicodedata.category(entry[0]) in ("Mn", "Mc"):
            add_mark(entry)
        else:
            new_bases.append(entry)

    return new_bases, new_marks


def main(args=None):
    """Normalize the base/marks exemplar characters of language files.

    Each path is read as a text-format ``LanguageProto``.  Files whose
    ``exemplar_chars.base`` is empty are skipped without being rewritten.
    For all others, combining marks listed under ``base`` are moved to
    ``marks``, every mark is written with a leading dotted circle, and the
    message is serialized back to the same path.

    Args:
        args: Iterable of file paths.  When ``None``, the command-line
            arguments ``sys.argv[1:]`` are used.
    """
    if args is None:
        # Local import: the module only imports sys inside its __main__ guard.
        import sys

        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        bases = language.exemplar_chars.base.split(" ")
        marks = language.exemplar_chars.marks.split(" ")
        if not bases or bases == [""]:
            # "".split(" ") yields [""]: there are no base exemplars to fix.
            continue

        new_bases, new_marks = _split_bases_and_marks(bases, marks)
        language.exemplar_chars.base = " ".join(new_bases)
        language.exemplar_chars.marks = " ".join(new_marks)

        # The with-block closes the file; no explicit close() is needed.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Script entry point: hand every command-line argument after the
    # program name to main() as the list of paths to process.
    cli_paths = sys.argv[1:]
    main(cli_paths)
|