Merge commit '9f60a8d1bd982fe5dea1286b13b51d4ada1e0591' into lang-0.6.4
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
id: "bn_Beng"
|
||||
language: "bn"
|
||||
script: "Beng"
|
||||
name: "Bengali"
|
||||
name: "Bangla"
|
||||
autonym: "বাংলা"
|
||||
population: 267280377
|
||||
region: "BD"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
id: "brx_Beng"
|
||||
language: "brx"
|
||||
script: "Beng"
|
||||
name: "Bodo (Bengali)"
|
||||
name: "Bodo (Bangla)"
|
||||
region: "IN"
|
||||
sample_text {
|
||||
masthead_full: "গসবঙ"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
id: "ccp_Beng"
|
||||
language: "ccp"
|
||||
script: "Beng"
|
||||
name: "Chakma (Bengali)"
|
||||
name: "Chakma (Bangla)"
|
||||
population: 729137
|
||||
region: "BD"
|
||||
region: "IN"
|
||||
|
||||
@@ -2,5 +2,4 @@ id: "chn_Dupl"
|
||||
language: "chn"
|
||||
script: "Dupl"
|
||||
name: "Chinook Jargon (Duployan)"
|
||||
region: "US"
|
||||
region: "CA"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
id: "kha_Beng"
|
||||
language: "kha"
|
||||
script: "Beng"
|
||||
name: "Khasi (Bengali)"
|
||||
name: "Khasi (Bangla)"
|
||||
population: 0
|
||||
historical: true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
id: "khr_Beng"
|
||||
language: "khr"
|
||||
script: "Beng"
|
||||
name: "Kharia (Bengali)"
|
||||
name: "Kharia (Bangla)"
|
||||
region: "IN"
|
||||
sample_text {
|
||||
masthead_full: "সউবম"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
id: "kyw_Beng"
|
||||
language: "kyw"
|
||||
script: "Beng"
|
||||
name: "Kudmali (Bengali)"
|
||||
name: "Kudmali (Bangla)"
|
||||
region: "IN"
|
||||
sample_text {
|
||||
masthead_full: "সভমন"
|
||||
|
||||
@@ -2,7 +2,7 @@ id: "mi_Latn"
|
||||
language: "mi"
|
||||
script: "Latn"
|
||||
name: "Maori"
|
||||
autonym: "Māori"
|
||||
autonym: "te reo Māori"
|
||||
population: 137913
|
||||
region: "NZ"
|
||||
exemplar_chars {
|
||||
@@ -12,16 +12,16 @@ exemplar_chars {
|
||||
index: "A E H I K M N O P R T U W"
|
||||
}
|
||||
sample_text {
|
||||
masthead_full: "KkOo"
|
||||
masthead_partial: "Tt"
|
||||
styles: "No te mea na te whakanoa a na te whakahawea"
|
||||
tester: "No te mea ki te kore te tangata ae akina kia tae ki te tino hemanawatanga"
|
||||
poster_sm: "No te mea he"
|
||||
poster_md: "No te mea"
|
||||
poster_lg: "katoa"
|
||||
specimen_48: "No te mea ko nga iwi o roto i tenei Kotahitanga kua oati i runga"
|
||||
specimen_36: "No te mea e tutaki ai tenei oati he mea nui rawa kia matou te mano tini o te tangata ki enei tikanga rangatira."
|
||||
specimen_32: "Kaua tetahi tangata e hopukia noatia e te ringa o te ture e puritia noatia ranei i roto i tetahi whare herehere e peia noatia ranei ki tetahi whenua ke."
|
||||
specimen_21: "Ko ia tangata e tika ana kia whakatuturutia ki a ia tetahi whakawa tika ki te aroaro o te katoa e tetahi runanga wehekore whakahoahoa ranei, mo runga i te whakataunga i ona tika me nga tikanga hei whakarite mana tae atu hoki ki nga whakapae mona tera kua hara kino ia i raro i te ture.\nKo ia tangata e whai-tika ana ki nga ritenga o te noho pai o te iwi me te ao katoa, ma reira nei e tino tuturu ai nga tika me nga rangatiratanga kua whakararangitia nei ki roto i tenei Whakapuakitanga."
|
||||
specimen_16: "Kahore rawa i roto i tenei Whakapuakitanga tetahi mea e ahei ana kia whakamoaritia tera kei tetahi Mana Kawanatanga, kei tetahi ropu, kei tetahi tangata ranei tetahi mana ki te whakahaere i tetahi ritenga, ki te mahi ranei i tetahi mahi e anga atu ana hei tikanga turaki i tetahi o nga mano me nga rangatiratanga e mau ake nei.\nNo te mea na te whakanoa a na te whakahawea ki nga mana o te tangata i tupu ai nga mahi whakarihariha i pouri ai te ngakau tangata, a ko te kohaetanga o tetahi ao hou e mahorahora ai te tangata ki te korero ki te whakapono, ki te noho noa i runga i te rangimarie a i te ora, kua panuitia hei taumata mo te koingotanga o te ngakau o te mano tini o te tangata."
|
||||
masthead_full: "IiĀā"
|
||||
masthead_partial: "Ēē"
|
||||
styles: "I te whānautanga mai o te tangata, kāhore"
|
||||
tester: "E āhei ana ia tangata ki ngā tika me ngā herekoretanga e rār"
|
||||
poster_sm: "E whai tik"
|
||||
poster_md: "Kia ka"
|
||||
poster_lg: "mai"
|
||||
specimen_48: "Kia kaua te tangata e tukuna kia tūkinotia, kia whi"
|
||||
specimen_36: "Kia kaua te tangata e hopukina pokanoatia, e mauheretia pokanoatia rānei, e panaia pokanoatia rānei i te whenua."
|
||||
specimen_32: "E whai tika ana ia tangata ki te whakatā, ki te whakangahau hoki, me whakarite ngā hāora mahi kia pēnei, me whakawhiwhi hoki he wā hararei e utua ana."
|
||||
specimen_21: "E whai tika ana ia tangata ki tētahi punaha hapori i tōna whenua me te ao whānui e mana katoa ai ngā tika me ngā herekoretanga kei tēnei Whakapuakitanga e rārangi ana.\nKāhore he kōrero i tēnei Whakapuakitanga ka taea te kī māna e whai tika ai te Mana Whenua, te rōpū, te tangata rānei ki te mahi i tētahi mahi e korehāhātia ai ngā tika me ngā herekoretanga e rārangi ake nei."
|
||||
specimen_16: "I te whānautanga mai o te tangata, kāhore ōna here, e ōrite ana tōna mana me ōna tika ki te katoa. Ka whakatōkia ki roto i te tangata he wairua,\nhe hinengaro hoki, ā, me mahi tahi ia ki ngā tāngata o te ao i runga i te āhua o te tuakana me te teina.\nE whai tika ana ia tangata ki te ora, ki te noho herekore, ki te haumarutanga o te tinana.\nKia kaua te tangata e pupuritia hei taurekareka, hei pononga mā tētahi, ā, me aukati ngā āhuatanga katoa o te whakataurekareka i te tangata, o te hoko rānei i te tangata hei taurekareka.\nKia kaua te tangata e tukuna kia tūkinotia, kia whiua rānei ki te mahi whakawiri, whakāhawea rānei i a ia."
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
id: "sat_Beng"
|
||||
language: "sat"
|
||||
script: "Beng"
|
||||
name: "Santali (Bengali)"
|
||||
name: "Santali (Bangla)"
|
||||
population: 0
|
||||
historical: true
|
||||
|
||||
20
lang/Lib/gflanguages/data/languages/sq_Todr.textproto
Normal file
20
lang/Lib/gflanguages/data/languages/sq_Todr.textproto
Normal file
@@ -0,0 +1,20 @@
|
||||
id: "sq_Todr"
|
||||
language: "sq"
|
||||
script: "Todr"
|
||||
name: "Albanian (Todhri)"
|
||||
population: 0
|
||||
sample_text {
|
||||
masthead_full: ""
|
||||
masthead_partial: ""
|
||||
styles: " "
|
||||
tester: " "
|
||||
poster_sm: " "
|
||||
poster_md: " "
|
||||
poster_lg: ""
|
||||
specimen_48: " "
|
||||
specimen_36: " . ."
|
||||
specimen_32: " . ."
|
||||
specimen_21: " . .\n . ."
|
||||
specimen_16: " . .\n . .\n . .\n . ."
|
||||
note: "This is a transliteration of the text in sq_Latn"
|
||||
}
|
||||
@@ -18,11 +18,11 @@ exemplar_chars {
|
||||
index: "А Б В Г Ґ Д Е Є Ж З И І Ї Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ю Я"
|
||||
}
|
||||
sample_text {
|
||||
masthead_full: "ВвСс"
|
||||
masthead_partial: "Іі"
|
||||
styles: "Беручи до уваги, що визнання гідності, яка"
|
||||
masthead_full: "ҐґЄє"
|
||||
masthead_partial: "Її"
|
||||
styles: "Беручи до уваги, що визнання гідності"
|
||||
tester: "беручи до уваги, що зневажання і нехтування правами людини призвели"
|
||||
poster_sm: "беручи до уваги,"
|
||||
poster_sm: "беручи до уваги"
|
||||
poster_md: "беручи до"
|
||||
poster_lg: "Всі"
|
||||
specimen_48: "беручи до уваги, що народи Об\'єднаних Націй підтвердили в Статуті"
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
id: "Beng"
|
||||
name: "Bengali"
|
||||
name: "Bangla"
|
||||
|
||||
3
lang/Lib/gflanguages/data/scripts/Todr.textproto
Normal file
3
lang/Lib/gflanguages/data/scripts/Todr.textproto
Normal file
@@ -0,0 +1,3 @@
|
||||
id: "Todr"
|
||||
name: "Todhri"
|
||||
|
||||
252
lang/Lib/gflanguages/udhr.py
Normal file
252
lang/Lib/gflanguages/udhr.py
Normal file
@@ -0,0 +1,252 @@
|
||||
from gflanguages import languages_public_pb2
|
||||
import enum
|
||||
import re
|
||||
|
||||
|
||||
class Udhr:
    """A single UDHR (Universal Declaration of Human Rights) translation.

    Holds the translation's index metadata and, after ``Parse()`` or
    ``LoadArticleOne()``, its title, preamble and articles, from which
    sample texts can be extracted via ``GetSampleTexts()``.
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        # Index metadata for this translation (udhr.org catalogue fields).
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage  # translation maturity; Parse() ignores stage < 2
        self.loc = loc
        self.name = name

        # Parsed content; populated by Parse() / LoadArticleOne().
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Populate title, preamble and articles from a UDHR XML tree.

        Args:
          translation_data: parsed UDHR translation XML (element-tree-like
            object supporting ``find``/``findall`` with ``{*}`` wildcard
            namespaces), or None. Does nothing when None or when this
            translation's stage is below 2.
        """
        if translation_data is None or self.stage < 2:
            return

        # Cache each lookup instead of calling find() twice per element.
        title_data = translation_data.find("./{*}title")
        if title_data is not None:
            self.title = title_data.text

        preamble_data = translation_data.find("./{*}preamble")
        if preamble_data is not None:
            preamble_title = preamble_data.find("./{*}title")
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": [
                        para.text for para in preamble_data.findall("./{*}para")
                    ],
                }

        for article_data in translation_data.findall("./{*}article"):
            title_data = article_data.find("./{*}title")
            article = {
                "id": int(article_data.get("number")),
                "title": None if title_data is None else title_data.text,
                "content": [para.text for para in article_data.findall("./{*}para")],
            }
            self.articles.append(article)

    def LoadArticleOne(self, article_one):
        """Load a translation for which only Article 1 text is available."""
        self.articles.append({"id": 0, "title": None, "content": [article_one]})

    def GetSampleTexts(self):
        """Return sample texts extracted from this translation's content."""
        extractor = SampleTextExtractor(self)
        return extractor.GetSampleTexts()
|
||||
|
||||
|
||||
class SampleTextExtractor:
    """Derives the fixed set of sample-text strings from a UDHR translation.

    Walks the translation's glyphs, words and paragraphs, pulling out pieces
    whose display length falls within the size band each sample slot needs.
    """

    class TextType(enum.Enum):
        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        self._udhr = udhr
        # Lazy iterators shared across extractions so each request consumes
        # fresh material instead of restarting from the top of the text.
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        self._phrase_history = set()  # phrases already handed out; avoid repeats

        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

    def _DisplayLength(self, s):
        """Returns length of given string. Omits combining characters.

        Some entire scripts will not be counted; in those cases, the raw length of
        the string is returned.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        if word_space_length == space_length:
            # Stripping non-word characters removed everything but whitespace,
            # i.e. the script has no \w characters: fall back to raw length.
            return len(s)
        return word_space_length

    def _GetGlyphs(self):
        """Yields each distinct glyph (lower-cased), in order of appearance."""
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                # Prefer word characters; if the script yields none, strip
                # only whitespace instead.
                for ch in self._non_word_regex.sub("", para) or self._space_regex.sub(
                    "", para
                ):
                    ch = ch.lower()
                    if ch not in seen:
                        seen.add(ch)
                        yield ch

    def _GetWords(self):
        """Yields each distinct token, splitting on the script's separator."""
        # Scripts that use spaces split on whitespace; otherwise fall back to
        # splitting on any non-word character.
        if self._space_regex.search(self._udhr.articles[0]["content"][0]) is not None:
            splitter = self._space_regex
        else:
            splitter = self._non_word_regex

        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                for s in splitter.split(para):
                    if s not in seen:
                        seen.add(s)
                        yield s

    def _GetParagraphs(self):
        """Yields every paragraph: preamble (if any) then all articles."""
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Builds an "AbCd"-style run of upper/lower glyph pairs.

        NOTE(review): max_chars is accepted for signature symmetry but not
        enforced; callers always pass min_chars == max_chars here.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            if ch != ch.upper():  # bicameral glyph: also show the lowercase form
                s += ch
                if len(s) >= min_chars:
                    break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Returns one word whose display length lies in [min_chars, max_chars]."""
        for iterator in [self._words, self._GetWords()]:
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractPhrase(self, min_chars, max_chars):
        """Returns an unused phrase whose display length lies in range."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                # Try progressively coarser break points: any character,
                # whitespace, then punctuation/non-word runs.
                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
                    breaks = [-1]
                    # Only look for break points at or past min_chars.
                    for match in regex.finditer(para, min_chars):
                        breaks.append(match.start())
                    phrase = para[breaks[0] + 1 : breaks[-1]]
                    p_size = self._DisplayLength(phrase)
                    # Drop trailing segments until the phrase fits max_chars.
                    while p_size > max_chars and len(breaks) > 1:
                        breaks.pop()
                        phrase = para[breaks[0] + 1 : breaks[-1]]
                        p_size = self._DisplayLength(phrase)
                    if min_chars <= p_size and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        # Sentence delimination may differ between scripts, so tokenizing on spaces
        # would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Returns one paragraph whose display length lies in range."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Joins paragraphs (repeating them for short texts) into a passage.

        Keeps appending paragraphs until the joined display length reaches
        min_chars without exceeding max_chars; a paragraph that would
        overshoot max_chars is dropped again (the oversized length also ends
        the outer loop).
        """
        p = []
        p_size = 0
        while p_size < min_chars:
            for iterator in [self._paragraphs, self._GetParagraphs()]:
                for para in iterator:
                    if para is None:
                        continue
                    p.append(para)
                    p_size = self._DisplayLength(" ".join(p))
                    if max_chars < p_size:
                        p.pop()
                    elif min_chars <= p_size:
                        return "\n".join(p)
        # Raise instead of the original `assert` (asserts vanish under -O);
        # the original's post-assert `if len(p) == 0` fallback was dead code.
        if not p:
            raise ValueError("Unable to extract passage: " + self._udhr.key)
        return "\n".join(p)

    def _Get(self, text_type, **kwargs):
        """Dispatches to the extractor for text_type.

        Accepts either char_count=N (exact size) or min_chars=/max_chars=.
        """
        if "char_count" in kwargs:
            min_chars = kwargs["char_count"]
            max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]
        extractors = {
            self.TextType.GLYPHS: self._ExtractGlyphs,
            self.TextType.WORD: self._ExtractWord,
            self.TextType.PHRASE: self._ExtractPhrase,
            self.TextType.SENTENCE: self._ExtractSentence,
            self.TextType.PARAGRAPH: self._ExtractParagraph,
            self.TextType.PASSAGE: self._ExtractPassage,
        }
        if text_type not in extractors:
            # The original built the message with `"..." + text_type`, which
            # raised TypeError (str + Enum) instead of reporting the problem.
            raise ValueError(f"Unsupported text type: {text_type}")
        return extractors[text_type](min_chars, max_chars)

    def GetSampleTexts(self):
        """Returns a populated SampleTextProto for this translation."""
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text
|
||||
65
lang/snippets/lang_sample_text.py
Executable file
65
lang/snippets/lang_sample_text.py
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
lang-sample-text
|
||||
|
||||
Adds sample text for a given language using the specified UDHR translation.
|
||||
|
||||
Usage:
|
||||
|
||||
lang-sample-text -l ./languages/en.textproto ./udhr_translations/en.xml
|
||||
|
||||
"""
|
||||
|
||||
from gflanguages import LoadLanguages, languages_public_pb2
|
||||
from gftools.util.google_fonts import ReadProto, WriteProto
|
||||
from gflanguages.udhr import Udhr
|
||||
from lxml import etree
|
||||
import os
|
||||
import re
|
||||
import argparse
|
||||
|
||||
|
||||
def main(argv=None):
    """Merge UDHR-derived sample text into a language textproto.

    Parses the UDHR translation XML given by --udhr, extracts sample texts
    from it, merges them into the LanguageProto file given by --lang, and
    rewrites that file in place.

    Args:
      argv: optional argument list (defaults to sys.argv via argparse).
    """
    parser = argparse.ArgumentParser(
        description="Update UDHR sample text for a given language"
    )
    parser.add_argument(
        "-l",
        "--lang",
        help="Language proto file to update",
        required=True,
    )
    parser.add_argument(
        "-u",
        "--udhr",
        help="Path to UDHR translation (XML)",
        required=True,
    )
    args = parser.parse_args(argv)

    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)

    udhr_data = etree.parse(args.udhr)
    head = udhr_data.getroot()
    # The namespaced xml:lang attribute carries the BCP 47 tag; normalize it
    # to underscore form. Initialize first so a translation without the
    # attribute no longer crashes with NameError.
    bcp47 = None
    for name, value in head.attrib.items():
        if re.search(r"\{.*\}lang", name):
            bcp47 = value.replace("-", "_")
            break
    udhr = Udhr(
        key=head.get("key"),
        iso639_3=head.get("iso639-3"),
        iso15924=head.get("iso15924"),
        bcp47=bcp47,
        direction=head.get("dir"),
        ohchr=None,
        stage=4,  # assume the supplied translation is publishable
        loc=None,
        name=head.get("n"),
    )
    udhr.Parse(udhr_data)

    language.sample_text.MergeFrom(udhr.GetSampleTexts())
    WriteProto(language, args.lang)
|
||||
Reference in New Issue
Block a user