1
0
mirror of synced 2025-12-19 17:48:10 -05:00

Merge commit '9f60a8d1bd982fe5dea1286b13b51d4ada1e0591' into lang-0.6.4

This commit is contained in:
Simon Cozens
2024-09-12 14:39:13 +01:00
15 changed files with 365 additions and 26 deletions

View File

@@ -1,7 +1,7 @@
id: "bn_Beng"
language: "bn"
script: "Beng"
name: "Bengali"
name: "Bangla"
autonym: "বাংলা"
population: 267280377
region: "BD"

View File

@@ -1,7 +1,7 @@
id: "brx_Beng"
language: "brx"
script: "Beng"
name: "Bodo (Bengali)"
name: "Bodo (Bangla)"
region: "IN"
sample_text {
masthead_full: "গসবঙ"

View File

@@ -1,7 +1,7 @@
id: "ccp_Beng"
language: "ccp"
script: "Beng"
name: "Chakma (Bengali)"
name: "Chakma (Bangla)"
population: 729137
region: "BD"
region: "IN"

View File

@@ -2,5 +2,4 @@ id: "chn_Dupl"
language: "chn"
script: "Dupl"
name: "Chinook Jargon (Duployan)"
region: "US"
region: "CA"

View File

@@ -1,6 +1,6 @@
id: "kha_Beng"
language: "kha"
script: "Beng"
name: "Khasi (Bengali)"
name: "Khasi (Bangla)"
population: 0
historical: true

View File

@@ -1,7 +1,7 @@
id: "khr_Beng"
language: "khr"
script: "Beng"
name: "Kharia (Bengali)"
name: "Kharia (Bangla)"
region: "IN"
sample_text {
masthead_full: "সউবম"

View File

@@ -1,7 +1,7 @@
id: "kyw_Beng"
language: "kyw"
script: "Beng"
name: "Kudmali (Bengali)"
name: "Kudmali (Bangla)"
region: "IN"
sample_text {
masthead_full: "সভমন"

View File

@@ -2,7 +2,7 @@ id: "mi_Latn"
language: "mi"
script: "Latn"
name: "Maori"
autonym: "Māori"
autonym: "te reo Māori"
population: 137913
region: "NZ"
exemplar_chars {
@@ -12,16 +12,16 @@ exemplar_chars {
index: "A E H I K M N O P R T U W"
}
sample_text {
masthead_full: "KkOo"
masthead_partial: "Tt"
styles: "No te mea na te whakanoa a na te whakahawea"
tester: "No te mea ki te kore te tangata ae akina kia tae ki te tino hemanawatanga"
poster_sm: "No te mea he"
poster_md: "No te mea"
poster_lg: "katoa"
specimen_48: "No te mea ko nga iwi o roto i tenei Kotahitanga kua oati i runga"
specimen_36: "No te mea e tutaki ai tenei oati he mea nui rawa kia matou te mano tini o te tangata ki enei tikanga rangatira."
specimen_32: "Kaua tetahi tangata e hopukia noatia e te ringa o te ture e puritia noatia ranei i roto i tetahi whare herehere e peia noatia ranei ki tetahi whenua ke."
specimen_21: "Ko ia tangata e tika ana kia whakatuturutia ki a ia tetahi whakawa tika ki te aroaro o te katoa e tetahi runanga wehekore whakahoahoa ranei, mo runga i te whakataunga i ona tika me nga tikanga hei whakarite mana tae atu hoki ki nga whakapae mona tera kua hara kino ia i raro i te ture.\nKo ia tangata e whai-tika ana ki nga ritenga o te noho pai o te iwi me te ao katoa, ma reira nei e tino tuturu ai nga tika me nga rangatiratanga kua whakararangitia nei ki roto i tenei Whakapuakitanga."
specimen_16: "Kahore rawa i roto i tenei Whakapuakitanga tetahi mea e ahei ana kia whakamoaritia tera kei tetahi Mana Kawanatanga, kei tetahi ropu, kei tetahi tangata ranei tetahi mana ki te whakahaere i tetahi ritenga, ki te mahi ranei i tetahi mahi e anga atu ana hei tikanga turaki i tetahi o nga mano me nga rangatiratanga e mau ake nei.\nNo te mea na te whakanoa a na te whakahawea ki nga mana o te tangata i tupu ai nga mahi whakarihariha i pouri ai te ngakau tangata, a ko te kohaetanga o tetahi ao hou e mahorahora ai te tangata ki te korero ki te whakapono, ki te noho noa i runga i te rangimarie a i te ora, kua panuitia hei taumata mo te koingotanga o te ngakau o te mano tini o te tangata."
masthead_full: "IiĀā"
masthead_partial: "Ēē"
styles: "I te whānautanga mai o te tangata, kāhore"
tester: "E āhei ana ia tangata ki ngā tika me ngā herekoretanga e rār"
poster_sm: "E whai tik"
poster_md: "Kia ka"
poster_lg: "mai"
specimen_48: "Kia kaua te tangata e tukuna kia tūkinotia, kia whi"
specimen_36: "Kia kaua te tangata e hopukina pokanoatia, e mauheretia pokanoatia rānei, e panaia pokanoatia rānei i te whenua."
specimen_32: "E whai tika ana ia tangata ki te whakatā, ki te whakangahau hoki, me whakarite ngā hāora mahi kia pēnei, me whakawhiwhi hoki he wā hararei e utua ana."
specimen_21: "E whai tika ana ia tangata ki tētahi punaha hapori i tōna whenua me te ao whānui e mana katoa ai ngā tika me ngā herekoretanga kei tēnei Whakapuakitanga e rārangi ana.\nKāhore he kōrero i tēnei Whakapuakitanga ka taea te kī māna e whai tika ai te Mana Whenua, te rōpū, te tangata rānei ki te mahi i tētahi mahi e korehāhātia ai ngā tika me ngā herekoretanga e rārangi ake nei."
specimen_16: "I te whānautanga mai o te tangata, kāhore ōna here, e ōrite ana tōna mana me ōna tika ki te katoa. Ka whakatōkia ki roto i te tangata he wairua,\nhe hinengaro hoki, ā, me mahi tahi ia ki ngā tāngata o te ao i runga i te āhua o te tuakana me te teina.\nE whai tika ana ia tangata ki te ora, ki te noho herekore, ki te haumarutanga o te tinana.\nKia kaua te tangata e pupuritia hei taurekareka, hei pononga mā tētahi, ā, me aukati ngā āhuatanga katoa o te whakataurekareka i te tangata, o te hoko rānei i te tangata hei taurekareka.\nKia kaua te tangata e tukuna kia tūkinotia, kia whiua rānei ki te mahi whakawiri, whakāhawea rānei i a ia."
}

View File

@@ -1,6 +1,6 @@
id: "sat_Beng"
language: "sat"
script: "Beng"
name: "Santali (Bengali)"
name: "Santali (Bangla)"
population: 0
historical: true

View File

@@ -0,0 +1,20 @@
id: "sq_Todr"
language: "sq"
script: "Todr"
name: "Albanian (Todhri)"
population: 0
sample_text {
masthead_full: "𐗎𐗒𐗢𐗐𐗊"
masthead_partial: "𐗢𐗊"
styles: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊"
tester: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢"
poster_sm: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢"
poster_md: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊"
poster_lg: "𐗎𐗒𐗢𐗐𐗊"
specimen_48: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊"
specimen_36: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒."
specimen_32: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒."
specimen_21: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒.\n𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒."
specimen_16: "𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒.\n𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒.\n𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒.\n𐗢𐗊 𐗎𐗒𐗢𐗐𐗊 𐗙𐗉𐗝𐗊𐗬𐗒𐗢 𐗕𐗒𐗇𐗒𐗘 𐗢𐗊 𐗕𐗒𐗝𐗊 𐗈𐗉 𐗢𐗊 𐗂𐗀𐗝𐗀𐗂𐗀𐗝𐗢𐗊 𐗘𐗊 𐗆𐗒𐗙𐗒𐗢𐗉𐗢 𐗈𐗉 𐗘𐗊 𐗢𐗊 𐗆𐗝𐗉𐗓𐗢𐗀. 𐗀𐗢𐗀 𐗔𐗀𐗘𐗊 𐗀𐗝𐗟𐗪𐗉 𐗈𐗉 𐗇𐗊𐗝𐗎𐗉𐗎𐗉 𐗈𐗉 𐗆𐗤𐗐𐗉𐗢 𐗢𐗊 𐗟𐗒𐗖𐗉𐗘 𐗇𐗀𐗓 𐗙𐗊𐗝𐗒 𐗢𐗓𐗉𐗢𐗝𐗒𐗢 𐗗𐗉 𐗋𐗝𐗪𐗗𐗊 𐗥𐗊𐗖𐗀𐗬𐗊𐗝𐗒𐗗𐗒."
note: "This is a transliteration of the text in sq_Latn"
}

View File

@@ -18,11 +18,11 @@ exemplar_chars {
index: "А Б В Г Ґ Д Е Є Ж З И І Ї Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ю Я"
}
sample_text {
masthead_full: "ВвСс"
masthead_partial: "Іі"
styles: "Беручи до уваги, що визнання гідності, яка"
masthead_full: "ҐґЄє"
masthead_partial: "Її"
styles: "Беручи до уваги, що визнання гідності"
tester: "беручи до уваги, що зневажання і нехтування правами людини призвели"
poster_sm: "беручи до уваги,"
poster_sm: "беручи до уваги"
poster_md: "беручи до"
poster_lg: "Всі"
specimen_48: "беручи до уваги, що народи Об\'єднаних Націй підтвердили в Статуті"

View File

@@ -1,2 +1,2 @@
id: "Beng"
name: "Bengali"
name: "Bangla"

View File

@@ -0,0 +1,3 @@
id: "Todr"
name: "Todhri"

View File

@@ -0,0 +1,252 @@
from gflanguages import languages_public_pb2
import enum
import re
class Udhr:
    """A single translation of the Universal Declaration of Human Rights.

    Holds the metadata for one UDHR translation (language codes, script,
    direction, review stage) and, after `Parse`, its text: `title`,
    `preamble` (a {"title", "content"} dict or None) and `articles`
    (a list of {"id", "title", "content"} dicts).
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage
        self.loc = loc
        self.name = name
        # Populated by Parse() / LoadArticleOne().
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Populates title/preamble/articles from a UDHR XML element tree.

        Args:
          translation_data: root element of a UDHR translation document
            (any namespace; child tags are matched with the `{*}` wildcard).
            May be None, in which case nothing is loaded.

        Translations below stage 2 are unreviewed drafts and are skipped.
        """
        if translation_data is None or self.stage < 2:
            return

        # Look each element up once instead of repeating the XPath query
        # for the None check and again for the text access.
        title_data = translation_data.find("./{*}title")
        if title_data is not None:
            self.title = title_data.text

        preamble_data = translation_data.find("./{*}preamble")
        if preamble_data is not None:
            preamble_title = preamble_data.find("./{*}title")
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": [
                        para.text for para in preamble_data.findall("./{*}para")
                    ],
                }

        for article_data in translation_data.findall("./{*}article"):
            title_data = article_data.find("./{*}title")
            article = {
                "id": int(article_data.get("number")),
                "title": None if title_data is None else title_data.text,
                "content": [para.text for para in article_data.findall("./{*}para")],
            }
            self.articles.append(article)

    def LoadArticleOne(self, article_one):
        """Loads a bare Article 1 string when no full XML translation exists."""
        self.articles.append({"id": 0, "title": None, "content": [article_one]})

    def GetSampleTexts(self):
        """Returns a SampleTextProto derived from this translation's text."""
        extractor = SampleTextExtractor(self)
        return extractor.GetSampleTexts()
class SampleTextExtractor:
    """Extracts sample-text snippets of target display sizes from a Udhr.

    Pulls glyph pairs, words, phrases, paragraphs and multi-paragraph
    passages out of the parsed UDHR articles (and preamble) to fill the
    fields of a SampleTextProto.
    """

    class TextType(enum.Enum):
        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        self._udhr = udhr
        # Shared iterators: successive extractions keep consuming fresh
        # material rather than restarting from the beginning each time.
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        # Phrases already handed out, so the same snippet is not reused
        # for two different sample-text fields.
        self._phrase_history = set()
        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

    def _DisplayLength(self, s):
        """Returns length of given string. Omits combining characters.

        Some entire scripts will not be counted; in those cases, the raw length of
        the string is returned.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        # If stripping marks removed everything but whitespace, the script's
        # characters are all non-\w; fall back to the raw length.
        if word_space_length == space_length:
            return len(s)
        return word_space_length

    def _GetGlyphs(self):
        """Yields each distinct (lowercased) glyph in article order."""
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                # Prefer word characters; if the text has none (non-\w
                # script), fall back to everything that isn't whitespace.
                for ch in self._non_word_regex.sub("", para) or self._space_regex.sub(
                    "", para
                ):
                    ch = ch.lower()
                    if ch not in seen:
                        seen.add(ch)
                        yield ch

    def _GetWords(self):
        """Yields each distinct token, split on spaces when the text has any."""
        if self._space_regex.search(self._udhr.articles[0]["content"][0]) is not None:
            splitter = self._space_regex
        else:
            # Space-less script: split on non-word characters instead.
            splitter = self._non_word_regex
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                for s in splitter.split(para):
                    if s not in seen:
                        seen.add(s)
                        yield s

    def _GetParagraphs(self):
        """Yields every paragraph: preamble first (if any), then articles."""
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Returns a string of distinct glyphs, upper+lower paired.

        NOTE(review): max_chars is accepted for signature symmetry with the
        other extractors but is not enforced here; the pairing below can
        yield min_chars (+1) characters at most.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            # Caseful scripts: follow each uppercase form with its lowercase.
            if ch != ch.upper():
                s += ch
                if len(s) >= min_chars:
                    break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Returns one unused word whose display length is within bounds."""
        # Second pass restarts from the beginning if the shared iterator
        # has been exhausted by earlier extractions.
        for iterator in [self._words, self._GetWords()]:
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractPhrase(self, min_chars, max_chars):
        """Returns an unused phrase within bounds, cut at a natural break."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                # Try progressively coarser break points: any character,
                # then whitespace, then punctuation/non-word runs.
                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
                    breaks = [-1]
                    for match in regex.finditer(para, min_chars):
                        breaks.append(match.start())
                    phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
                    p_size = self._DisplayLength(phrase)
                    # Drop trailing break points until the phrase fits.
                    while p_size > max_chars and len(breaks) > 1:
                        breaks.pop()
                        phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
                        p_size = self._DisplayLength(phrase)
                    if min_chars <= p_size and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        # No single paragraph produced a fitting phrase; try whole paragraphs.
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        # Sentence delimination may differ between scripts, so tokenizing on spaces
        # would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Returns one whole paragraph whose display length is within bounds."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Returns consecutive paragraphs joined by newlines, within bounds."""
        p = []
        p_size = 0
        while p_size < min_chars:
            for iterator in [self._paragraphs, self._GetParagraphs()]:
                for para in iterator:
                    if para is None:
                        continue
                    p.append(para)
                    p_size = self._DisplayLength(" ".join(p))
                    if max_chars < p_size:
                        # Overshot: drop the paragraph that pushed us over.
                        p.pop()
                    elif min_chars <= p_size:
                        return "\n".join(p)
        assert len(p) > 0, "Unable to extract passage: " + self._udhr.key
        # Defensive fallback for when assertions are stripped (python -O).
        if len(p) == 0:
            p.append([p for p in self._GetParagraphs()][0])
        return "\n".join(p)

    def _Get(self, text_type, **kwargs):
        """Dispatches to the extractor for text_type.

        Accepts either char_count=N (exact) or min_chars=/max_chars= bounds.
        """
        if "char_count" in kwargs:
            min_chars = kwargs["char_count"]
            max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]
        if text_type == self.TextType.GLYPHS:
            return self._ExtractGlyphs(min_chars, max_chars)
        if text_type == self.TextType.WORD:
            return self._ExtractWord(min_chars, max_chars)
        if text_type == self.TextType.PHRASE:
            return self._ExtractPhrase(min_chars, max_chars)
        if text_type == self.TextType.SENTENCE:
            return self._ExtractSentence(min_chars, max_chars)
        if text_type == self.TextType.PARAGRAPH:
            return self._ExtractParagraph(min_chars, max_chars)
        if text_type == self.TextType.PASSAGE:
            return self._ExtractPassage(min_chars, max_chars)
        # str(...) fix: concatenating the enum member directly would raise
        # TypeError and mask the intended error message.
        raise Exception("Unsupported text type: " + str(text_type))

    def GetSampleTexts(self):
        """Builds a fully-populated SampleTextProto from the UDHR text."""
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
lang-sample-text
Adds sample text for a given language using the specified UDHR translation.
Usage:
lang-sample-text -l ./languages/en.textproto ./udhr_translations/en.xml
"""
from gflanguages import LoadLanguages, languages_public_pb2
from gftools.util.google_fonts import ReadProto, WriteProto
from gflanguages.udhr import Udhr
from lxml import etree
import os
import re
import argparse
def main(argv=None):
    """Merges UDHR-derived sample text into a language textproto.

    Reads the language proto named by --lang, parses the UDHR XML named by
    --udhr, extracts sample texts from it, merges them into the proto's
    sample_text field, and writes the proto back in place.

    Args:
      argv: optional argument list for argparse (defaults to sys.argv).
    """
    parser = argparse.ArgumentParser(
        description="Update UDHR sample text for a given language"
    )
    parser.add_argument(
        "-l",
        "--lang",
        help="Language proto file to update",
        required=True,
    )
    parser.add_argument(
        "-u",
        "--udhr",
        help="Path to UDHR translation (XML)",
        required=True,
    )
    args = parser.parse_args(argv)

    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)
    udhr_data = etree.parse(args.udhr)
    head = udhr_data.getroot()

    # The root's language attribute is namespaced (e.g. {http://...}lang),
    # so scan attributes rather than using a fixed key. Default to None so
    # a document without the attribute doesn't raise NameError below.
    bcp47 = None
    for name, value in head.attrib.items():
        if re.search(r"\{.*\}lang", name):
            bcp47 = value.replace("-", "_")
    udhr = Udhr(
        key=head.get("key"),
        iso639_3=head.get("iso639-3"),
        iso15924=head.get("iso15924"),
        bcp47=bcp47,
        direction=head.get("dir"),
        ohchr=None,
        stage=4,  # treat the supplied translation as fully reviewed
        loc=None,
        name=head.get("n"),
    )
    udhr.Parse(udhr_data)
    language.sample_text.MergeFrom(udhr.GetSampleTexts())
    WriteProto(language, args.lang)


if __name__ == "__main__":
    main()