From 2422ad05cf63931beadcf000486a87fceb1eb1ca Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Wed, 1 May 2024 14:37:57 +0100 Subject: [PATCH 01/11] Add Denis and myself to contributors --- CONTRIBUTORS.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 6428befe0..f552619c3 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -24,5 +24,7 @@ # Please keep the list sorted. # (first name; alphabetical order) +Denis Moyogo Jacquerye Felipe Correa da Silva Sanches Marc Foley +Simon Cozens From 402689da2c4ed67978b001ed28dd202c6d2cd85a Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Wed, 1 May 2024 14:38:26 +0100 Subject: [PATCH 02/11] Revert "Add our own parse function for exemplars" This reverts commit b57fdba5341a7f65e284583412d463baf56ff3ca. --- Lib/gflanguages/__init__.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Lib/gflanguages/__init__.py b/Lib/gflanguages/__init__.py index c7b0bc618..2957742e9 100644 --- a/Lib/gflanguages/__init__.py +++ b/Lib/gflanguages/__init__.py @@ -21,7 +21,6 @@ data on the Google Fonts collection. 
""" import glob import os -import unicodedata from gflanguages import languages_public_pb2 from google.protobuf import text_format @@ -72,18 +71,3 @@ def LoadRegions(base_dir=DATA_DIR): region = text_format.Parse(f.read(), languages_public_pb2.RegionProto()) regions[region.id] = region return regions - - -def parse(exemplars: str): - """Parses a list of exemplar characters into a set of codepoints.""" - codepoints = set() - for chars in exemplars.split(): - if len(chars) > 1: - chars = chars.lstrip("{").rstrip("}") - normalized_chars = unicodedata.normalize("NFC", chars) - if normalized_chars != chars: - for char in normalized_chars: - codepoints.add(char) - for char in chars: - codepoints.add(char) - return codepoints From 8a1a4fb8587c43ad2c9f0048112fa56add7ed124 Mon Sep 17 00:00:00 2001 From: Denis Moyogo Jacquerye Date: Wed, 1 May 2024 14:38:38 +0100 Subject: [PATCH 03/11] Add our own parse function for exemplars --- Lib/gflanguages/__init__.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Lib/gflanguages/__init__.py b/Lib/gflanguages/__init__.py index 2957742e9..c7b0bc618 100644 --- a/Lib/gflanguages/__init__.py +++ b/Lib/gflanguages/__init__.py @@ -21,6 +21,7 @@ data on the Google Fonts collection. 
""" import glob import os +import unicodedata from gflanguages import languages_public_pb2 from google.protobuf import text_format @@ -71,3 +72,18 @@ def LoadRegions(base_dir=DATA_DIR): region = text_format.Parse(f.read(), languages_public_pb2.RegionProto()) regions[region.id] = region return regions + + +def parse(exemplars: str): + """Parses a list of exemplar characters into a set of codepoints.""" + codepoints = set() + for chars in exemplars.split(): + if len(chars) > 1: + chars = chars.lstrip("{").rstrip("}") + normalized_chars = unicodedata.normalize("NFC", chars) + if normalized_chars != chars: + for char in normalized_chars: + codepoints.add(char) + for char in chars: + codepoints.add(char) + return codepoints From 8089af9a8650359783411b27ddb610b1d92edcfb Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 14 May 2024 13:35:23 +0100 Subject: [PATCH 04/11] Stricter parsable test --- tests/test_parsable.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_parsable.py b/tests/test_parsable.py index 8f32a2ee0..a0169774b 100644 --- a/tests/test_parsable.py +++ b/tests/test_parsable.py @@ -7,9 +7,16 @@ from google.protobuf import text_format languages_dir = os.path.join(DATA_DIR, "languages") -textproto_files = [os.path.basename(x) for x in glob.iglob(os.path.join(languages_dir, "*.textproto"))] +textproto_files = [ + os.path.basename(x) for x in glob.iglob(os.path.join(languages_dir, "*.textproto")) +] + @pytest.mark.parametrize("lang_code", textproto_files) def test_parsable(lang_code): with open(os.path.join(languages_dir, lang_code), "r", encoding="utf-8") as f: - text_format.Parse(f.read(), languages_public_pb2.LanguageProto()) + msg = text_format.Parse(f.read(), languages_public_pb2.LanguageProto()) + assert msg.id is not None + assert msg.language is not None + assert msg.script is not None + assert msg.population is not None From 2da265a1542f7e25a285218a08dae1d2d609fc90 Mon Sep 17 00:00:00 2001 From: Simon Cozens 
Date: Tue, 14 May 2024 13:35:36 +0100 Subject: [PATCH 05/11] Language names must be unique --- tests/test_data_languages.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py index 315275d95..3a8ef6b6e 100644 --- a/tests/test_data_languages.py +++ b/tests/test_data_languages.py @@ -265,3 +265,10 @@ def test_exemplar_parser(): "l", "̍", } + + +def test_language_uniqueness(): + names = Counter([lang.name for lang in LANGUAGES.values()]) + if any(count > 1 for count in names.values()): + duplicates = {name: count for name, count in names.items() if count > 1} + pytest.fail(f"Duplicate language names: {duplicates}") From ae4ecc2b51adf6c08c578cec2e47d10441c980f7 Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 14 May 2024 13:58:39 +0100 Subject: [PATCH 06/11] Uniquify language names --- Lib/gflanguages/data/languages/bdh_Latn.textproto | 5 +++-- Lib/gflanguages/data/languages/beh_Latn.textproto | 4 ++-- Lib/gflanguages/data/languages/bkc_Latn.textproto | 4 ++-- Lib/gflanguages/data/languages/bsc_Latn.textproto | 2 +- Lib/gflanguages/data/languages/bsc_Latn_GN.textproto | 2 +- Lib/gflanguages/data/languages/bsq_Bass.textproto | 2 +- Lib/gflanguages/data/languages/bsq_Latn.textproto | 4 ++-- Lib/gflanguages/data/languages/crh_Cyrl.textproto | 2 +- Lib/gflanguages/data/languages/dnj_Latn_LR.textproto | 2 +- Lib/gflanguages/data/languages/evn_Latn.textproto | 2 +- Lib/gflanguages/data/languages/kr_Arab.textproto | 2 +- Lib/gflanguages/data/languages/mlt_Latn.textproto | 11 ----------- Lib/gflanguages/data/languages/sa_Nand.textproto | 2 +- Lib/gflanguages/data/languages/wal_Ethi.textproto | 2 +- Lib/gflanguages/data/languages/wal_Latn.textproto | 4 ++-- Lib/gflanguages/data/languages/xsm_Latn_BF.textproto | 2 +- 16 files changed, 21 insertions(+), 31 deletions(-) delete mode 100644 Lib/gflanguages/data/languages/mlt_Latn.textproto diff --git a/Lib/gflanguages/data/languages/bdh_Latn.textproto 
b/Lib/gflanguages/data/languages/bdh_Latn.textproto index 6e53be3f8..963736278 100644 --- a/Lib/gflanguages/data/languages/bdh_Latn.textproto +++ b/Lib/gflanguages/data/languages/bdh_Latn.textproto @@ -1,11 +1,12 @@ id: "bdh_Latn" language: "bdh" script: "Latn" -name: "Baka" +name: "Baka (South Sudan/Congo)" +autonym: "Tara Baká" population: 60000 region: "CD" region: "SS" exemplar_chars { base: "a A b B c C d D e E f F g G h H i I ị Ị ɨ Ɨ k K l L m M n N ṇ Ṇ o O p P r R ṛ Ṛ s S t T u U ụ Ụ v V ṿ Ṿ w W y Y z Z ꞌ Ꞌ" marks: "◌̣ ◌́" -} \ No newline at end of file +} diff --git a/Lib/gflanguages/data/languages/beh_Latn.textproto b/Lib/gflanguages/data/languages/beh_Latn.textproto index 73661516a..62dfbda29 100644 --- a/Lib/gflanguages/data/languages/beh_Latn.textproto +++ b/Lib/gflanguages/data/languages/beh_Latn.textproto @@ -1,10 +1,10 @@ id: "beh_Latn" language: "beh" script: "Latn" -name: "Baka" +name: "Biali" population: 100000 region: "BJ" exemplar_chars { base: "a A b B c C d D e E ə Ə f F g G h H i I k K l L m M n N o O p P r R s S t T u U w W y Y" auxiliary: "j J q Q v V x X z Z" -} \ No newline at end of file +} diff --git a/Lib/gflanguages/data/languages/bkc_Latn.textproto b/Lib/gflanguages/data/languages/bkc_Latn.textproto index e761d8cf6..02b8bca30 100644 --- a/Lib/gflanguages/data/languages/bkc_Latn.textproto +++ b/Lib/gflanguages/data/languages/bkc_Latn.textproto @@ -1,11 +1,11 @@ id: "bkc_Latn" language: "bkc" script: "Latn" -name: "Baka" +name: "Baka (Cameroon/Gabon)" population: 71000 region: "CM" region: "GA" exemplar_chars { base: "a A b B ɓ Ɓ d D ɗ Ɗ e E ɛ Ɛ f F g G h H i I j J k K l L m M n N o O ɔ Ɔ s S t T u U w W y Y" auxiliary: "c C p P q Q r R v V x X z Z" -} \ No newline at end of file +} diff --git a/Lib/gflanguages/data/languages/bsc_Latn.textproto b/Lib/gflanguages/data/languages/bsc_Latn.textproto index edf186524..e1d9d9dc8 100644 --- a/Lib/gflanguages/data/languages/bsc_Latn.textproto +++ 
b/Lib/gflanguages/data/languages/bsc_Latn.textproto @@ -1,7 +1,7 @@ id: "bsc_Latn" language: "bsc" script: "Latn" -name: "Bassari" +name: "Bassari (Senegal)" autonym: "oniyan" population: 15264 region: "SN" diff --git a/Lib/gflanguages/data/languages/bsc_Latn_GN.textproto b/Lib/gflanguages/data/languages/bsc_Latn_GN.textproto index 97d08bd4f..1048b22f6 100644 --- a/Lib/gflanguages/data/languages/bsc_Latn_GN.textproto +++ b/Lib/gflanguages/data/languages/bsc_Latn_GN.textproto @@ -1,7 +1,7 @@ id: "bsc_Latn_GN" language: "bsc" script: "Latn" -name: "Bassari" +name: "Guinean Bassari" autonym: "oneyan" population: 18000 region: "GN" diff --git a/Lib/gflanguages/data/languages/bsq_Bass.textproto b/Lib/gflanguages/data/languages/bsq_Bass.textproto index 65c182d7c..8b9b529e9 100644 --- a/Lib/gflanguages/data/languages/bsq_Bass.textproto +++ b/Lib/gflanguages/data/languages/bsq_Bass.textproto @@ -1,7 +1,7 @@ id: "bsq_Bass" language: "bsq" script: "Bass" -name: "Bassa" +name: "Bassa, Vah" population: 410000 region: "LR" region: "SL" diff --git a/Lib/gflanguages/data/languages/bsq_Latn.textproto b/Lib/gflanguages/data/languages/bsq_Latn.textproto index ee3a76d80..7e6d576d0 100644 --- a/Lib/gflanguages/data/languages/bsq_Latn.textproto +++ b/Lib/gflanguages/data/languages/bsq_Latn.textproto @@ -1,7 +1,7 @@ id: "bsq_Latn" language: "bsq" script: "Latn" -name: "Bassa" +name: "Bassa, Latin" population: 410000 region: "LR" region: "SL" @@ -11,4 +11,4 @@ exemplar_chars { auxiliary: "l L q Q r R x X y Y z Z" } source: "“Bassa dictionary”, Christian Education Foundation in Liberia (CEFL), https://cefliberia.org/bassa/bassa-dictionary/" -source: "Ɓǎsɔ́ɔ̀ Báɓòɔ̀, Bible Society Liberia, 2002" \ No newline at end of file +source: "Ɓǎsɔ́ɔ̀ Báɓòɔ̀, Bible Society Liberia, 2002" diff --git a/Lib/gflanguages/data/languages/crh_Cyrl.textproto b/Lib/gflanguages/data/languages/crh_Cyrl.textproto index 3af416fba..ce6648b4c 100644 --- a/Lib/gflanguages/data/languages/crh_Cyrl.textproto +++ 
b/Lib/gflanguages/data/languages/crh_Cyrl.textproto @@ -1,7 +1,7 @@ id: "crh_Cyrl" language: "crh" script: "Cyrl" -name: "Crimean Turkish" +name: "Crimean Turkish, Cyrillic" autonym: "Къырымтатар" population: 245968 region: "UA" diff --git a/Lib/gflanguages/data/languages/dnj_Latn_LR.textproto b/Lib/gflanguages/data/languages/dnj_Latn_LR.textproto index d665f96e2..732900772 100644 --- a/Lib/gflanguages/data/languages/dnj_Latn_LR.textproto +++ b/Lib/gflanguages/data/languages/dnj_Latn_LR.textproto @@ -1,7 +1,7 @@ id: "dnj_Latn_LR" language: "dnj" script: "Latn" -name: "Dan" +name: "Liberian Dan" autonym: "Gio" population: 1099244 region: "LR" diff --git a/Lib/gflanguages/data/languages/evn_Latn.textproto b/Lib/gflanguages/data/languages/evn_Latn.textproto index 756f5dbc6..937e9a9bb 100644 --- a/Lib/gflanguages/data/languages/evn_Latn.textproto +++ b/Lib/gflanguages/data/languages/evn_Latn.textproto @@ -1,7 +1,7 @@ id: "evn_Latn" language: "evn" script: "Latn" -name: "Evenki" +name: "Evenki, Latin" population: 16000 region: "RU" region: "CN" diff --git a/Lib/gflanguages/data/languages/kr_Arab.textproto b/Lib/gflanguages/data/languages/kr_Arab.textproto index e051c89f4..4621a6326 100644 --- a/Lib/gflanguages/data/languages/kr_Arab.textproto +++ b/Lib/gflanguages/data/languages/kr_Arab.textproto @@ -1,4 +1,4 @@ id: "kr_Arab" language: "kr" script: "Arab" -name: "Kanuri" +name: "Kanuri, Arabic" diff --git a/Lib/gflanguages/data/languages/mlt_Latn.textproto b/Lib/gflanguages/data/languages/mlt_Latn.textproto deleted file mode 100644 index 21f8b7670..000000000 --- a/Lib/gflanguages/data/languages/mlt_Latn.textproto +++ /dev/null @@ -1,11 +0,0 @@ -id: "mlt_Latn" -language: "mlt" -script: "Latn" -name: "Maltese" -population: 530000 -region: "MT" -exemplar_chars { - base: "a A à À b B ċ Ċ d D e E è È f F ġ Ġ g G h H ħ Ħ i I ì Ì j J k K l L m M n N o O ò Ò p P q Q r R s S t T u U ù Ù v V w W x X ż Ż z Z" - marks: "◌̀ ◌̇" - auxiliary: "c C y Y" -} \ No newline at end of file 
diff --git a/Lib/gflanguages/data/languages/sa_Nand.textproto b/Lib/gflanguages/data/languages/sa_Nand.textproto index cd2c12cad..80b4c563c 100644 --- a/Lib/gflanguages/data/languages/sa_Nand.textproto +++ b/Lib/gflanguages/data/languages/sa_Nand.textproto @@ -2,7 +2,7 @@ id: "sa_Nand" language: "sa" script: "Nand" -name: "Sanskrit" +name: "Sanskrit, Nandinagari" autonym: "𑧍𑧞𑧍𑧠𑦮𑧖𑦽𑧆𑧠" region: "IN" exemplar_chars { diff --git a/Lib/gflanguages/data/languages/wal_Ethi.textproto b/Lib/gflanguages/data/languages/wal_Ethi.textproto index de9c0d257..b5c7fbe6d 100644 --- a/Lib/gflanguages/data/languages/wal_Ethi.textproto +++ b/Lib/gflanguages/data/languages/wal_Ethi.textproto @@ -1,6 +1,6 @@ id: "wal_Ethi" language: "wal" script: "Ethi" -name: "Wolaytta" +name: "Wolaytta, Ethiopic" population: 1946034 region: "ET" diff --git a/Lib/gflanguages/data/languages/wal_Latn.textproto b/Lib/gflanguages/data/languages/wal_Latn.textproto index 5804b2097..c6676a568 100644 --- a/Lib/gflanguages/data/languages/wal_Latn.textproto +++ b/Lib/gflanguages/data/languages/wal_Latn.textproto @@ -1,9 +1,9 @@ id: "wal_Latn" language: "wal" script: "Latn" -name: "Wolaytta" +name: "Wolaytta, Latin" population: 7000000 region: "ET" exemplar_chars { base: "a A b B c C d D e E f F g G h H i I j J k K l L m M n N o O p P q Q r R s S t T u U v V w W x X y Y z Z" -} \ No newline at end of file +} diff --git a/Lib/gflanguages/data/languages/xsm_Latn_BF.textproto b/Lib/gflanguages/data/languages/xsm_Latn_BF.textproto index ce9282dc1..983329112 100644 --- a/Lib/gflanguages/data/languages/xsm_Latn_BF.textproto +++ b/Lib/gflanguages/data/languages/xsm_Latn_BF.textproto @@ -1,7 +1,7 @@ id: "xsm_Latn_BF" language: "xsm" script: "Latn" -name: "Kasem" +name: "Burkinabè Kasem" population: 250000 region: "BF" exemplar_chars { From 6815883e41f96465f4f1985445743074a9d29226 Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 14 May 2024 17:04:00 +0100 Subject: [PATCH 07/11] Be more strict --- 
tests/test_parsable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_parsable.py b/tests/test_parsable.py index a0169774b..f209bc3be 100644 --- a/tests/test_parsable.py +++ b/tests/test_parsable.py @@ -16,7 +16,7 @@ textproto_files = [ def test_parsable(lang_code): with open(os.path.join(languages_dir, lang_code), "r", encoding="utf-8") as f: msg = text_format.Parse(f.read(), languages_public_pb2.LanguageProto()) - assert msg.id is not None - assert msg.language is not None - assert msg.script is not None + assert msg.id + assert msg.language + assert msg.script assert msg.population is not None From 36f366c0e02aa85b881a2d8b9f60d28753ca4c71 Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 14 May 2024 17:04:43 +0100 Subject: [PATCH 08/11] Add required language field --- Lib/gflanguages/data/languages/uma_Latn.textproto | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/gflanguages/data/languages/uma_Latn.textproto b/Lib/gflanguages/data/languages/uma_Latn.textproto index ba40ae58a..eea6d698f 100644 --- a/Lib/gflanguages/data/languages/uma_Latn.textproto +++ b/Lib/gflanguages/data/languages/uma_Latn.textproto @@ -1,4 +1,5 @@ id: "uma_Latn" +language: "uma" script: "Latn" name: "Umatilla" population: 25 From 69944f1197823449d18b123cf1ae4ac9ed42c3eb Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 14 May 2024 17:07:40 +0100 Subject: [PATCH 09/11] Fixes suggested by Denis --- Lib/gflanguages/data/languages/bdh_Latn.textproto | 2 +- Lib/gflanguages/data/languages/bsc_Latn.textproto | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/gflanguages/data/languages/bdh_Latn.textproto b/Lib/gflanguages/data/languages/bdh_Latn.textproto index 963736278..7e7ccf6dd 100644 --- a/Lib/gflanguages/data/languages/bdh_Latn.textproto +++ b/Lib/gflanguages/data/languages/bdh_Latn.textproto @@ -1,7 +1,7 @@ id: "bdh_Latn" language: "bdh" script: "Latn" -name: "Baka (South Sudan/Congo)" +name: "Baka (DRC/South Sudan)" 
autonym: "Tara Baká" population: 60000 region: "CD" diff --git a/Lib/gflanguages/data/languages/bsc_Latn.textproto b/Lib/gflanguages/data/languages/bsc_Latn.textproto index e1d9d9dc8..edf186524 100644 --- a/Lib/gflanguages/data/languages/bsc_Latn.textproto +++ b/Lib/gflanguages/data/languages/bsc_Latn.textproto @@ -1,7 +1,7 @@ id: "bsc_Latn" language: "bsc" script: "Latn" -name: "Bassari (Senegal)" +name: "Bassari" autonym: "oniyan" population: 15264 region: "SN" From fa4d9fdafb9f6299775df3e16f5b3b12d8a7d8fb Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Wed, 15 May 2024 14:53:11 +0100 Subject: [PATCH 10/11] Relax uniqueness criteria to "unique within a script" --- tests/test_data_languages.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py index 3a8ef6b6e..25a22dd3c 100644 --- a/tests/test_data_languages.py +++ b/tests/test_data_languages.py @@ -268,7 +268,15 @@ def test_exemplar_parser(): def test_language_uniqueness(): - names = Counter([lang.name for lang in LANGUAGES.values()]) + names = Counter([]) + for lang in LANGUAGES.values(): + # We check that names are unique *within a script* since + # when we display them in a menu we segment that menu by + # script and then by language + if lang.preferred_name: + names[lang.script + "/" + lang.preferred_name] += 1 + else: + names[lang.script + "/" + lang.name] += 1 if any(count > 1 for count in names.values()): duplicates = {name: count for name, count in names.items() if count > 1} pytest.fail(f"Duplicate language names: {duplicates}") From 642b56cf9790cc437a1696de8ce5e88ffd03176b Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Wed, 15 May 2024 14:53:32 +0100 Subject: [PATCH 11/11] Keep this as "Western Balochi" to distinguish it from the Balochi macrolanguage --- Lib/gflanguages/data/languages/bgn_Arab.textproto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/Lib/gflanguages/data/languages/bgn_Arab.textproto b/Lib/gflanguages/data/languages/bgn_Arab.textproto index 56d4cb3f2..d42af6e88 100644 --- a/Lib/gflanguages/data/languages/bgn_Arab.textproto +++ b/Lib/gflanguages/data/languages/bgn_Arab.textproto @@ -2,7 +2,7 @@ id: "bgn_Arab" language: "bgn" script: "Arab" name: "Western Balochi" -preferred_name: "Balochi" +#preferred_name: "Balochi" population: 2037382 region: "AF" region: "IR"