From: Nathan Williams <williamsnathan@google.com>
Date: Tue, 3 Sep 2024 16:08:27 +0000 (-0400)
Subject: Pull in updates from lang subtree (#8132)
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4528dc803bd11a5602c320edf270d7975cd12dd0;p=thirdparty%2Fgoogle%2Ffonts.git

Pull in updates from lang subtree (#8132)

* Update de_Latn.textproto

Move `ẞ` to the auxiliary character set. The lowercase `ß` is still part of the base character set.

Based on [Wikipedia](https://en.wikipedia.org/wiki/%C3%9F#:~:text=Additionally%2C%20as%20of%202017%2C%20when,with%20%E2%9F%A8SS%E2%9F%A9%20in%20allcaps.), `ẞ` seems to be more of an auxiliary than base character. It is relatively new and being written as `SS` is considered valid.

* Revert "Update de_Latn.textproto"

* Enforce uniqueness of language names across all scripts

* Add test for enforcing language name structure

* Fix duplicate or poorly structured Chinese language names

* Update language names for Ancient Greek, there were duplicates

* Enforce language name structure for both name and preferred name

* Enforce language name structure for both name and preferred name

* Update language name structure test to output error once

* Fix typo: endsWith => endswith

* Restructure language names to be consistent

* Fix language name regex to allow spaces and a couple other edge cases

* Remove duplicate entries for Malay

* Fix language name regex

* Fix dash character in character class

* Update language name character class to allow accent marks

* Re-structure several language names

* Expand character class for language names

* Correct a batch of language names

* Update character class for language names to use Unicode class

* Fix unicode character class in language name regex

* Use regex instead of re for access to unicode character classes

* Add regex package to dev reqs

* Add regex to deps in TOML config

* Fix name structure for two languages

* Assert uniqueness of all IDs

* Fix up languages with non-unique IDs

* Rename to Language, Place

* Add "removable" category of characters not required for language support

* Add LATIN CAPITAL LETTER SHARP S to removable

* Recompile protobuf (protoc 3.17.3)

* Rename removable to not_required

---------

Co-authored-by: Simon Cozens <simon@simon-cozens.org>
---

diff --git a/lang/Lib/gflanguages/__init__.py b/lang/Lib/gflanguages/__init__.py
index c7b0bc6181..0754d6c135 100644
--- a/lang/Lib/gflanguages/__init__.py
+++ b/lang/Lib/gflanguages/__init__.py
@@ -44,6 +44,7 @@ def LoadLanguages(base_dir=DATA_DIR):
     for textproto_file in glob.iglob(os.path.join(languages_dir, "*.textproto")):
         with open(textproto_file, "r", encoding="utf-8") as f:
             language = text_format.Parse(f.read(), languages_public_pb2.LanguageProto())
+            assert language.id not in langs, f"Duplicate language id: {language.id}"
             langs[language.id] = language
     return langs
 
@@ -57,6 +58,7 @@ def LoadScripts(base_dir=DATA_DIR):
     for textproto_file in glob.iglob(os.path.join(scripts_dir, "*.textproto")):
         with open(textproto_file, "r", encoding="utf-8") as f:
             script = text_format.Parse(f.read(), languages_public_pb2.ScriptProto())
+            assert script.id not in scripts, f"Duplicate script id: {script.id}"
             scripts[script.id] = script
     return scripts
 
@@ -70,6 +72,7 @@ def LoadRegions(base_dir=DATA_DIR):
     for textproto_file in glob.iglob(os.path.join(regions_dir, "*.textproto")):
         with open(textproto_file, "r", encoding="utf-8") as f:
             region = text_format.Parse(f.read(), languages_public_pb2.RegionProto())
+            assert region.id not in regions, f"Duplicate region id: {region.id}"
             regions[region.id] = region
     return regions
 
diff --git a/lang/Lib/gflanguages/data/languages/de_Latn.textproto b/lang/Lib/gflanguages/data/languages/de_Latn.textproto
index 2b4f1a012c..56023005f1 100644
--- a/lang/Lib/gflanguages/data/languages/de_Latn.textproto
+++ b/lang/Lib/gflanguages/data/languages/de_Latn.textproto
@@ -38,6 +38,7 @@ exemplar_chars {
   numerals: "- , . % + 0 1 2 3 4 5 6 7 8 9"
   punctuation: "- â â , ; : ! ? . â¦ \' â â \" â â Â« Â» ( ) [ ] { } @ * / & #"
   index: "A B C D E F G H I J K L M N O P Q R S áº T U V W X Y Z"
+  not_required: "áº"
 }
 sample_text {
   masthead_full: "AaLl"
diff --git a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto
index 110b026935..27712e7a21 100644
--- a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto
+++ b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto
@@ -1,7 +1,7 @@
-id: "mam_Latn"
+id: "mam_Latn_MX"
 language: "mam"
 script: "Latn"
-name: "Mam"
+name: "Mexican Mam"
 region: "MX"
 exemplar_chars {
   base: "a A {bê} {Bê} {ch} {CH} {chê} {CHê} d D e E g G i I j J k K {kê} {Kê} {ky} {KY} {kyê} {KYê} l L m M n N o O p P q Q {qê} {Qê} r R s S t T {tê} {Tê} {ts} {TS} {tsê} {TSê} {tx} {TX} {txê} {TXê} u U w W x X {xh} {XH} y Y ê ê"
diff --git a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto
index ef229cf89f..259e4e011d 100644
--- a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto
+++ b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto
@@ -1,5 +1,5 @@
-id: "scs_Latn"
-language: "scs"
+id: "xsl_Latn"
+language: "xsl"
 script: "Latn"
 name: "South Slavey"
 population: 950
diff --git a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto
index d876805617..2136a20457 100644
--- a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto
+++ b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto
@@ -1,7 +1,7 @@
-id: "yo_Latn"
+id: "yo_Latn_BJ"
 language: "yo"
 script: "Latn"
-name: "Yoruba"
+name: "Yoruba, Benin"
 autonym: "ÃdÃ¨ YorÃ¹bÃ¡"
 population: 200000
 region: "BJ"
@@ -27,4 +27,4 @@ sample_text {
   specimen_16: "ÆnÃ¬ kÉÌÉÌkan lÃ³ nÃ­ ÉÌtÉÌ kpÃ© kÃ­ a mÃ¡ shÃ dÃ©dÃ© she Ã yÉjÃºrÃ n sÃ­ ÉÌrÉÌ Ã¬gbÃ©sÃ­ ayÃ© rÉÌ, tÃ bÃ­ sÃ­ ÉÌrÉÌÉbÃ­ rÉÌ tÃ bÃ­ sÃ­ ÉÌrÉÌ Ã¬dÃ­lÃ© rÉÌ tÃ bÃ­ Ã¬wÃ© tÃ­ a kÉ sÃ­ i; a kÃ² sÃ¬ gbÉdÉÌ ba iyÃ¬ Ã ti orÃºkÉ rÉÌ jÉÌ. ÆnÃ¬ kÉÌÉÌkan lÃ³ nÃ­ ÉÌtÉÌ sÃ­ Ã Ã bÃ² lÃ¡bÉÌ Ã²fin kÃºrÃ² lÉÌwÉÌ irÃº Ã yÉjÃºrÃ n tÃ bÃ­ Ã¬banijÉÌ bÉÌÉÌ.\nÆnÃ¬ kÉÌÉÌkan lÃ³ nÃ­ ÉÌtÉÌ sÃ­ Ã²mÃ¬nira Ã¨rÃ², Ã²mÃ¬nira ÉÌrÃ­-ÉkÃ n Ã ti Ã²mÃ¬nira É sÃ¬n. ÆtÉÌ yÃ¬Ã­ sÃ¬ gbani lÃ¡Ã yÃ¨ lÃ¡ti kpÃ Ã rÉÌ É sÃ¬n tÃ bÃ­ Ã¬gbÃ gbÉÌ Éni. Ã sÃ¬ fÃºn ÉyÉ ÉnÃ¬ kan tÃ bÃ­ Ã kÃ³jÉkpÉÌ Ã¨nÃ¬yÃ n lÃ¡Ã yÃ¨ lÃ¡ti she ÉÌsÃ¬n wÉn Ã ti Ã¬gbÃ gbÉÌ wÉn bÃ³ she jÉ mÉÌ ti Ã¬kÉÌni, Ã¬shesÃ­, Ã¬jÉÌsÃ¬n Ã ti Ã¬mÃºshe ohun tÃ­ wÉÌn gbÃ gbÉÌ yÃ¡lÃ  nÃ­ Ã¬kÉÌkÉÌ tÃ bÃ­ nÃ­ gbanÌgba.\nÆnÃ¬ kÉÌÉÌkan lÃ³ nÃ­ ÉÌtÉÌ sÃ­ Ã¬sinmi Ã ti fÃ Ã¡jÃ¬ kpÉÌlÃº Ã kÃ³kÃ² tÃ­ kÃ² kpÉÌ jÃ¹ lÉÌnu ishÉÌ Ã ti Ã sÃ¬kÃ² Ã¬sinmi lÉÌnu ishÉÌ lÃ¡ti Ã¬gbÃ  dÃ© Ã¬gbÃ  tÃ­ a Ã³ sanwÃ³ fÃºn."
 }
 source: "Centre national de linguistique appliquÃ©e (CENALA), Alphabet des langues nationales bÃ©ninoises, Cotonou: CENALA avec le concours de lâInitiative francophone pour la formation Ã  distance des maÃ®tres (IFADEM), 2008, 6th ed."
-note: "Yoruba (Benin) uses É É kp sh instead of Yoruba (Nigeria) áº¹ á» p á¹£."
\ No newline at end of file
+note: "Yoruba (Benin) uses É É kp sh instead of Yoruba (Nigeria) áº¹ á» p á¹£."
diff --git a/lang/Lib/gflanguages/languages_public.proto b/lang/Lib/gflanguages/languages_public.proto
index 0e8f6505c2..db00ef077d 100644
--- a/lang/Lib/gflanguages/languages_public.proto
+++ b/lang/Lib/gflanguages/languages_public.proto
@@ -48,8 +48,9 @@ message ExemplarCharsProto {
   optional string numerals = 4;
   optional string punctuation = 5;
   optional string index = 6;
+  optional string not_required = 7;  // Base characters which can be ignored when determining language support
 
-  // Next = 7;
+  // Next = 8;
 }
 
 message SampleTextProto {
diff --git a/lang/Lib/gflanguages/languages_public_pb2.py b/lang/Lib/gflanguages/languages_public_pb2.py
index 2b544d4d9a..4729af3465 100644
--- a/lang/Lib/gflanguages/languages_public_pb2.py
+++ b/lang/Lib/gflanguages/languages_public_pb2.py
@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
   syntax='proto2',
   serialized_options=None,
   create_key=_descriptor._internal_create_key,
-  serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"z\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
+  serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"\x90\x01\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\x12\x14\n\x0cnot_required\x18\x07 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
 )
 
 
@@ -283,6 +283,13 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor(
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR,  create_key=_descriptor._internal_create_key),
+    _descriptor.FieldDescriptor(
+      name='not_required', full_name='google.languages_public.ExemplarCharsProto.not_required', index=6,
+      number=7, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=b"".decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR,  create_key=_descriptor._internal_create_key),
   ],
   extensions=[
   ],
@@ -295,8 +302,8 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=512,
-  serialized_end=634,
+  serialized_start=513,
+  serialized_end=657,
 )
 
 
@@ -411,8 +418,8 @@ _SAMPLETEXTPROTO = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=637,
-  serialized_end=911,
+  serialized_start=660,
+  serialized_end=934,
 )
 
 _LANGUAGEPROTO.fields_by_name['exemplar_chars'].message_type = _EXEMPLARCHARSPROTO