From: Nathan Williams Date: Tue, 3 Sep 2024 16:08:27 +0000 (-0400) Subject: Pull in updates from lang subtree (#8132) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4528dc803bd11a5602c320edf270d7975cd12dd0;p=thirdparty%2Fgoogle%2Ffonts.git Pull in updates from lang subtree (#8132) * Update de_Latn.textproto Move `ẞ` to the auxiliary character set. The lowercase `ß` is still part of the base character set. Based on [Wikipedia](https://en.wikipedia.org/wiki/%C3%9F#:~:text=Additionally%2C%20as%20of%202017%2C%20when,with%20%E2%9F%A8SS%E2%9F%A9%20in%20allcaps.), `ẞ` seems to be more of an auxiliary than base character. It is relatively new and being written as `SS` is considered valid. * Revert "Update de_Latn.textproto" * Enforce uniqueness of language names across all scripts * Add test for enforcing language name structure * Fix duplicate or near malstructured Chinese language names * Update language names for Ancient Greek, there were duplicates * Enforce language name structure for both name and preferred name * Enforce language name structure for both name and preferred name * Update language name structure test to output error once * Fix typo: endsWith => endswith * Restructure language names to be consistent * Fix language name regex to allow spaces and a couple other edge cases * Remove duplicate entries for Malay * Fix language name regex * Fix dash character in character class * Update language name character class to allow accent marks * Re-structure several language names * Expand character class for language names * Correct a batch of language names * Update character class for language names to use Unicode class * Fix unicode character class in language name regex * Use regex instead of re for access to unicode character classes * Add regex package to dev reqs * Add regex to deps in TOML config * Fix name structure for two languages * Assert uniqueness of all IDs * Fix up languages with non-unique IDs * Rename to Language, Place 
* Add "removable" category of characters not required for language support * Add LATIN CAPITAL LETTER SHARP S to removable * Recompile protobuf (protoc 3.17.3) * Rename removable to not_required --------- Co-authored-by: Simon Cozens --- diff --git a/lang/Lib/gflanguages/__init__.py b/lang/Lib/gflanguages/__init__.py index c7b0bc6181..0754d6c135 100644 --- a/lang/Lib/gflanguages/__init__.py +++ b/lang/Lib/gflanguages/__init__.py @@ -44,6 +44,7 @@ def LoadLanguages(base_dir=DATA_DIR): for textproto_file in glob.iglob(os.path.join(languages_dir, "*.textproto")): with open(textproto_file, "r", encoding="utf-8") as f: language = text_format.Parse(f.read(), languages_public_pb2.LanguageProto()) + assert language.id not in langs, f"Duplicate language id: {language.id}" langs[language.id] = language return langs @@ -57,6 +58,7 @@ def LoadScripts(base_dir=DATA_DIR): for textproto_file in glob.iglob(os.path.join(scripts_dir, "*.textproto")): with open(textproto_file, "r", encoding="utf-8") as f: script = text_format.Parse(f.read(), languages_public_pb2.ScriptProto()) + assert script.id not in scripts, f"Duplicate script id: {script.id}" scripts[script.id] = script return scripts @@ -70,6 +72,7 @@ def LoadRegions(base_dir=DATA_DIR): for textproto_file in glob.iglob(os.path.join(regions_dir, "*.textproto")): with open(textproto_file, "r", encoding="utf-8") as f: region = text_format.Parse(f.read(), languages_public_pb2.RegionProto()) + assert region.id not in regions, f"Duplicate region id: {region.id}" regions[region.id] = region return regions diff --git a/lang/Lib/gflanguages/data/languages/de_Latn.textproto b/lang/Lib/gflanguages/data/languages/de_Latn.textproto index 2b4f1a012c..56023005f1 100644 --- a/lang/Lib/gflanguages/data/languages/de_Latn.textproto +++ b/lang/Lib/gflanguages/data/languages/de_Latn.textproto @@ -38,6 +38,7 @@ exemplar_chars { numerals: "- , . % + 0 1 2 3 4 5 6 7 8 9" punctuation: "- – — , ; : ! ? . 
… \' ‘ ‚ \" “ „ « » ( ) [ ] { } @ * / & #" index: "A B C D E F G H I J K L M N O P Q R S ẞ T U V W X Y Z" + not_required: "ẞ" } sample_text { masthead_full: "AaLl" diff --git a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto index 110b026935..27712e7a21 100644 --- a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto +++ b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto @@ -1,7 +1,7 @@ -id: "mam_Latn" +id: "mam_Latn_MX" language: "mam" script: "Latn" -name: "Mam" +name: "Mexican Mam" region: "MX" exemplar_chars { base: "a A {bꞌ} {BꞋ} {ch} {CH} {chꞌ} {CHꞋ} d D e E g G i I j J k K {kꞌ} {KꞋ} {ky} {KY} {kyꞌ} {KYꞋ} l L m M n N o O p P q Q {qꞌ} {QꞋ} r R s S t T {tꞌ} {TꞋ} {ts} {TS} {tsꞌ} {TSꞋ} {tx} {TX} {txꞌ} {TXꞋ} u U w W x X {xh} {XH} y Y ꞌ Ꞌ" diff --git a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto index ef229cf89f..259e4e011d 100644 --- a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto +++ b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto @@ -1,5 +1,5 @@ -id: "scs_Latn" -language: "scs" +id: "xsl_Latn" +language: "xsl" script: "Latn" name: "South Slavey" population: 950 diff --git a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto index d876805617..2136a20457 100644 --- a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto +++ b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto @@ -1,7 +1,7 @@ -id: "yo_Latn" +id: "yo_Latn_BJ" language: "yo" script: "Latn" -name: "Yoruba" +name: "Yoruba, Benin" autonym: "Èdè Yorùbá" population: 200000 region: "BJ" @@ -27,4 +27,4 @@ sample_text { specimen_16: "Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ kpé kí a má shàdédé she àyɔjúràn sí ɔ̀rɔ̀ ìgbésí ayé rɛ̀, tàbí sí ɔ̀rɔ̀ɛbí rɛ̀ tàbí sí ɔ̀rɔ̀ ìdílé rɛ̀ tàbí ìwé tí a kɔ sí i; a kò sì gbɔdɔ̀ ba iyì àti orúkɔ rɛ̀ jɛ́. 
Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ààbò lábɛ́ òfin kúrò lɔ́wɔ́ irú àyɔjúràn tàbí ìbanijɛ́ bɛ́ɛ̀.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí òmìnira èrò, òmìnira ɛ̀rí-ɔkàn àti òmìnira ɛ sìn. Ɛtɔ́ yìí sì gbani láàyè láti kpààrɔ̀ ɛ sìn tàbí ìgbàgbɔ́ ɛni. Ó sì fún ɛyɔ ɛnì kan tàbí àkójɔkpɔ̀ ènìyàn láàyè láti she ɛ̀sìn wɔn àti ìgbàgbɔ́ wɔn bó she jɛ mɔ́ ti ìkɔ́ni, ìshesí, ìjɔ́sìn àti ìmúshe ohun tí wɔ́n gbàgbɔ́ yálà ní ìkɔ̀kɔ̀ tàbí ní gban̄gba.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ìsinmi àti fàájì kpɛ̀lú àkókò tí kò kpɔ̀ jù lɛ́nu ishɛ́ àti àsìkò ìsinmi lɛ́nu ishɛ́ láti ìgbà dé ìgbà tí a ó sanwó fún." } source: "Centre national de linguistique appliquée (CENALA), Alphabet des langues nationales béninoises, Cotonou: CENALA avec le concours de l’Initiative francophone pour la formation à distance des maîtres (IFADEM), 2008, 6th ed." -note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p á¹£." \ No newline at end of file +note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p ṣ." diff --git a/lang/Lib/gflanguages/languages_public.proto b/lang/Lib/gflanguages/languages_public.proto index 0e8f6505c2..db00ef077d 100644 --- a/lang/Lib/gflanguages/languages_public.proto +++ b/lang/Lib/gflanguages/languages_public.proto @@ -48,8 +48,9 @@ message ExemplarCharsProto { optional string numerals = 4; optional string punctuation = 5; optional string index = 6; + optional string not_required = 7; // Base characters which can be ignored when determining language support - // Next = 7; + // Next = 8; } message SampleTextProto { diff --git a/lang/Lib/gflanguages/languages_public_pb2.py b/lang/Lib/gflanguages/languages_public_pb2.py index 2b544d4d9a..4729af3465 100644 --- a/lang/Lib/gflanguages/languages_public_pb2.py +++ b/lang/Lib/gflanguages/languages_public_pb2.py @@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( syntax='proto2', serialized_options=None, create_key=_descriptor._internal_create_key, - 
serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"z\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t' + serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 
\x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"\x90\x01\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\x12\x14\n\x0cnot_required\x18\x07 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t' ) @@ -283,6 +283,13 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='not_required', full_name='google.languages_public.ExemplarCharsProto.not_required', index=6, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, 
default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -295,8 +302,8 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=512, - serialized_end=634, + serialized_start=513, + serialized_end=657, ) @@ -411,8 +418,8 @@ _SAMPLETEXTPROTO = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=637, - serialized_end=911, + serialized_start=660, + serialized_end=934, ) _LANGUAGEPROTO.fields_by_name['exemplar_chars'].message_type = _EXEMPLARCHARSPROTO