Pull in updates from lang subtree (#8132)

author Nathan Williams <williamsnathan@google.com>
Tue, 3 Sep 2024 16:08:27 +0000 (12:08 -0400)
committer GitHub <noreply@github.com>
Tue, 3 Sep 2024 16:08:27 +0000 (12:08 -0400)
diff --git a/lang/Lib/gflanguages/__init__.py b/lang/Lib/gflanguages/__init__.py

index c7b0bc61817a8b2466504419f6394cdb31257e55..0754d6c135547cd140115f6e63f85a8fc16dba6b 100644 (file)
--- a/lang/Lib/gflanguages/__init__.py
+++ b/lang/Lib/gflanguages/__init__.py
@@ -44,6 +44,7 @@ def LoadLanguages(base_dir=DATA_DIR):
      for textproto_file in glob.iglob(os.path.join(languages_dir, "*.textproto")):
          with open(textproto_file, "r", encoding="utf-8") as f:
              language = text_format.Parse(f.read(), languages_public_pb2.LanguageProto())
+            assert language.id not in langs, f"Duplicate language id: {language.id}"
              langs[language.id] = language
      return langs
  
@@ -57,6 +58,7 @@ def LoadScripts(base_dir=DATA_DIR):
      for textproto_file in glob.iglob(os.path.join(scripts_dir, "*.textproto")):
          with open(textproto_file, "r", encoding="utf-8") as f:
              script = text_format.Parse(f.read(), languages_public_pb2.ScriptProto())
+            assert script.id not in scripts, f"Duplicate script id: {script.id}"
              scripts[script.id] = script
      return scripts
  
@@ -70,6 +72,7 @@ def LoadRegions(base_dir=DATA_DIR):
      for textproto_file in glob.iglob(os.path.join(regions_dir, "*.textproto")):
          with open(textproto_file, "r", encoding="utf-8") as f:
              region = text_format.Parse(f.read(), languages_public_pb2.RegionProto())
+            assert region.id not in regions, f"Duplicate region id: {region.id}"
              regions[region.id] = region
      return regions
  
diff --git a/lang/Lib/gflanguages/data/languages/de_Latn.textproto b/lang/Lib/gflanguages/data/languages/de_Latn.textproto

index 2b4f1a012c885329a52dc730b3678be861128169..56023005f1e79e613f3ca197535d2bd054450e7e 100644 (file)
--- a/lang/Lib/gflanguages/data/languages/de_Latn.textproto
+++ b/lang/Lib/gflanguages/data/languages/de_Latn.textproto
@@ -38,6 +38,7 @@ exemplar_chars {
    numerals: "- , . % + 0 1 2 3 4 5 6 7 8 9"
    punctuation: "- – — , ; : ! ? . … \' ‘ ‚ \" “ „ « » ( ) [ ] { } @ * / & #"
    index: "A B C D E F G H I J K L M N O P Q R S ẞ T U V W X Y Z"
+  not_required: "ẞ"
  }
  sample_text {
    masthead_full: "AaLl"
diff --git a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto

index 110b0269355299b4db1dff51a368d759e3f005dc..27712e7a21503bcb76bc80693959c0e18bdab0ee 100644 (file)
--- a/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto
+++ b/lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto
@@ -1,7 +1,7 @@
-id: "mam_Latn"
+id: "mam_Latn_MX"
  language: "mam"
  script: "Latn"
-name: "Mam"
+name: "Mexican Mam"
  region: "MX"
  exemplar_chars {
    base: "a A {bꞌ} {BꞋ} {ch} {CH} {chꞌ} {CHꞋ} d D e E g G i I j J k K {kꞌ} {KꞋ} {ky} {KY} {kyꞌ} {KYꞋ} l L m M n N o O p P q Q {qꞌ} {QꞋ} r R s S t T {tꞌ} {TꞋ} {ts} {TS} {tsꞌ} {TSꞋ} {tx} {TX} {txꞌ} {TXꞋ} u U w W x X {xh} {XH} y Y ꞌ Ꞌ"
diff --git a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto

index ef229cf89f7dbbb8310bab273022a7e153f0c2f9..259e4e011deb3c26f5c50543152da30532ee9f85 100644 (file)
--- a/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto
+++ b/lang/Lib/gflanguages/data/languages/xsl_Latn.textproto
@@ -1,5 +1,5 @@
-id: "scs_Latn"
-language: "scs"
+id: "xsl_Latn"
+language: "xsl"
  script: "Latn"
  name: "South Slavey"
  population: 950
diff --git a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto

index d876805617d4ca3bd3ac5dd145fd0b70cd80ba4d..2136a20457d70fed392bb0f0d935e80a865f7e3d 100644 (file)
--- a/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto
+++ b/lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto
@@ -1,7 +1,7 @@
-id: "yo_Latn"
+id: "yo_Latn_BJ"
  language: "yo"
  script: "Latn"
-name: "Yoruba"
+name: "Yoruba, Benin"
  autonym: "Èdè Yorùbá"
  population: 200000
  region: "BJ"
@@ -27,4 +27,4 @@ sample_text {
    specimen_16: "Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ kpé kí a má shàdédé she àyɔjúràn sí ɔ̀rɔ̀ ìgbésí ayé rɛ̀, tàbí sí ɔ̀rɔ̀ɛbí rɛ̀ tàbí sí ɔ̀rɔ̀ ìdílé rɛ̀ tàbí ìwé tí a kɔ sí i; a kò sì gbɔdɔ̀ ba iyì àti orúkɔ rɛ̀ jɛ́. Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ààbò lábɛ́ òfin kúrò lɔ́wɔ́ irú àyɔjúràn tàbí ìbanijɛ́ bɛ́ɛ̀.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí òmìnira èrò, òmìnira ɛ̀rí-ɔkàn àti òmìnira ɛ sìn. Ɛtɔ́ yìí sì gbani láàyè láti kpààrɔ̀ ɛ sìn tàbí ìgbàgbɔ́ ɛni. Ó sì fún ɛyɔ ɛnì kan tàbí àkójɔkpɔ̀ ènìyàn láàyè láti she ɛ̀sìn wɔn àti ìgbàgbɔ́ wɔn bó she jɛ mɔ́ ti ìkɔ́ni, ìshesí, ìjɔ́sìn àti ìmúshe ohun tí wɔ́n gbàgbɔ́ yálà ní ìkɔ̀kɔ̀ tàbí ní gban̄gba.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ìsinmi àti fàájì kpɛ̀lú àkókò tí kò kpɔ̀ jù lɛ́nu ishɛ́ àti àsìkò ìsinmi lɛ́nu ishɛ́ láti ìgbà dé ìgbà tí a ó sanwó fún."
  }
  source: "Centre national de linguistique appliquée (CENALA), Alphabet des langues nationales béninoises, Cotonou: CENALA avec le concours de l’Initiative francophone pour la formation à distance des maîtres (IFADEM), 2008, 6th ed."
-note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p ṣ."
-\ No newline at end of file
+note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p ṣ."
diff --git a/lang/Lib/gflanguages/languages_public.proto b/lang/Lib/gflanguages/languages_public.proto

index 0e8f6505c2398bb8387b6f5f77b74ab11cfd137c..db00ef077dada0942564fe2fc0886f4c2b713229 100644 (file)
--- a/lang/Lib/gflanguages/languages_public.proto
+++ b/lang/Lib/gflanguages/languages_public.proto
@@ -48,8 +48,9 @@ message ExemplarCharsProto {
    optional string numerals = 4;
    optional string punctuation = 5;
    optional string index = 6;
+  optional string not_required = 7;  // Base characters which can be ignored when determining language support
  
-  // Next = 7;
+  // Next = 8;
  }
  
  message SampleTextProto {
diff --git a/lang/Lib/gflanguages/languages_public_pb2.py b/lang/Lib/gflanguages/languages_public_pb2.py

index 2b544d4d9a1fd3736e0ce60f1f3149943dd3c1e8..4729af346552f66dfd6c0b1c70490d4ec9ab59db 100644 (file)
--- a/lang/Lib/gflanguages/languages_public_pb2.py
+++ b/lang/Lib/gflanguages/languages_public_pb2.py
@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
    syntax='proto2',
    serialized_options=None,
    create_key=_descriptor._internal_create_key,
-  serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"z\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
+  serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"\x90\x01\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\x12\x14\n\x0cnot_required\x18\x07 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
  )
  
  
@@ -283,6 +283,13 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor(
        message_type=None, enum_type=None, containing_type=None,
        is_extension=False, extension_scope=None,
        serialized_options=None, file=DESCRIPTOR,  create_key=_descriptor._internal_create_key),
+    _descriptor.FieldDescriptor(
+      name='not_required', full_name='google.languages_public.ExemplarCharsProto.not_required', index=6,
+      number=7, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=b"".decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR,  create_key=_descriptor._internal_create_key),
    ],
    extensions=[
    ],
@@ -295,8 +302,8 @@ _EXEMPLARCHARSPROTO = _descriptor.Descriptor(
    extension_ranges=[],
    oneofs=[
    ],
-  serialized_start=512,
-  serialized_end=634,
+  serialized_start=513,
+  serialized_end=657,
  )
  
  
@@ -411,8 +418,8 @@ _SAMPLETEXTPROTO = _descriptor.Descriptor(
    extension_ranges=[],
    oneofs=[
    ],
-  serialized_start=637,
-  serialized_end=911,
+  serialized_start=660,
+  serialized_end=934,
  )
  
  _LANGUAGEPROTO.fields_by_name['exemplar_chars'].message_type = _EXEMPLARCHARSPROTO
author	Nathan Williams <williamsnathan@google.com>
	Tue, 3 Sep 2024 16:08:27 +0000 (12:08 -0400)
committer	GitHub <noreply@github.com>
	Tue, 3 Sep 2024 16:08:27 +0000 (12:08 -0400)
lang/Lib/gflanguages/__init__.py		patch \| blob \| blame \| history
lang/Lib/gflanguages/data/languages/de_Latn.textproto		patch \| blob \| blame \| history
lang/Lib/gflanguages/data/languages/mam_Latn_MX.textproto		patch \| blob \| blame \| history
lang/Lib/gflanguages/data/languages/xsl_Latn.textproto		patch \| blob \| blame \| history
lang/Lib/gflanguages/data/languages/yo_Latn_BJ.textproto		patch \| blob \| blame \| history
lang/Lib/gflanguages/languages_public.proto		patch \| blob \| blame \| history
lang/Lib/gflanguages/languages_public_pb2.py		patch \| blob \| blame \| history