for textproto_file in glob.iglob(os.path.join(languages_dir, "*.textproto")):
with open(textproto_file, "r", encoding="utf-8") as f:
language = text_format.Parse(f.read(), languages_public_pb2.LanguageProto())
+ assert language.id not in langs, f"Duplicate language id: {language.id}"
langs[language.id] = language
return langs
for textproto_file in glob.iglob(os.path.join(scripts_dir, "*.textproto")):
with open(textproto_file, "r", encoding="utf-8") as f:
script = text_format.Parse(f.read(), languages_public_pb2.ScriptProto())
+ assert script.id not in scripts, f"Duplicate script id: {script.id}"
scripts[script.id] = script
return scripts
for textproto_file in glob.iglob(os.path.join(regions_dir, "*.textproto")):
with open(textproto_file, "r", encoding="utf-8") as f:
region = text_format.Parse(f.read(), languages_public_pb2.RegionProto())
+ assert region.id not in regions, f"Duplicate region id: {region.id}"
regions[region.id] = region
return regions
numerals: "- , . % + 0 1 2 3 4 5 6 7 8 9"
punctuation: "- – — , ; : ! ? . … \' ‘ ‚ \" “ „ « » ( ) [ ] { } @ * / & #"
index: "A B C D E F G H I J K L M N O P Q R S ẞ T U V W X Y Z"
+ not_required: "ẞ"
}
sample_text {
masthead_full: "AaLl"
-id: "mam_Latn"
+id: "mam_Latn_MX"
language: "mam"
script: "Latn"
-name: "Mam"
+name: "Mexican Mam"
region: "MX"
exemplar_chars {
base: "a A {bꞌ} {BꞋ} {ch} {CH} {chꞌ} {CHꞋ} d D e E g G i I j J k K {kꞌ} {KꞋ} {ky} {KY} {kyꞌ} {KYꞋ} l L m M n N o O p P q Q {qꞌ} {QꞋ} r R s S t T {tꞌ} {TꞋ} {ts} {TS} {tsꞌ} {TSꞋ} {tx} {TX} {txꞌ} {TXꞋ} u U w W x X {xh} {XH} y Y ꞌ Ꞌ"
-id: "scs_Latn"
-language: "scs"
+id: "xsl_Latn"
+language: "xsl"
script: "Latn"
name: "South Slavey"
population: 950
-id: "yo_Latn"
+id: "yo_Latn_BJ"
language: "yo"
script: "Latn"
-name: "Yoruba"
+name: "Yoruba, Benin"
autonym: "Èdè Yorùbá"
population: 200000
region: "BJ"
specimen_16: "Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ kpé kí a má shàdédé she àyɔjúràn sí ɔ̀rɔ̀ ìgbésí ayé rɛ̀, tàbí sí ɔ̀rɔ̀ɛbí rɛ̀ tàbí sí ɔ̀rɔ̀ ìdílé rɛ̀ tàbí ìwé tí a kɔ sí i; a kò sì gbɔdɔ̀ ba iyì àti orúkɔ rɛ̀ jɛ́. Ɛnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ààbò lábɛ́ òfin kúrò lɔ́wɔ́ irú àyɔjúràn tàbí ìbanijɛ́ bɛ́ɛ̀.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí òmìnira èrò, òmìnira ɛ̀rí-ɔkàn àti òmìnira ɛ sìn. Ɛtɔ́ yìí sì gbani láàyè láti kpààrɔ̀ ɛ sìn tàbí ìgbàgbɔ́ ɛni. Ó sì fún ɛyɔ ɛnì kan tàbí àkójɔkpɔ̀ ènìyàn láàyè láti she ɛ̀sìn wɔn àti ìgbàgbɔ́ wɔn bó she jɛ mɔ́ ti ìkɔ́ni, ìshesí, ìjɔ́sìn àti ìmúshe ohun tí wɔ́n gbàgbɔ́ yálà ní ìkɔ̀kɔ̀ tàbí ní gban̄gba.\nƐnì kɔ̀ɔ̀kan ló ní ɛ̀tɔ́ sí ìsinmi àti fàájì kpɛ̀lú àkókò tí kò kpɔ̀ jù lɛ́nu ishɛ́ àti àsìkò ìsinmi lɛ́nu ishɛ́ láti ìgbà dé ìgbà tí a ó sanwó fún."
}
source: "Centre national de linguistique appliquée (CENALA), Alphabet des langues nationales béninoises, Cotonou: CENALA avec le concours de l’Initiative francophone pour la formation à distance des maîtres (IFADEM), 2008, 6th ed."
-note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p ṣ."
\ No newline at end of file
+note: "Yoruba (Benin) uses ɛ ɔ kp sh instead of Yoruba (Nigeria) ẹ ọ p ṣ."
optional string numerals = 4;
optional string punctuation = 5;
optional string index = 6;
+ optional string not_required = 7; // Base characters which can be ignored when determining language support
- // Next = 7;
+ // Next = 8;
}
message SampleTextProto {
syntax='proto2',
serialized_options=None,
create_key=_descriptor._internal_create_key,
- serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"z\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
+ serialized_pb=b'\n\x16languages_public.proto\x12\x17google.languages_public\"Q\n\x0bRegionProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x12\n\npopulation\x18\x03 \x01(\x05\x12\x14\n\x0cregion_group\x18\x04 \x03(\t\"\'\n\x0bScriptProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"\xce\x02\n\rLanguageProto\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12\x0e\n\x06script\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x16\n\x0epreferred_name\x18\x05 \x01(\t\x12\x0f\n\x07\x61utonym\x18\x06 \x01(\t\x12\x12\n\npopulation\x18\x07 \x01(\x05\x12\x0e\n\x06region\x18\x08 \x03(\t\x12\x43\n\x0e\x65xemplar_chars\x18\t \x01(\x0b\x32+.google.languages_public.ExemplarCharsProto\x12=\n\x0bsample_text\x18\n \x01(\x0b\x32(.google.languages_public.SampleTextProto\x12\x12\n\nhistorical\x18\x0b \x01(\x08\x12\x0e\n\x06source\x18\x0c \x03(\t\x12\x0c\n\x04note\x18\r \x01(\t\"\x90\x01\n\x12\x45xemplarCharsProto\x12\x0c\n\x04\x62\x61se\x18\x01 \x01(\t\x12\x11\n\tauxiliary\x18\x02 \x01(\t\x12\r\n\x05marks\x18\x03 \x01(\t\x12\x10\n\x08numerals\x18\x04 \x01(\t\x12\x13\n\x0bpunctuation\x18\x05 \x01(\t\x12\r\n\x05index\x18\x06 \x01(\t\x12\x14\n\x0cnot_required\x18\x07 \x01(\t\"\x92\x02\n\x0fSampleTextProto\x12\x15\n\rmasthead_full\x18\x01 \x01(\t\x12\x18\n\x10masthead_partial\x18\x02 \x01(\t\x12\x0e\n\x06styles\x18\x03 \x01(\t\x12\x0e\n\x06tester\x18\x04 \x01(\t\x12\x11\n\tposter_sm\x18\x05 \x01(\t\x12\x11\n\tposter_md\x18\x06 \x01(\t\x12\x11\n\tposter_lg\x18\x07 \x01(\t\x12\x13\n\x0bspecimen_48\x18\x08 \x01(\t\x12\x13\n\x0bspecimen_36\x18\t \x01(\t\x12\x13\n\x0bspecimen_32\x18\n \x01(\t\x12\x13\n\x0bspecimen_21\x18\x0b \x01(\t\x12\x13\n\x0bspecimen_16\x18\x0c \x01(\t\x12\x0c\n\x04note\x18\r \x01(\t'
)
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
+ _descriptor.FieldDescriptor(
+ name='not_required', full_name='google.languages_public.ExemplarCharsProto.not_required', index=6,
+ number=7, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
extension_ranges=[],
oneofs=[
],
- serialized_start=512,
- serialized_end=634,
+ serialized_start=513,
+ serialized_end=657,
)
extension_ranges=[],
oneofs=[
],
- serialized_start=637,
- serialized_end=911,
+ serialized_start=660,
+ serialized_end=934,
)
_LANGUAGEPROTO.fields_by_name['exemplar_chars'].message_type = _EXEMPLARCHARSPROTO