From: Simon Cozens
Date: Fri, 11 Nov 2022 11:45:12 +0000 (+0000)
Subject: Test to detect out-of-script exemplars
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=54b65167150d9e11994a7ace74e488efae909919;p=thirdparty%2Fgoogle%2Ffonts.git

Test to detect out-of-script exemplars
---

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 9acb19bea6..73b0496d54 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1 +1,2 @@
 uharfbuzz
+youseedee
diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py
index d188689c12..b992a6a437 100644
--- a/tests/test_data_languages.py
+++ b/tests/test_data_languages.py
@@ -14,14 +14,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pytest
 from collections import Counter
+import re
+
 from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts
+import pytest
+import youseedee
 
 
 LANGUAGES = LoadLanguages()
 SCRIPTS = LoadScripts()
 
+CLDR_SCRIPT_TO_UCD_SCRIPT = {
+    "Bangla": "Bengali",
+    "Traditional Han": "Han",
+    "Simplified Han": "Han",
+    "Korean": "Hangul",
+    "Odia": "Oriya",
+    "Ol Chiki": "Ol_Chiki",
+}
+
 
 @pytest.mark.parametrize("lang_code", LANGUAGES)
 @pytest.mark.parametrize(
@@ -36,6 +48,7 @@ def test_languages_exemplars_duplicates(lang_code, exemplar_name):
 
 
 SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
+ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR
 
 
 @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@@ -57,3 +70,47 @@ def test_script_is_known(lang_code):
     lang = LANGUAGES[lang_code]
     script = lang.script
     assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
+
+
+@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
+def test_exemplars_are_in_script(lang_code):
+    """Check that a language's exemplar characters belong to its script.
+
+    Every exemplar field except ``auxiliary`` is scanned. Characters whose
+    UCD script is ``Common`` or ``Inherited`` are accepted for any script.
+    Languages without exemplars, or whose id contains ``Jpan``, are skipped.
+    """
+    lang = LANGUAGES[lang_code]
+    script_name = SCRIPTS[lang.script].name
+    # UCD script values use underscores where CLDR names use spaces (see the
+    # "Ol Chiki" override above), so normalise names without an override.
+    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(
+        script_name, script_name.replace(" ", "_")
+    )
+    # pytest.skip() raises, so no explicit return is needed after it.
+    if not lang.exemplar_chars.ListFields():
+        pytest.skip("No exemplars for language " + lang_code)
+    if "Jpan" in lang.id:
+        pytest.skip("Too tricky")
+    allowed_scripts = {script_name, "Common", "Inherited"}
+    out_of_script = {}
+    for field in ExemplarChars.fields:
+        if field.name == "auxiliary":
+            continue
+        exemplars = getattr(lang.exemplar_chars, field.name)
+        # A group is a braced sequence such as "{ch}" or a run of non-spaces.
+        for chars in re.findall(r"(\{[^}]+\}|\S+)", exemplars):
+            for char in chars:
+                # Fall back to "Unknown" so a missing Script value cannot
+                # break the join() in the failure message below.
+                char_script = youseedee.ucd_data(ord(char)).get("Script")
+                char_script = char_script or "Unknown"
+                if char_script not in allowed_scripts:
+                    # Record the group once, with its first foreign script.
+                    out_of_script[chars] = char_script
+                    break
+    assert not out_of_script, (
+        f"{lang_code} exemplars contained out-of-script characters"
+        f": {', '.join(out_of_script)}"
+        f" from scripts {', '.join(sorted(set(out_of_script.values())))}"
+    )