]> git.ipfire.org Git - thirdparty/google/fonts.git/commitdiff
Test to detect out-of-script exemplars
authorSimon Cozens <simon@simon-cozens.org>
Fri, 11 Nov 2022 11:45:12 +0000 (11:45 +0000)
committerSimon Cozens <simon@simon-cozens.org>
Fri, 11 Nov 2022 11:45:12 +0000 (11:45 +0000)
dev-requirements.txt
tests/test_data_languages.py

index 9acb19bea6947930b8fea05cf50fc7ece2946d4b..73b0496d54ca0288dfde1ea0aab98870723a4fd6 100644 (file)
@@ -1 +1,2 @@
 uharfbuzz
+youseedee
index d188689c121feacb19a78303297a17c35b1070ff..b992a6a4372b180670e8d33efe497ba667976c18 100644 (file)
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pytest
 from collections import Counter
+import re
+
 from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts
+import pytest
+import youseedee
 
 
 LANGUAGES = LoadLanguages()
 SCRIPTS = LoadScripts()
 
+CLDR_SCRIPT_TO_UCD_SCRIPT = {
+    "Bangla": "Bengali",
+    "Traditional Han": "Han",
+    "Simplified Han": "Han",
+    "Korean": "Hangul",
+    "Odia": "Oriya",
+    "Ol Chiki": "Ol_Chiki",
+}
+
 
 @pytest.mark.parametrize("lang_code", LANGUAGES)
 @pytest.mark.parametrize(
@@ -36,6 +48,7 @@ def test_languages_exemplars_duplicates(lang_code, exemplar_name):
 
 
 SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
+ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR
 
 
 @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@@ -57,3 +70,35 @@ def test_script_is_known(lang_code):
     lang = LANGUAGES[lang_code]
     script = lang.script
     assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
+
+
+@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
+def test_exemplars_are_in_script(lang_code):
+    lang = LANGUAGES[lang_code]
+    script_name = SCRIPTS[lang.script].name
+    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
+    if not lang.exemplar_chars.ListFields():
+        pytest.skip("No exemplars for language " + lang_code)
+        return
+    if "Jpan" in lang.id:
+        pytest.skip("Too tricky")
+        return
+    out_of_script = {}
+    for field in ExemplarChars.fields:
+        if field.name == "auxiliary":
+            continue
+        exemplars = getattr(lang.exemplar_chars, field.name)
+        group_of_chars = re.findall(r"(\{[^}]+\}|\S+)", exemplars)
+        for chars in group_of_chars:
+            for char in chars:
+                char_script = youseedee.ucd_data(ord(char)).get("Script")
+                if char_script == "Common" or char_script == "Inherited":
+                    continue
+                if char_script != script_name:
+                    out_of_script[chars] = char_script
+                    break
+    assert not out_of_script, (
+        f"{lang_code} exemplars contained out-of-script characters"
+        f": {', '.join(out_of_script.keys())}"
+        f" from scripts {', '.join(set(out_of_script.values()))}"
+    )