From: Simon Cozens <simon@simon-cozens.org>
Date: Fri, 11 Nov 2022 11:45:12 +0000 (+0000)
Subject: Test to detect out-of-script exemplars
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=54b65167150d9e11994a7ace74e488efae909919;p=thirdparty%2Fgoogle%2Ffonts.git

Test to detect out-of-script exemplars
---

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 9acb19bea6..73b0496d54 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1 +1,2 @@
 uharfbuzz
+youseedee
diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py
index d188689c12..b992a6a437 100644
--- a/tests/test_data_languages.py
+++ b/tests/test_data_languages.py
@@ -14,14 +14,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pytest
 from collections import Counter
+import re
+
 from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts
+import pytest
+import youseedee
 
 
 LANGUAGES = LoadLanguages()
 SCRIPTS = LoadScripts()
 
+CLDR_SCRIPT_TO_UCD_SCRIPT = {  # SCRIPTS name -> name compared with UCD "Script"
+    "Bangla": "Bengali",
+    "Traditional Han": "Han",
+    "Simplified Han": "Han",
+    "Korean": "Hangul",
+    "Odia": "Oriya",
+    "Ol Chiki": "Ol_Chiki",
+}
+
 
 @pytest.mark.parametrize("lang_code", LANGUAGES)
 @pytest.mark.parametrize(
@@ -36,6 +48,7 @@ def test_languages_exemplars_duplicates(lang_code, exemplar_name):
 
 
 SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
+ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR  # its .fields name each exemplar set
 
 
 @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@@ -57,3 +70,35 @@ def test_script_is_known(lang_code):
     lang = LANGUAGES[lang_code]
     script = lang.script
     assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
+
+
+@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
+def test_exemplars_are_in_script(lang_code):
+    """Fail if a non-auxiliary exemplar uses a character from another script."""
+    lang = LANGUAGES[lang_code]
+    script_name = SCRIPTS[lang.script].name
+    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
+    if not lang.exemplar_chars.ListFields():
+        pytest.skip("No exemplars for language " + lang_code)  # raises; no return
+    if "Jpan" in lang.id:
+        pytest.skip("Too tricky")
+    out_of_script = {}  # offending exemplar group -> first foreign script found
+    for field in ExemplarChars.fields:
+        if field.name == "auxiliary":
+            continue
+        exemplars = getattr(lang.exemplar_chars, field.name)
+        group_of_chars = re.findall(r"(\{[^}]+\}|\S+)", exemplars)
+        for chars in group_of_chars:
+            for char in chars:
+                # Fall back to a string so the failure message can always be joined.
+                char_script = youseedee.ucd_data(ord(char)).get("Script") or "Unknown"
+                if char_script in ("Common", "Inherited"):
+                    continue
+                if char_script != script_name:
+                    out_of_script[chars] = char_script
+                    break
+    assert not out_of_script, (
+        f"{lang_code} exemplars contained out-of-script characters"
+        f": {', '.join(out_of_script)}"
+        f" from scripts {', '.join(sorted(set(out_of_script.values())))}"
+    )