Move sample text updater from gftools to here

author Simon Cozens <simon@simon-cozens.org>

Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)

committer Simon Cozens <simon@simon-cozens.org>

Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)
author Simon Cozens <simon@simon-cozens.org>
Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)
committer Simon Cozens <simon@simon-cozens.org>
Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)
diff --git a/Lib/gflanguages/udhr.py b/Lib/gflanguages/udhr.py

new file mode 100644 (file)

index 0000000..ea076be
--- /dev/null
+++ b/Lib/gflanguages/udhr.py
@@ -0,0 +1,252 @@
+from gflanguages import languages_public_pb2
+import enum
+import re
+
+
+class Udhr:
+    """A single UDHR translation: its identifying metadata and parsed text.
+
+    The text fields (`title`, `preamble`, `articles`) start out empty and are
+    populated by `Parse` or `LoadArticleOne`.
+    """
+
+    def __init__(
+        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
+    ):
+        """Store the metadata values unchanged and reset the text fields."""
+        # Text content; filled in later by Parse / LoadArticleOne.
+        self.title = None
+        self.preamble = None
+        self.articles = []
+
+        # Metadata, kept exactly as supplied by the caller.
+        self.key = key
+        self.iso639_3 = iso639_3
+        self.iso15924 = iso15924
+        self.bcp47 = bcp47
+        self.direction = direction
+        self.ohchr = ohchr
+        self.stage = stage
+        self.loc = loc
+        self.name = name
+
+    def Parse(self, translation_data):
+        """Read title, preamble and articles out of a parsed translation.
+
+        Does nothing when `translation_data` is None or when `self.stage` is
+        below 2. Elements are located with the `{*}` namespace wildcard, so the
+        document's namespace does not matter.
+
+        Args:
+            translation_data: object offering `find`/`findall` with ElementPath
+                expressions (presumably an lxml/ElementTree tree or element --
+                confirm against callers).
+        """
+        if translation_data is None:
+            return
+        if self.stage < 2:
+            return
+
+        doc_title = translation_data.find("./{*}title")
+        if doc_title is not None:
+            self.title = doc_title.text
+
+        # A preamble is only recorded when it carries its own title.
+        preamble_node = translation_data.find("./{*}preamble")
+        preamble_title = (
+            None if preamble_node is None else preamble_node.find("./{*}title")
+        )
+        if preamble_title is not None:
+            self.preamble = {
+                "title": preamble_title.text,
+                "content": [
+                    node.text for node in preamble_node.findall("./{*}para")
+                ],
+            }
+
+        for article_node in translation_data.findall("./{*}article"):
+            heading = article_node.find("./{*}title")
+            self.articles.append(
+                {
+                    "id": int(article_node.get("number")),
+                    "title": heading.text if heading is not None else None,
+                    "content": [
+                        node.text for node in article_node.findall("./{*}para")
+                    ],
+                }
+            )
+
+    def LoadArticleOne(self, article_one):
+        """Append `article_one` as a single-paragraph, untitled article with id 0."""
+        entry = {"id": 0, "title": None, "content": [article_one]}
+        self.articles.append(entry)
+
+    def GetSampleTexts(self):
+        """Return the sample texts built from this translation.
+
+        Delegates to a fresh `SampleTextExtractor` wrapped around this object.
+        """
+        return SampleTextExtractor(self).GetSampleTexts()
+
+
+class SampleTextExtractor:
+    """Derives sample-text strings of requested lengths from a parsed Udhr.
+
+    Glyphs, words and paragraphs are each consumed through one shared iterator,
+    so successive extractions continue where the previous one stopped. Most
+    extractors fall back to a fresh pass over the text once the shared iterator
+    is exhausted.
+    """
+
+    class TextType(enum.Enum):
+        """Kinds of sample text that `_Get` knows how to produce."""
+
+        GLYPHS = 1
+        WORD = 2
+        PHRASE = 3
+        SENTENCE = 4
+        PARAGRAPH = 5
+        PASSAGE = 6
+
+    def __init__(self, udhr):
+        """Set up the regexes and the shared iterators over `udhr`'s text.
+
+        Args:
+            udhr: object exposing `articles`, `preamble` and `key` in the shape
+                `Udhr` stores them.
+        """
+        self._udhr = udhr
+
+        # Compiled before the generators are created so that no generator can
+        # ever be advanced ahead of the patterns it relies on.
+        self._non_word_regex = re.compile(r"[^\w]+")
+        self._space_regex = re.compile(r"\s+")
+        self._non_space_regex = re.compile(r"[^\s]+")
+        self._non_word_space_regex = re.compile(r"[^\w\s]+")
+        self._any_regex = re.compile(r".")
+
+        self._glyphs = iter(self._GetGlyphs())
+        self._words = iter(self._GetWords())
+        self._paragraphs = iter(self._GetParagraphs())
+        # Phrases already handed out by _ExtractPhrase, to avoid repeats.
+        self._phrase_history = set()
+
+    def _DisplayLength(self, s):
+        """Returns the number of word and whitespace characters in `s`.
+
+        Characters matching neither `\\w` nor `\\s` (e.g. punctuation) are not
+        counted. When `s` contains no `\\w` characters at all, the raw length
+        of the string is returned instead.
+        """
+        word_space_length = len(self._non_word_space_regex.sub("", s))
+        space_length = len(self._non_space_regex.sub("", s))
+        if word_space_length == space_length:
+            return len(s)
+        return word_space_length
+
+    def _GetGlyphs(self):
+        """Yields each distinct lower-cased character of the articles once.
+
+        Per paragraph the word characters are used; if a paragraph has none,
+        its non-whitespace characters are used instead. Paragraphs that are
+        None are skipped.
+        """
+        seen = set()
+        for article in self._udhr.articles:
+            for para in article["content"]:
+                if para is None:
+                    # Parse stores para.text as-is, which may be None.
+                    continue
+                for ch in self._non_word_regex.sub("", para) or self._space_regex.sub(
+                    "", para
+                ):
+                    ch = ch.lower()
+                    if ch not in seen:
+                        seen.add(ch)
+                        yield ch
+
+    def _GetWords(self):
+        """Yields each distinct non-empty token of the articles once.
+
+        Text is split on whitespace when the first available paragraph contains
+        any; otherwise on runs of non-word characters. Yields nothing when
+        there is no article text.
+        """
+        paragraphs = [
+            para
+            for article in self._udhr.articles
+            for para in article["content"]
+            if para is not None
+        ]
+        if not paragraphs:
+            return
+
+        if self._space_regex.search(paragraphs[0]) is not None:
+            splitter = self._space_regex
+        else:
+            splitter = self._non_word_regex
+
+        seen = set()
+        for para in paragraphs:
+            for s in splitter.split(para):
+                # re.split yields "" at leading/trailing separators; an empty
+                # token is never a useful word.
+                if s and s not in seen:
+                    seen.add(s)
+                    yield s
+
+    def _GetParagraphs(self):
+        """Yields the preamble paragraphs (if any), then every article paragraph."""
+        if self._udhr.preamble is not None:
+            for para in self._udhr.preamble["content"]:
+                yield para
+        for article in self._udhr.articles:
+            for para in article["content"]:
+                yield para
+
+    def _ExtractGlyphs(self, min_chars, max_chars):
+        """Returns a string of glyphs at least `min_chars` long when possible.
+
+        Each glyph taken from the shared glyph iterator contributes its
+        upper-case form, followed by the glyph itself when that differs. The
+        result is shorter than `min_chars` if the iterator runs out.
+        `max_chars` is accepted for signature parity and is not used.
+        """
+        s = ""
+        for ch in self._glyphs:
+            s += ch.upper()
+            if len(s) >= min_chars:
+                break
+            if ch != ch.upper():
+                s += ch
+                if len(s) >= min_chars:
+                    break
+        return s
+
+    def _ExtractWord(self, min_chars, max_chars):
+        """Returns the next word whose display length is within the bounds.
+
+        Searches the shared word iterator first, then a fresh pass over all
+        words; falls back to `_ExtractPhrase` when no word fits.
+        """
+        for iterator in [self._words, self._GetWords()]:
+            for w in iterator:
+                if w is None:
+                    continue
+                if min_chars <= self._DisplayLength(w) <= max_chars:
+                    return w
+        # Fallback to using multiple words for languages with very small words
+        return self._ExtractPhrase(min_chars, max_chars)
+
+    def _ExtractPhrase(self, min_chars, max_chars):
+        """Returns a not-yet-used paragraph prefix within the length bounds.
+
+        For each paragraph, candidate end points are taken from three patterns
+        in turn (any character, whitespace runs, non-word runs), starting at
+        offset `min_chars`. The first prefix whose display length lies in
+        `[min_chars, max_chars]` and that has not been returned before wins.
+        Falls back to `_ExtractParagraph` when nothing qualifies.
+        """
+        for iterator in [self._paragraphs, self._GetParagraphs()]:
+            for para in iterator:
+                if para is None:
+                    continue
+                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
+                    # breaks[0] is a sentinel: breaks[0] + 1 == 0 is the phrase
+                    # start; later entries are candidate end offsets.
+                    breaks = [-1]
+                    for match in regex.finditer(para, min_chars):
+                        breaks.append(match.start())
+                        phrase = para[breaks[0] + 1 : breaks[-1]]
+                        p_size = self._DisplayLength(phrase)
+                        while p_size > max_chars and len(breaks) > 1:
+                            breaks.pop()
+                            phrase = para[breaks[0] + 1 : breaks[-1]]
+                            p_size = self._DisplayLength(phrase)
+                        # The upper bound is re-checked here: once only the
+                        # sentinel is left, the slice above is para[0:-1],
+                        # which can be far longer than max_chars.
+                        if (
+                            min_chars <= p_size <= max_chars
+                            and phrase not in self._phrase_history
+                        ):
+                            self._phrase_history.add(phrase)
+                            return phrase
+        return self._ExtractParagraph(min_chars, max_chars)
+
+    def _ExtractSentence(self, min_chars, max_chars):
+        """Returns a sentence-sized sample; currently the same as a phrase."""
+        # Sentence delimitation may differ between scripts, so tokenizing on spaces
+        # would be unreliable. Prefer to use _ExtractPhrase.
+        return self._ExtractPhrase(min_chars, max_chars)
+
+    def _ExtractParagraph(self, min_chars, max_chars):
+        """Returns the next whole paragraph whose display length is in bounds.
+
+        Searches the shared paragraph iterator first, then a fresh pass over
+        all paragraphs; falls back to `_ExtractPassage` when none fits.
+        """
+        for iterator in [self._paragraphs, self._GetParagraphs()]:
+            for para in iterator:
+                if para is None:
+                    continue
+                if min_chars <= self._DisplayLength(para) <= max_chars:
+                    return para
+        # Paragraphs likely insufficient length; try combining into passages
+        return self._ExtractPassage(min_chars, max_chars)
+
+    def _ExtractPassage(self, min_chars, max_chars):
+        """Returns newline-joined paragraphs totalling `min_chars`..`max_chars`.
+
+        Paragraphs are added greedily; one that would push the passage past
+        `max_chars` is skipped. Passes over the text repeat (so paragraphs may
+        be reused) until the minimum is reached or a whole pass adds nothing,
+        in which case the shorter passage collected so far is returned.
+
+        Raises:
+            AssertionError: if not even one paragraph fits within `max_chars`.
+        """
+        p = []
+        p_size = 0
+        while p_size < min_chars:
+            added = False
+            for iterator in [self._paragraphs, self._GetParagraphs()]:
+                for para in iterator:
+                    if para is None:
+                        continue
+                    p.append(para)
+                    p_size = self._DisplayLength(" ".join(p))
+                    if max_chars < p_size:
+                        p.pop()
+                        # Keep p_size in step with p so the loop condition is
+                        # never evaluated against a discarded candidate.
+                        p_size = self._DisplayLength(" ".join(p))
+                    else:
+                        added = True
+                        if min_chars <= p_size:
+                            return "\n".join(p)
+            if not added:
+                # Nothing fits any more (or there is no text at all); another
+                # pass could not make progress and would loop forever.
+                break
+        if not p:
+            # Raised explicitly rather than via `assert` so the check survives
+            # `python -O`; the exception type is kept for existing callers.
+            raise AssertionError(f"Unable to extract passage: {self._udhr.key}")
+        return "\n".join(p)
+
+    def _Get(self, text_type, **kwargs):
+        """Dispatches to the extractor matching `text_type`.
+
+        Args:
+            text_type: a `TextType` member.
+            **kwargs: either `char_count` (used as both bounds) or both
+                `min_chars` and `max_chars`.
+
+        Raises:
+            KeyError: if neither `char_count` nor both bounds are supplied.
+            ValueError: if `text_type` is not a supported `TextType`.
+        """
+        if "char_count" in kwargs:
+            min_chars = kwargs["char_count"]
+            max_chars = kwargs["char_count"]
+        else:
+            min_chars = kwargs["min_chars"]
+            max_chars = kwargs["max_chars"]
+        if text_type == self.TextType.GLYPHS:
+            return self._ExtractGlyphs(min_chars, max_chars)
+        if text_type == self.TextType.WORD:
+            return self._ExtractWord(min_chars, max_chars)
+        if text_type == self.TextType.PHRASE:
+            return self._ExtractPhrase(min_chars, max_chars)
+        if text_type == self.TextType.SENTENCE:
+            return self._ExtractSentence(min_chars, max_chars)
+        if text_type == self.TextType.PARAGRAPH:
+            return self._ExtractParagraph(min_chars, max_chars)
+        if text_type == self.TextType.PASSAGE:
+            return self._ExtractPassage(min_chars, max_chars)
+        # Formatted rather than concatenated: `str + Enum` raises TypeError and
+        # would hide the real problem.
+        raise ValueError(f"Unsupported text type: {text_type}")
+
+    def GetSampleTexts(self):
+        """Builds a `SampleTextProto` with every sample-text field filled in.
+
+        The extraction order matters: the extractors share iterators and a
+        phrase history, so each call continues from where the previous one
+        stopped.
+        """
+        sample_text = languages_public_pb2.SampleTextProto()
+        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
+        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
+        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
+        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
+        sample_text.poster_sm = self._Get(
+            self.TextType.PHRASE, min_chars=10, max_chars=17
+        )
+        sample_text.poster_md = self._Get(
+            self.TextType.PHRASE, min_chars=6, max_chars=12
+        )
+        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
+        sample_text.specimen_48 = self._Get(
+            self.TextType.SENTENCE, min_chars=50, max_chars=80
+        )
+        sample_text.specimen_36 = self._Get(
+            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
+        )
+        sample_text.specimen_32 = self._Get(
+            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
+        )
+        sample_text.specimen_21 = self._Get(
+            self.TextType.PASSAGE, min_chars=300, max_chars=500
+        )
+        sample_text.specimen_16 = self._Get(
+            self.TextType.PASSAGE, min_chars=550, max_chars=750
+        )
+        return sample_text
diff --git a/snippets/lang_sample_text.py b/snippets/lang_sample_text.py

new file mode 100755 (executable)

index 0000000..10b3891
--- /dev/null
+++ b/snippets/lang_sample_text.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+lang-sample-text
+
+Adds sample text for a given language using the specified UDHR translation.
+
+Usage:
+
+lang-sample-text -l ./languages/en.textproto -u ./udhr_translations/en.xml
+
+"""
+
+from gflanguages import LoadLanguages, languages_public_pb2
+from gftools.util.google_fonts import ReadProto, WriteProto
+from gflanguages.udhr import Udhr
+from lxml import etree
+import os
+import re
+import argparse
+
+
+def main(argv=None):
+    """Regenerate a language proto's sample text from a UDHR translation.
+
+    Reads the language textproto given by `--lang`, parses the UDHR XML given
+    by `--udhr`, merges the extracted sample texts into the language's
+    `sample_text` field and writes the proto back to the same path.
+
+    Args:
+        argv: argument list to parse; None lets argparse use `sys.argv`.
+    """
+    parser = argparse.ArgumentParser(
+        description="Update UDHR sample text for a given language"
+    )
+    parser.add_argument(
+        "-l",
+        "--lang",
+        help="Language proto file to update",
+        required=True,
+    )
+    parser.add_argument(
+        "-u",
+        "--udhr",
+        help="Path to UDHR translation (XML)",
+        required=True,
+    )
+    args = parser.parse_args(argv)
+
+    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)
+
+    udhr_data = etree.parse(args.udhr)
+    head = udhr_data.getroot()
+    # Start from None so a root element without a namespaced `lang` attribute
+    # (e.g. xml:lang) does not leave `bcp47` unbound below.
+    bcp47 = None
+    for name, value in head.attrib.items():
+        if re.search(r"\{.*\}lang", name):
+            bcp47 = value.replace("-", "_")
+    udhr = Udhr(
+        key=head.get("key"),
+        iso639_3=head.get("iso639-3"),
+        iso15924=head.get("iso15924"),
+        bcp47=bcp47,
+        direction=head.get("dir"),
+        ohchr=None,
+        # Udhr.Parse ignores translations whose stage is below 2.
+        stage=4,
+        loc=None,
+        name=head.get("n"),
+    )
+    udhr.Parse(udhr_data)
+
+    language.sample_text.MergeFrom(udhr.GetSampleTexts())
+    WriteProto(language, args.lang)
+
+if __name__ == "__main__":
+    main()
author	Simon Cozens <simon@simon-cozens.org>
	Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)
committer	Simon Cozens <simon@simon-cozens.org>
	Thu, 12 Sep 2024 09:53:32 +0000 (10:53 +0100)
Lib/gflanguages/udhr.py	[new file with mode: 0644]	patch \| blob
snippets/lang_sample_text.py	[new file with mode: 0755]	patch \| blob