From 75486c9bf3c79a6e59cb8e1bf8cb17a33b6e7c08 Mon Sep 17 00:00:00 2001 From: Tomas R Date: Thu, 11 Jul 2024 08:10:19 +0200 Subject: [PATCH] Upgrade to CLDR 45 (#1077) * Upgrade to CLDR 45 * Handle 'localeRules="nonlikelyScript"' for parent locales Locales of the form 'lang_Script' where 'Script' is not the likely script for 'lang' should have 'root' as their parent locale. For example, the parent of 'az_Arab' should not be computed as 'az' by truncating from the end, but should be 'root' instead as 'Arab' is not the likely script for 'az'. The list of such languages was previously specified using an explicit 'locales' attribute. It is now handled dynamically using the new 'localeRules' attribute. --- babel/localedata.py | 28 ++++++++++++++++++++++++++-- scripts/download_import_cldr.py | 8 ++++---- scripts/import_cldr.py | 5 +++++ tests/test_localedata.py | 15 +++++++++++++++ tests/test_numbers.py | 2 +- 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/babel/localedata.py b/babel/localedata.py index a9f7d4bf..2aabfd18 100644 --- a/babel/localedata.py +++ b/babel/localedata.py @@ -95,6 +95,27 @@ def locale_identifiers() -> list[str]: ] +def _is_non_likely_script(name: str) -> bool: + """Return whether the locale is of the form ``lang_Script``, + and the script is not the likely script for the language. + + This implements the behavior of the ``nonlikelyScript`` value of the + ``localeRules`` attribute for parent locales added in CLDR 45. 
+ """ + from babel.core import get_global, parse_locale + + try: + lang, territory, script, variant, *rest = parse_locale(name) + except ValueError: + return False + + if lang and script and not territory and not variant and not rest: + likely_subtag = get_global('likely_subtags').get(lang) + _, _, likely_script, *_ = parse_locale(likely_subtag) + return script != likely_script + return False + + def load(name: os.PathLike[str] | str, merge_inherited: bool = True) -> dict[str, Any]: """Load the locale data for the given locale. @@ -132,8 +153,11 @@ def load(name: os.PathLike[str] | str, merge_inherited: bool = True) -> dict[str from babel.core import get_global parent = get_global('parent_exceptions').get(name) if not parent: - parts = name.split('_') - parent = "root" if len(parts) == 1 else "_".join(parts[:-1]) + if _is_non_likely_script(name): + parent = 'root' + else: + parts = name.split('_') + parent = "root" if len(parts) == 1 else "_".join(parts[:-1]) data = load(parent).copy() filename = resolve_locale_filename(name) with open(filename, 'rb') as fileobj: diff --git a/scripts/download_import_cldr.py b/scripts/download_import_cldr.py index 10a2deae..4a9805dd 100755 --- a/scripts/download_import_cldr.py +++ b/scripts/download_import_cldr.py @@ -9,10 +9,10 @@ import sys import zipfile from urllib.request import urlretrieve -URL = 'https://unicode.org/Public/cldr/44/cldr-common-44.0.zip' -FILENAME = 'cldr-common-44.0.zip' -# Via https://unicode.org/Public/cldr/44/hashes/SHASUM512 -FILESUM = 'f2cd8733948caf308d6e39eae21724da7f29f528f8969d456514e1e84ecd5f1e6936d0460414a968888bb1b597bc1ee723950ea47df5cba21a02bb14f96d18b6' +URL = 'https://unicode.org/Public/cldr/45/cldr-common-45.0.zip' +FILENAME = 'cldr-common-45.0.zip' +# Via https://unicode.org/Public/cldr/45/hashes/SHASUM512.txt +FILESUM = '638123882bd29911fc9492ec152926572fec48eb6c1f5dd706aee3e59cad8be4963a334bb7a09a645dbedc3356f60ef7ac2ef7ab4ccf2c8926b547782175603c' BLKSIZE = 131072 diff --git 
a/scripts/import_cldr.py b/scripts/import_cldr.py index 633ca9a0..761372ac 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -315,6 +315,11 @@ def parse_global(srcdir, sup): for paternity in parentBlock.findall('./parentLocale'): parent = paternity.attrib['parent'] + if parent == 'root': + # Since CLDR-45, the 'root' parent locale uses 'localeRules="nonlikelyScript"' instead of + # 'locales'. This special case is handled in babel when loading locale data + # (https://cldr.unicode.org/index/downloads/cldr-45#h.5rbkhkncdqi9) + continue for child in paternity.attrib['locales'].split(): parent_exceptions[child] = parent diff --git a/tests/test_localedata.py b/tests/test_localedata.py index 8a4fbef1..721b91fb 100644 --- a/tests/test_localedata.py +++ b/tests/test_localedata.py @@ -63,6 +63,21 @@ def test_load(): assert localedata.load('en_US') is localedata.load('en_US') +def test_load_inheritance(monkeypatch): + from babel.localedata import _cache + + _cache.clear() + localedata.load('hi_Latn') + # Must not be ['root', 'hi_Latn'] even though 'hi_Latn' matches the 'lang_Script' + # form used by 'nonlikelyScript'. This is because 'hi_Latn' has an explicit parent locale 'en_IN'. + assert list(_cache.keys()) == ['root', 'en', 'en_001', 'en_IN', 'hi_Latn'] + + _cache.clear() + localedata.load('az_Arab') + # Must not include 'az' as 'Arab' is not a likely script for 'az'. + assert list(_cache.keys()) == ['root', 'az_Arab'] + + def test_merge(): d = {1: 'foo', 3: 'baz'} localedata.merge(d, {1: 'Foo', 2: 'Bar'}) diff --git a/tests/test_numbers.py b/tests/test_numbers.py index ed0531c2..a96bdbeb 100644 --- a/tests/test_numbers.py +++ b/tests/test_numbers.py @@ -250,7 +250,7 @@ def test_list_currencies(): assert list_currencies(locale='pa_Arab') == {'PKR', 'INR', 'EUR'} - assert len(list_currencies()) == 305 + assert len(list_currencies()) == 306 def test_validate_currency(): -- 2.47.2