From: Armin Ronacher Date: Wed, 24 Jul 2013 18:40:16 +0000 (+0200) Subject: Added basic likely-subtag resolving X-Git-Tag: 1.0~45 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b7c5e1585b52ea110a5e884151d4eaa2175985d;p=thirdparty%2Fbabel.git Added basic likely-subtag resolving --- diff --git a/ChangeLog b/ChangeLog index 952f1497..fda9c009 100644 --- a/ChangeLog +++ b/ChangeLog @@ -70,6 +70,8 @@ Version 1.0 * Added experimental Python 3 support. * Added better support for returning timezone names. * Don't throw away a Catalog's obsolete messages when updating it. + * Added basic likelySubtag resolving when doing locale parsing and no + match can be found. Version 0.9.6 diff --git a/babel/core.py b/babel/core.py index d22ccaa1..e5fe4bd0 100644 --- a/babel/core.py +++ b/babel/core.py @@ -194,7 +194,7 @@ class Locale(object): return Locale.parse(identifier, sep=sep) @classmethod - def parse(cls, identifier, sep='_'): + def parse(cls, identifier, sep='_', resolve_likely_subtags=True): """Create a `Locale` instance for the given locale identifier. >>> l = Locale.parse('de-DE', sep='-') @@ -207,8 +207,22 @@ class Locale(object): >>> Locale.parse(l) Locale('de', territory='DE') + This also can perform resolving of likely subtags which it does + by default. + :param identifier: the locale identifier string :param sep: optional component separator + :param resolve_likely_subtags: if this is specified then a locale will + have its likely subtag resolved if the + locale otherwise does not exist. For + instance ``zh_TW`` by itself is not a + locale that exists but Babel can + automatically expand it to the full + form of ``zh_hant_TW``. Note that this + expansion is only taking place if no + locale exists otherwise. For instance + there is a locale ``en`` that can exist + by itself. 
:return: a corresponding `Locale` instance :rtype: `Locale` :raise `ValueError`: if the string does not appear to be a valid locale @@ -217,9 +231,72 @@ class Locale(object): requested locale :see: `parse_locale` """ - if isinstance(identifier, string_types): - return cls(*parse_locale(identifier, sep=sep)) - return identifier + if identifier is None: + return None + elif isinstance(identifier, Locale): + return identifier + elif not isinstance(identifier, string_types): + raise TypeError('Unexpected value for identifier: %r' % (identifier,)) + + parts = parse_locale(identifier, sep=sep) + + def _make_id(language, territory, script, variant): + return '_'.join(filter(None, [language, script, + territory, variant])) + + input_id = _make_id(*parts) + + def _try_load(parts): + try: + return cls(*parts) + except UnknownLocaleError: + return None + + locale = _try_load(parts) + if locale is not None: + return locale + if not resolve_likely_subtags: + raise UnknownLocaleError(input_id) + + # From here onwards is some very bad likely subtag resolving. This + # whole logic is not entirely correct but good enough (tm) for the + # time being. This has been added so that zh_TW does not cause + # errors for people when they upgrade. Later we should properly + # implement ICU like fuzzy locale objects and provide a way to + # maximize and minimize locale tags. 
+ + language, territory, script, variant = parts + language = get_global('language_aliases').get(language, language) + territory = get_global('territory_aliases').get(territory, territory) + script = get_global('script_aliases').get(script, script) + variant = get_global('variant_aliases').get(variant, variant) + + if territory == 'ZZ': + territory = None + if script == 'Zzzz': + script = None + + parts = language, territory, script, variant + + new_id = _make_id(*parts) + likely_subtag = get_global('likely_subtags').get(new_id) + if likely_subtag is None: + raise UnknownLocaleError(input_id) + + parts2 = parse_locale(likely_subtag) + + # Success on first hit, return it. + locale = _try_load(parts2) + if locale is not None: + return locale + + # Now try without script and variant + locale = _try_load(parts2[:2]) + if locale is not None: + return locale + + # Give up. + raise UnknownLocaleError(input_id) def __eq__(self, other): for key in ('language', 'territory', 'script', 'variant'): diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index f5128614..84b2b1dd 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -109,6 +109,10 @@ def main(): bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml')) sup_windows_zones = parse(os.path.join(srcdir, 'supplemental', 'windowsZones.xml')) + sup_metadata = parse(os.path.join(srcdir, 'supplemental', + 'supplementalMetadata.xml')) + sup_likely = parse(os.path.join(srcdir, 'supplemental', + 'likelySubtags.xml')) sup = parse(sup_filename) # Import global data from the supplemental files @@ -119,11 +123,16 @@ def main(): zone_aliases = global_data.setdefault('zone_aliases', {}) zone_territories = global_data.setdefault('zone_territories', {}) win_mapping = global_data.setdefault('windows_zone_mapping', {}) - - # create auxiliary zone->territory map from the windows zones (we don't set - # the 'zones_territories' map directly here, because there are some zones - # aliases listed and we defer 
the decision of which ones to choose to the - # 'bcp47' data + language_aliases = global_data.setdefault('language_aliases', {}) + territory_aliases = global_data.setdefault('territory_aliases', {}) + script_aliases = global_data.setdefault('script_aliases', {}) + variant_aliases = global_data.setdefault('variant_aliases', {}) + likely_subtags = global_data.setdefault('likely_subtags', {}) + + # create auxiliary zone->territory map from the windows zones (we don't set + # the 'zones_territories' map directly here, because there are some zones + # aliases listed and we defer the decision of which ones to choose to the + # 'bcp47' data _zone_territory_map = {} for map_zone in sup_windows_zones.findall('.//windowsZones/mapTimezones/mapZone'): if map_zone.attrib.get('territory') == '001': @@ -151,6 +160,32 @@ def main(): if 'to' not in child.attrib: # FIXME: support old mappings meta_zones[elem.attrib['type']] = child.attrib['mzone'] + # Language aliases + for alias in sup_metadata.findall('.//alias/languageAlias'): + # We don't have a use for those at the moment. They don't + # pass our parser anyways. 
+ if '-' in alias.attrib['type']: + continue + language_aliases[alias.attrib['type']] = alias.attrib['replacement'] + + # Territory aliases + for alias in sup_metadata.findall('.//alias/territoryAlias'): + territory_aliases[alias.attrib['type']] = alias.attrib['replacement'].split() + + # Script aliases + for alias in sup_metadata.findall('.//alias/scriptAlias'): + script_aliases[alias.attrib['type']] = alias.attrib['replacement'] + + # Variant aliases + for alias in sup_metadata.findall('.//alias/variantAlias'): + repl = alias.attrib.get('replacement') + if repl: + variant_aliases[alias.attrib['type']] = repl + + # Likely subtags + for likely_subtag in sup_likely.findall('.//likelySubtags/likelySubtag'): + likely_subtags[likely_subtag.attrib['from']] = likely_subtag.attrib['to'] + outfile = open(global_path, 'wb') try: pickle.dump(global_data, outfile, 2) diff --git a/tests/test_core.py b/tests/test_core.py index 224ebc83..7e13bf6b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -91,6 +91,12 @@ class TestLocaleClass: de_DE = Locale.parse(l) assert (de_DE.language, de_DE.territory) == ('de', 'DE') + def test_parse_likely_subtags(self): + l = Locale.parse('zh-TW', sep='-') + assert l.language == 'zh' + assert l.territory == 'TW' + assert l.script == 'Hant' + def test_get_display_name(self): zh_CN = Locale('zh', 'CN', script='Hans') assert zh_CN.get_display_name('en') == 'Chinese (Simplified, China)'