]> git.ipfire.org Git - thirdparty/babel.git/commitdiff
Added basic likely-subtag resolving
authorArmin Ronacher <armin.ronacher@active-4.com>
Wed, 24 Jul 2013 18:40:16 +0000 (20:40 +0200)
committerArmin Ronacher <armin.ronacher@active-4.com>
Wed, 24 Jul 2013 18:40:16 +0000 (20:40 +0200)
ChangeLog
babel/core.py
scripts/import_cldr.py
tests/test_core.py

index 952f14971793b03e3ba8fad7665e4781abb7787e..fda9c009fe4b5a0df803bdd3f9ff1a0c53983a44 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -70,6 +70,8 @@ Version 1.0
  * Added experimental Python 3 support.
  * Added better support for returning timezone names.
  * Don't throw away a Catalog's obsolete messages when updating it.
+ * Added basic likelySubtag resolving when doing locale parsing and no
+   match can be found.
 
 
 Version 0.9.6
index d22ccaa1c3c8999fb0d97ad4e5181a38f9f3bf85..e5fe4bd0bcf03114aea70ef92045066b03db7e79 100644 (file)
@@ -194,7 +194,7 @@ class Locale(object):
             return Locale.parse(identifier, sep=sep)
 
     @classmethod
-    def parse(cls, identifier, sep='_'):
+    def parse(cls, identifier, sep='_', resolve_likely_subtags=True):
         """Create a `Locale` instance for the given locale identifier.
 
         >>> l = Locale.parse('de-DE', sep='-')
@@ -207,8 +207,22 @@ class Locale(object):
         >>> Locale.parse(l)
         Locale('de', territory='DE')
 
+        Likely subtags can also be resolved during parsing; this is
+        enabled by default.
+
         :param identifier: the locale identifier string
         :param sep: optional component separator
+        :param resolve_likely_subtags: if this is specified then a locale will
+                                       have its likely subtag resolved if the
+                                       locale otherwise does not exist.  For
+                                       instance ``zh_TW`` by itself is not a
+                                       locale that exists but Babel can
+                                       automatically expand it to the full
+                                       form of ``zh_Hant_TW``.  Note that this
+                                       expansion is only taking place if no
+                                       locale exists otherwise.  For instance
+                                       there is a locale ``en`` that can exist
+                                       by itself.
         :return: a corresponding `Locale` instance
         :rtype: `Locale`
         :raise `ValueError`: if the string does not appear to be a valid locale
@@ -217,9 +231,72 @@ class Locale(object):
                                      requested locale
         :see: `parse_locale`
         """
-        if isinstance(identifier, string_types):
-            return cls(*parse_locale(identifier, sep=sep))
-        return identifier
+        if identifier is None:
+            return None
+        elif isinstance(identifier, Locale):
+            return identifier
+        elif not isinstance(identifier, string_types):
+            raise TypeError('Unexpected value for identifier: %r' % (identifier,))
+
+        parts = parse_locale(identifier, sep=sep)
+
+        def _make_id(language, territory, script, variant):
+            return '_'.join(filter(None, [language, script,
+                                          territory, variant]))
+
+        input_id = _make_id(*parts)
+
+        def _try_load(parts):
+            try:
+                return cls(*parts)
+            except UnknownLocaleError:
+                return None
+
+        locale = _try_load(parts)
+        if locale is not None:
+            return locale
+        if not resolve_likely_subtags:
+            raise UnknownLocaleError(input_id)
+
+        # From here onwards is some very bad likely subtag resolving.  This
+        # whole logic is not entirely correct but good enough (tm) for the
+        # time being.  This has been added so that zh_TW does not cause
+        # errors for people when they upgrade.  Later we should properly
+        # implement ICU like fuzzy locale objects and provide a way to
+        # maximize and minimize locale tags.
+
+        language, territory, script, variant = parts
+        language = get_global('language_aliases').get(language, language)
+        # Territory aliases are stored as lists; take the first replacement.
+        territory = get_global('territory_aliases').get(territory, (territory,))[0]
+        script = get_global('script_aliases').get(script, script)
+        variant = get_global('variant_aliases').get(variant, variant)
+
+        # CLDR uses 'ZZ' / 'Zzzz' for an unknown territory / script.
+        if territory == 'ZZ':
+            territory = None
+        if script == 'Zzzz':
+            script = None
+
+        new_id = _make_id(language, territory, script, variant)
+        likely_subtag = get_global('likely_subtags').get(new_id)
+        if likely_subtag is None:
+            raise UnknownLocaleError(input_id)
+
+        parts2 = parse_locale(likely_subtag)
+
+        # Success on first hit, return it.
+        locale = _try_load(parts2)
+        if locale is not None:
+            return locale
+
+        # Now try without script and variant (language and territory only).
+        locale = _try_load(parts2[:2])
+        if locale is not None:
+            return locale
+
+        # Give up.
+        raise UnknownLocaleError(input_id)
 
     def __eq__(self, other):
         for key in ('language', 'territory', 'script', 'variant'):
index f5128614e0ce83becf9499f0f88d8a6a89cf5898..84b2b1ddf6ef223aa432f4da67d6e47c041fb433 100755 (executable)
@@ -109,6 +109,10 @@ def main():
     bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml'))
     sup_windows_zones = parse(os.path.join(srcdir, 'supplemental',
                                            'windowsZones.xml'))
+    sup_metadata = parse(os.path.join(srcdir, 'supplemental',
+                                      'supplementalMetadata.xml'))
+    sup_likely = parse(os.path.join(srcdir, 'supplemental',
+                                    'likelySubtags.xml'))
     sup = parse(sup_filename)
 
     # Import global data from the supplemental files
@@ -119,11 +123,16 @@ def main():
         zone_aliases = global_data.setdefault('zone_aliases', {})
         zone_territories = global_data.setdefault('zone_territories', {})
         win_mapping = global_data.setdefault('windows_zone_mapping', {})
-
-         # create auxiliary zone->territory map from the windows zones (we don't set
-         # the 'zones_territories' map directly here, because there are some zones
-         # aliases listed and we defer the decision of which ones to choose to the
-         # 'bcp47' data
+        language_aliases = global_data.setdefault('language_aliases', {})
+        territory_aliases = global_data.setdefault('territory_aliases', {})
+        script_aliases = global_data.setdefault('script_aliases', {})
+        variant_aliases = global_data.setdefault('variant_aliases', {})
+        likely_subtags = global_data.setdefault('likely_subtags', {})
+
+        # create auxiliary zone->territory map from the windows zones (we don't set
+        # the 'zones_territories' map directly here, because there are some zones
+        # aliases listed and we defer the decision of which ones to choose to the
+        # 'bcp47' data
         _zone_territory_map = {}
         for map_zone in sup_windows_zones.findall('.//windowsZones/mapTimezones/mapZone'):
             if map_zone.attrib.get('territory') == '001':
@@ -151,6 +160,32 @@ def main():
                 if 'to' not in child.attrib: # FIXME: support old mappings
                     meta_zones[elem.attrib['type']] = child.attrib['mzone']
 
+        # Language aliases: map each alias 'type' to its 'replacement'
+        for alias in sup_metadata.findall('.//alias/languageAlias'):
+            # Skip hyphenated alias types: we have no use for them at the
+            # moment and they don't pass our parser anyway.
+            if '-' in alias.attrib['type']:
+                continue
+            language_aliases[alias.attrib['type']] = alias.attrib['replacement']
+
+        # Territory aliases: 'replacement' is split on whitespace, so each value is a list
+        for alias in sup_metadata.findall('.//alias/territoryAlias'):
+            territory_aliases[alias.attrib['type']] = alias.attrib['replacement'].split()
+
+        # Script aliases: map each alias 'type' to its 'replacement'
+        for alias in sup_metadata.findall('.//alias/scriptAlias'):
+            script_aliases[alias.attrib['type']] = alias.attrib['replacement']
+
+        # Variant aliases: entries without a 'replacement' are skipped
+        for alias in sup_metadata.findall('.//alias/variantAlias'):
+            repl = alias.attrib.get('replacement')
+            if repl:
+                variant_aliases[alias.attrib['type']] = repl
+
+        # Likely subtags: map each 'from' identifier to its 'to' expansion
+        for likely_subtag in sup_likely.findall('.//likelySubtags/likelySubtag'):
+            likely_subtags[likely_subtag.attrib['from']] = likely_subtag.attrib['to']
+
         outfile = open(global_path, 'wb')
         try:
             pickle.dump(global_data, outfile, 2)
index 224ebc83949d45254475ab9e6d08867d60ce71c9..7e13bf6b75459e1e15c97c2c91c815cede7546a7 100644 (file)
@@ -91,6 +91,12 @@ class TestLocaleClass:
         de_DE = Locale.parse(l)
         assert (de_DE.language, de_DE.territory) == ('de', 'DE')
 
+    def test_parse_likely_subtags(self):
+        l = Locale.parse('zh-TW', sep='-')  # identifier deliberately has no script
+        assert l.language == 'zh'
+        assert l.territory == 'TW'
+        assert l.script == 'Hant'  # script is expected to be filled in by parse()
+
     def test_get_display_name(self):
         zh_CN = Locale('zh', 'CN', script='Hans')
         assert zh_CN.get_display_name('en') == 'Chinese (Simplified, China)'