From cd70395b0f138b7f60b0866f5b2b164f1a025786 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Mon, 4 Jan 2016 21:51:36 +0200
Subject: [PATCH] dates: Add and use fuzzy skeleton matching

Based on:

* http://www.unicode.org/reports/tr35/tr35-dates.html#availableFormats_appendItems
* http://source.icu-project.org/repos/icu/icu4j/trunk/main/classes/core/src/com/ibm/icu/text/DateIntervalInfo.java
  (method `getBestSkeleton`)
---
 babel/dates.py               | 114 ++++++++++++++++++++++++++++++++---
 tests/test_date_intervals.py |  17 ++++--
 2 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/babel/dates.py b/babel/dates.py
index 3605a451..b41945bc 100644
--- a/babel/dates.py
+++ b/babel/dates.py
@@ -741,7 +741,7 @@ def format_time(time=None, format='medium', tzinfo=None, locale=LC_TIME):
     return parse_pattern(format).apply(time, locale)
 
 
-def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
+def format_skeleton(skeleton, datetime=None, tzinfo=None, fuzzy=True, locale=LC_TIME):
     r"""Return a time and/or date formatted according to the given pattern.
 
     The skeletons are defined in the CLDR data and provide more flexibility
@@ -754,6 +754,12 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
     u'dim. 1 avr.'
     >>> format_skeleton('MMMEd', t, locale='en')
     u'Sun, Apr 1'
+    >>> format_skeleton('yMMd', t, locale='fi')  # yMMd is not in the Finnish locale; yMd gets used
+    u'1.4.2007'
+    >>> format_skeleton('yMMd', t, fuzzy=False, locale='fi')  # yMMd is not in the Finnish locale, an error is thrown
+    Traceback (most recent call last):
+        ...
+    KeyError: yMMd
 
     After the skeleton is resolved to a pattern `format_datetime` is called so
     all timezone processing etc is the same as for that.
@@ -762,9 +768,13 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
     :param datetime: the ``time`` or ``datetime`` object; if `None`, the current
                  time in UTC is used
     :param tzinfo: the time-zone to apply to the time for display
+    :param fuzzy: If the skeleton is not found, allow choosing a skeleton that's
+                  close enough to it.
     :param locale: a `Locale` object or a locale identifier
     """
     locale = Locale.parse(locale)
+    if fuzzy and skeleton not in locale.datetime_skeletons:
+        skeleton = match_skeleton(skeleton, locale.datetime_skeletons)
     format = locale.datetime_skeletons[skeleton]
     return format_datetime(datetime, format, tzinfo, locale)
 
@@ -905,7 +915,7 @@ def _format_fallback_interval(start, end, skeleton, tzinfo, locale):
     )
 
 
-def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
+def format_interval(start, end, skeleton=None, tzinfo=None, fuzzy=True, locale=LC_TIME):
     """
     Format an interval between two instants according to the locale's rules.
 
@@ -926,10 +936,23 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
     >>> format_interval(time(16, 18), time(16, 18), "Hm", locale="it")
     '16:18'
 
+    Unknown skeletons fall back to "default" formatting.
+
+    >>> format_interval(date(2015, 1, 1), date(2017, 1, 1), "wzq", locale="ja")
+    '2015/01/01\uff5e2017/01/01'
+
+    >>> format_interval(time(16, 18), time(16, 24), "xxx", locale="ja")
+    '16:18:00\uff5e16:24:00'
+
+    >>> format_interval(date(2016, 1, 15), date(2016, 1, 17), "xxx", locale="de")
+    '15.01.2016 \u2013 17.01.2016'
+
     :param start: First instant (datetime/date/time)
     :param end: Second instant (datetime/date/time)
     :param skeleton: The "skeleton format" to use for formatting.
     :param tzinfo: tzinfo to use (if none is already attached)
+    :param fuzzy: If the skeleton is not found, allow choosing a skeleton that's
+                  close enough to it.
     :param locale: A locale object or identifier.
     :return: Formatted interval
     """
@@ -942,19 +965,26 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
     # > starting in the current locale and then following the locale fallback
     # > chain up to, but not including root.
 
-    if skeleton not in locale.interval_formats:
+    interval_formats = locale.interval_formats
+
+    if skeleton not in interval_formats or not skeleton:
         # > If no match was found from the previous step, check what the closest
         # > match is in the fallback locale chain, as in availableFormats. That
         # > is, this allows for adjusting the string value field's width,
         # > including adjusting between "MMM" and "MMMM", and using different
         # > variants of the same field, such as 'v' and 'z'.
-        # TODO: Implement closest-match instead of immediately falling back
-        return _format_fallback_interval(start, end, skeleton, tzinfo, locale)
+        if skeleton and fuzzy:
+            skeleton = match_skeleton(skeleton, interval_formats)
+        else:
+            skeleton = None
+        if not skeleton:  # Still no match whatsoever?
+            # > Otherwise, format the start and end datetime using the fallback pattern.
+            return _format_fallback_interval(start, end, skeleton, tzinfo, locale)
 
-    skel_formats = locale.interval_formats[skeleton]
+    skel_formats = interval_formats[skeleton]
 
     if start == end:
-        return format_skeleton(skeleton, start, tzinfo, locale)
+        return format_skeleton(skeleton, start, tzinfo, fuzzy=fuzzy, locale=locale)
 
     start = _ensure_datetime_tzinfo(_get_datetime(start), tzinfo=tzinfo)
     end = _ensure_datetime_tzinfo(_get_datetime(end), tzinfo=tzinfo)
@@ -1489,3 +1519,73 @@ def split_interval_pattern(pattern):
         parts[-1].append((tok_type, tok_value))
 
     return [untokenize_pattern(tokens) for tokens in parts]
+
+
+def match_skeleton(skeleton, options, allow_different_fields=False):
+    """
+    Find the closest match for the given datetime skeleton among the options given.
+
+    This uses the rules outlined in the TR35 document.
+
+    >>> match_skeleton('yMMd', ('yMd', 'yMMMd'))
+    'yMd'
+
+    >>> match_skeleton('yMMd', ('jyMMd',), allow_different_fields=True)
+    'jyMMd'
+
+    >>> match_skeleton('yMMd', ('qyMMd',), allow_different_fields=False)
+
+    >>> match_skeleton('hmz', ('hmv',))
+    'hmv'
+
+    :param skeleton: The skeleton to match
+    :type skeleton: str
+    :param options: An iterable of other skeletons to match against
+    :type options: Iterable[str]
+    :return: The closest skeleton match, or if no match was found, None.
+    :rtype: str|None
+    """
+
+    # TODO: maybe implement pattern expansion?
+
+    # Based on the implementation in
+    # http://source.icu-project.org/repos/icu/icu4j/trunk/main/classes/core/src/com/ibm/icu/text/DateIntervalInfo.java
+
+    # Filter out falsy values and sort for stability; when `interval_formats` is passed in, there may be a None key.
+    options = sorted(option for option in options if option)
+
+    if 'z' in skeleton and not any('z' in option for option in options):
+        skeleton = skeleton.replace('z', 'v')
+
+    get_input_field_width = dict(t[1] for t in tokenize_pattern(skeleton) if t[0] == "field").get
+    best_skeleton = None
+    best_distance = None
+    for option in options:
+        get_opt_field_width = dict(t[1] for t in tokenize_pattern(option) if t[0] == "field").get
+        distance = 0
+        for field in PATTERN_CHARS:
+            input_width = get_input_field_width(field, 0)
+            opt_width = get_opt_field_width(field, 0)
+            if input_width == opt_width:
+                continue
+            if opt_width == 0 or input_width == 0:
+                if not allow_different_fields:  # This one is not okay
+                    option = None
+                    break
+                distance += 0x1000  # Magic weight constant for "entirely different fields"
+            elif field == 'M' and ((input_width > 2 and opt_width <= 2) or (input_width <= 2 and opt_width > 2)):
+                distance += 0x100  # Magic weight for "text turns into a number"
+            else:
+                distance += abs(input_width - opt_width)
+
+        if not option:  # We lost the option along the way (probably due to "allow_different_fields")
+            continue
+
+        if not best_skeleton or distance < best_distance:
+            best_skeleton = option
+            best_distance = distance
+
+        if distance == 0:  # Found a perfect match!
+            break
+
+    return best_skeleton
diff --git a/tests/test_date_intervals.py b/tests/test_date_intervals.py
index 73fae462..e5a797a9 100644
--- a/tests/test_date_intervals.py
+++ b/tests/test_date_intervals.py
@@ -13,25 +13,25 @@ TEST_DATE = TEST_DT.date()
 
 
 def test_format_interval_same_instant_1():
-    assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", locale="fi") == "8. tammikuuta 2016"
+    assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", fuzzy=False, locale="fi") == "8. tammikuuta 2016"
 
 
 def test_format_interval_same_instant_2():
-    assert dates.format_interval(TEST_DT, TEST_DT, "xxx", locale="fi") == "8.1.2016 klo 11.46.15"
+    assert dates.format_interval(TEST_DT, TEST_DT, "xxx", fuzzy=False, locale="fi") == "8.1.2016 klo 11.46.15"
 
 
 def test_format_interval_same_instant_3():
-    assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", locale="fi") == "11.46.15"
+    assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", fuzzy=False, locale="fi") == "11.46.15"
 
 
 def test_format_interval_same_instant_4():
-    assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", locale="fi") == "8.1.2016"
+    assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", fuzzy=False, locale="fi") == "8.1.2016"
 
 
 def test_format_interval_no_difference():
     t1 = TEST_DT
     t2 = t1 + datetime.timedelta(minutes=8)
-    assert dates.format_interval(t1, t2, "yMd", locale="fi") == "8.1.2016"
+    assert dates.format_interval(t1, t2, "yMd", fuzzy=False, locale="fi") == "8.1.2016"
 
 
 def test_format_interval_in_tz():
@@ -45,3 +45,10 @@ def test_format_interval_12_hour():
     t2 = TEST_DT
     t1 = t2 - datetime.timedelta(hours=1)
     assert dates.format_interval(t1, t2, "hm", locale="en") == "10:46 \u2013 11:46 AM"
+
+
+def test_format_interval_invalid_skeleton():
+    t1 = TEST_DATE
+    t2 = TEST_DATE + datetime.timedelta(days=1)
+    assert dates.format_interval(t1, t2, "mumumu", fuzzy=False, locale="fi") == u"8.1.2016\u20139.1.2016"
+    assert dates.format_interval(t1, t2, fuzzy=False, locale="fi") == u"8.1.2016\u20139.1.2016"
-- 
2.47.3