From cd70395b0f138b7f60b0866f5b2b164f1a025786 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 4 Jan 2016 21:51:36 +0200 Subject: [PATCH] dates: Add and use fuzzy skeleton matching Based on: * http://www.unicode.org/reports/tr35/tr35-dates.html#availableFormats_appendItems * http://source.icu-project.org/repos/icu/icu4j/trunk/main/classes/core/src/com/ibm/icu/text/DateIntervalInfo.java (method `getBestSkeleton`) --- babel/dates.py | 114 ++++++++++++++++++++++++++++++++--- tests/test_date_intervals.py | 17 ++++-- 2 files changed, 119 insertions(+), 12 deletions(-) diff --git a/babel/dates.py b/babel/dates.py index 3605a451..b41945bc 100644 --- a/babel/dates.py +++ b/babel/dates.py @@ -741,7 +741,7 @@ def format_time(time=None, format='medium', tzinfo=None, locale=LC_TIME): return parse_pattern(format).apply(time, locale) -def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME): +def format_skeleton(skeleton, datetime=None, tzinfo=None, fuzzy=True, locale=LC_TIME): r"""Return a time and/or date formatted according to the given pattern. The skeletons are defined in the CLDR data and provide more flexibility @@ -754,6 +754,12 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME): u'dim. 1 avr.' >>> format_skeleton('MMMEd', t, locale='en') u'Sun, Apr 1' + >>> format_skeleton('yMMd', t, locale='fi') # yMMd is not in the Finnish locale; yMd gets used + u'1.4.2007' + >>> format_skeleton('yMMd', t, fuzzy=False, locale='fi') # yMMd is not in the Finnish locale; a KeyError is raised + Traceback (most recent call last): + ... + KeyError: yMMd After the skeleton is resolved to a pattern `format_datetime` is called so all timezone processing etc is the same as for that. 
@@ -762,9 +768,13 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME): :param datetime: the ``time`` or ``datetime`` object; if `None`, the current time in UTC is used :param tzinfo: the time-zone to apply to the time for display + :param fuzzy: If the skeleton is not found, allow choosing a skeleton that's + close enough to it. :param locale: a `Locale` object or a locale identifier """ locale = Locale.parse(locale) + if fuzzy and skeleton not in locale.datetime_skeletons: + skeleton = match_skeleton(skeleton, locale.datetime_skeletons) format = locale.datetime_skeletons[skeleton] return format_datetime(datetime, format, tzinfo, locale) @@ -905,7 +915,7 @@ def _format_fallback_interval(start, end, skeleton, tzinfo, locale): ) -def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME): +def format_interval(start, end, skeleton=None, tzinfo=None, fuzzy=True, locale=LC_TIME): """ Format an interval between two instants according to the locale's rules. @@ -926,10 +936,23 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME): >>> format_interval(time(16, 18), time(16, 18), "Hm", locale="it") '16:18' + Unknown skeletons fall back to "default" formatting. + + >>> format_interval(date(2015, 1, 1), date(2017, 1, 1), "wzq", locale="ja") + '2015/01/01\uff5e2017/01/01' + + >>> format_interval(time(16, 18), time(16, 24), "xxx", locale="ja") + '16:18:00\uff5e16:24:00' + + >>> format_interval(date(2016, 1, 15), date(2016, 1, 17), "xxx", locale="de") + '15.01.2016 \u2013 17.01.2016' + :param start: First instant (datetime/date/time) :param end: Second instant (datetime/date/time) :param skeleton: The "skeleton format" to use for formatting. :param tzinfo: tzinfo to use (if none is already attached) + :param fuzzy: If the skeleton is not found, allow choosing a skeleton that's + close enough to it. :param locale: A locale object or identifier. 
:return: Formatted interval """ @@ -942,19 +965,26 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME): # > starting in the current locale and then following the locale fallback # > chain up to, but not including root. - if skeleton not in locale.interval_formats: + interval_formats = locale.interval_formats + + if skeleton not in interval_formats or not skeleton: # > If no match was found from the previous step, check what the closest # > match is in the fallback locale chain, as in availableFormats. That # > is, this allows for adjusting the string value field's width, # > including adjusting between "MMM" and "MMMM", and using different # > variants of the same field, such as 'v' and 'z'. - # TODO: Implement closest-match instead of immediately falling back - return _format_fallback_interval(start, end, skeleton, tzinfo, locale) + if skeleton and fuzzy: + skeleton = match_skeleton(skeleton, interval_formats) + else: + skeleton = None + if not skeleton: # Still no match whatsoever? + # > Otherwise, format the start and end datetime using the fallback pattern. + return _format_fallback_interval(start, end, skeleton, tzinfo, locale) - skel_formats = locale.interval_formats[skeleton] + skel_formats = interval_formats[skeleton] if start == end: - return format_skeleton(skeleton, start, tzinfo, locale) + return format_skeleton(skeleton, start, tzinfo, fuzzy=fuzzy, locale=locale) start = _ensure_datetime_tzinfo(_get_datetime(start), tzinfo=tzinfo) end = _ensure_datetime_tzinfo(_get_datetime(end), tzinfo=tzinfo) @@ -1489,3 +1519,73 @@ def split_interval_pattern(pattern): parts[-1].append((tok_type, tok_value)) return [untokenize_pattern(tokens) for tokens in parts] + + +def match_skeleton(skeleton, options, allow_different_fields=False): + """ + Find the closest match for the given datetime skeleton among the options given. + + This uses the rules outlined in the TR35 document. 
+ + >>> match_skeleton('yMMd', ('yMd', 'yMMMd')) + 'yMd' + + >>> match_skeleton('yMMd', ('jyMMd',), allow_different_fields=True) + 'jyMMd' + + >>> match_skeleton('yMMd', ('qyMMd',), allow_different_fields=False) + + >>> match_skeleton('hmz', ('hmv',)) + 'hmv' + + :param skeleton: The skeleton to match + :type skeleton: str + :param options: An iterable of other skeletons to match against + :type options: Iterable[str] + :return: The closest skeleton match, or if no match was found, None. + :rtype: str|None + """ + + # TODO: maybe implement pattern expansion? + + # Based on the implementation in + # http://source.icu-project.org/repos/icu/icu4j/trunk/main/classes/core/src/com/ibm/icu/text/DateIntervalInfo.java + + # Filter out falsy values and sort for stability; when `interval_formats` is passed in, there may be a None key. + options = sorted(option for option in options if option) + + if 'z' in skeleton and not any('z' in option for option in options): + skeleton = skeleton.replace('z', 'v') + + get_input_field_width = dict(t[1] for t in tokenize_pattern(skeleton) if t[0] == "field").get + best_skeleton = None + best_distance = None + for option in options: + get_opt_field_width = dict(t[1] for t in tokenize_pattern(option) if t[0] == "field").get + distance = 0 + for field in PATTERN_CHARS: + input_width = get_input_field_width(field, 0) + opt_width = get_opt_field_width(field, 0) + if input_width == opt_width: + continue + if opt_width == 0 or input_width == 0: + if not allow_different_fields: # This one is not okay + option = None + break + distance += 0x1000 # Magic weight constant for "entirely different fields" + elif field == 'M' and ((input_width > 2 and opt_width <= 2) or (input_width <= 2 and opt_width > 2)): + distance += 0x100 # Magic weight for "text turns into a number" + else: + distance += abs(input_width - opt_width) + + if not option: # We lost the option along the way (probably due to "allow_different_fields") + continue + + if not best_skeleton 
or distance < best_distance: + best_skeleton = option + best_distance = distance + + if distance == 0: # Found a perfect match! + break + + return best_skeleton diff --git a/tests/test_date_intervals.py b/tests/test_date_intervals.py index 73fae462..e5a797a9 100644 --- a/tests/test_date_intervals.py +++ b/tests/test_date_intervals.py @@ -13,25 +13,25 @@ TEST_DATE = TEST_DT.date() def test_format_interval_same_instant_1(): - assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", locale="fi") == "8. tammikuuta 2016" + assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", fuzzy=False, locale="fi") == "8. tammikuuta 2016" def test_format_interval_same_instant_2(): - assert dates.format_interval(TEST_DT, TEST_DT, "xxx", locale="fi") == "8.1.2016 klo 11.46.15" + assert dates.format_interval(TEST_DT, TEST_DT, "xxx", fuzzy=False, locale="fi") == "8.1.2016 klo 11.46.15" def test_format_interval_same_instant_3(): - assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", locale="fi") == "11.46.15" + assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", fuzzy=False, locale="fi") == "11.46.15" def test_format_interval_same_instant_4(): - assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", locale="fi") == "8.1.2016" + assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", fuzzy=False, locale="fi") == "8.1.2016" def test_format_interval_no_difference(): t1 = TEST_DT t2 = t1 + datetime.timedelta(minutes=8) - assert dates.format_interval(t1, t2, "yMd", locale="fi") == "8.1.2016" + assert dates.format_interval(t1, t2, "yMd", fuzzy=False, locale="fi") == "8.1.2016" def test_format_interval_in_tz(): @@ -45,3 +45,10 @@ def test_format_interval_12_hour(): t2 = TEST_DT t1 = t2 - datetime.timedelta(hours=1) assert dates.format_interval(t1, t2, "hm", locale="en") == "10:46 \u2013 11:46 AM" + + +def test_format_interval_invalid_skeleton(): + t1 = TEST_DATE + t2 = TEST_DATE + datetime.timedelta(days=1) + assert dates.format_interval(t1, t2, "mumumu", fuzzy=False, 
locale="fi") == u"8.1.2016\u20139.1.2016" + assert dates.format_interval(t1, t2, fuzzy=False, locale="fi") == u"8.1.2016\u20139.1.2016" -- 2.47.2