From 2e56a2adef4152f0eea2fd7964452c60f792dcc7 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 17 Mar 2025 15:05:00 +0200 Subject: [PATCH] Improve handling of no-inheritance-marker in timezone data (#1194) * Add icu4c-tools * Improve handling of no-inheritance-marker in timezone data Fixes #1192 (but uncovers another bug) --- babel/dates.py | 6 +- misc/icu4c-tools/.gitignore | 1 + misc/icu4c-tools/Makefile | 3 + misc/icu4c-tools/README.md | 22 ++++++ misc/icu4c-tools/icu4c_date_format.cpp | 101 +++++++++++++++++++++++++ tests/test_dates.py | 30 ++++++++ 6 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 misc/icu4c-tools/.gitignore create mode 100644 misc/icu4c-tools/Makefile create mode 100644 misc/icu4c-tools/README.md create mode 100644 misc/icu4c-tools/icu4c_date_format.cpp diff --git a/babel/dates.py b/babel/dates.py index 355a9236..2ec10b5b 100644 --- a/babel/dates.py +++ b/babel/dates.py @@ -649,7 +649,9 @@ def get_timezone_name( info = locale.time_zones.get(zone, {}) # Try explicitly translated zone names first if width in info and zone_variant in info[width]: - return info[width][zone_variant] + value = info[width][zone_variant] + if value != NO_INHERITANCE_MARKER: + return value metazone = get_global('meta_zones').get(zone) if metazone: @@ -660,7 +662,7 @@ def get_timezone_name( # If the short form is marked no-inheritance, # try to fall back to the long name instead. name = metazone_info.get('long', {}).get(zone_variant) - if name: + if name and name != NO_INHERITANCE_MARKER: return name # If we have a concrete datetime, we assume that the result can't be diff --git a/misc/icu4c-tools/.gitignore b/misc/icu4c-tools/.gitignore new file mode 100644 index 00000000..e660fd93 --- /dev/null +++ b/misc/icu4c-tools/.gitignore @@ -0,0 +1 @@ +bin/ diff --git a/misc/icu4c-tools/Makefile b/misc/icu4c-tools/Makefile new file mode 100644 index 00000000..0f1d5d13 --- /dev/null +++ b/misc/icu4c-tools/Makefile @@ -0,0 +1,3 @@ +bin/icu4c_date_format: icu4c_date_format.cpp + mkdir -p bin + $(CXX) -Wall -std=c++17 -o $@ $^ $(shell pkg-config --cflags --libs icu-uc icu-i18n) diff --git a/misc/icu4c-tools/README.md b/misc/icu4c-tools/README.md new file mode 100644 index 00000000..7cf11c04 --- /dev/null +++ b/misc/icu4c-tools/README.md @@ -0,0 +1,22 @@ +# icu4c-tools + +Some haphazard tools for cross-checking results between ICU4C and Babel. +These are not meant to be production-ready or e.g. guaranteed to not leak memory in any way. + +## icu4c_date_format + +### Compiling + +This worked on my macOS – on a Linux machine, you shouldn't need the `PKG_CONFIG_PATH` environment variable. + +``` +env PKG_CONFIG_PATH="/opt/homebrew/opt/icu4c@76/lib/pkgconfig" make bin/icu4c_date_format +``` + +### Running + +E.g. + +``` +env TEST_TIMEZONES=Pacific/Honolulu TEST_LOCALES=en_US,en,en_GB TEST_TIME_FORMAT="YYYY-MM-dd H:mm zz" bin/icu4c_date_format +``` diff --git a/misc/icu4c-tools/icu4c_date_format.cpp b/misc/icu4c-tools/icu4c_date_format.cpp new file mode 100644 index 00000000..8a6ac28b --- /dev/null +++ b/misc/icu4c-tools/icu4c_date_format.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +static std::vector split(const std::string &s, char delimiter) { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +static UDate parse_time_str(const char *time_str) { + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeString fauxISO8601("yyyy-MM-dd'T'hh:mm:ss'Z'"); + auto fmt = new icu::SimpleDateFormat(fauxISO8601, status); + fmt->setTimeZone(*icu::TimeZone::getGMT()); + UDate date = fmt->parse(icu::UnicodeString(time_str), status); + if (U_FAILURE(status)) { + std::cerr << "Failed to parse time string: " << time_str << std::endl; + exit(1); + } + return date; +} + +static std::vector parse_locales(const char *locales_str) { + auto locales = std::vector{}; + for (auto token : split(locales_str, ',')) { + auto loc = icu::Locale(token.c_str()); + if (loc.isBogus()) { + std::cerr << "Invalid locale: " << token << std::endl; + exit(1); + } + locales.push_back(loc); + } + return locales; +} + +static std::vector parse_timezones(const char *timezones_str) { + auto timezones = std::vector{}; + for (auto token : split(timezones_str, ',')) { + auto tz = icu::TimeZone::createTimeZone(token.c_str()); + if (tz == nullptr) { + std::cerr << "Invalid timezone: " << token << std::endl; + exit(1); + } + timezones.push_back(tz); + } + return timezones; +} + +int main() { + UErrorCode status = U_ZERO_ERROR; + const char *timezones_str = getenv("TEST_TIMEZONES"); + const char *locales_str = getenv("TEST_LOCALES"); + const char *time_str = getenv("TEST_TIME"); + const char *time_format_str = getenv("TEST_TIME_FORMAT"); + + if (!timezones_str || !locales_str) { + std::cerr << "Please set TEST_TIMEZONES, TEST_LOCALES environment variables" + << std::endl; + return 1; + } + + if (time_str == nullptr) { + time_str = "2025-03-04T13:53:00Z"; + std::cerr << "Defaulting TEST_TIME to " << time_str << std::endl; + } + + if (time_format_str == nullptr) { + time_format_str = "z:zz:zzz:zzzz"; + std::cerr << "Defaulting TEST_TIME_FORMAT to " << time_format_str + << std::endl; + } + + auto date = parse_time_str(time_str); + auto timezones = parse_timezones(timezones_str); + auto locales = parse_locales(locales_str); + + for (auto tz : timezones) { + icu::UnicodeString tzid; + tz->getID(tzid); + std::string tzid_str; + tzid.toUTF8String(tzid_str); + for (auto loc : locales) { + auto fmt = new icu::SimpleDateFormat(time_format_str, loc, status); + fmt->setTimeZone(*tz); + icu::UnicodeString name; + fmt->format(date, name); + std::string result; + name.toUTF8String(result); + std::cout << tzid_str << "\t" << loc.getName() << "\t" << result + << std::endl; + delete fmt; + } + } + return 0; +} diff --git a/tests/test_dates.py b/tests/test_dates.py index e47521e4..cd213b7f 100644 --- a/tests/test_dates.py +++ b/tests/test_dates.py @@ -1187,3 +1187,33 @@ def test_issue_1089(): def test_issue_1162(locale, format, negative, expected): delta = timedelta(seconds=10800) * (-1 if negative else +1) assert dates.format_timedelta(delta, add_direction=True, format=format, locale=locale) == expected + + +def test_issue_1192(): + # The actual returned value here is not actually strictly specified ("get_timezone_name" + # is not an operation specified as such). Issue #1192 concerned this invocation returning + # the invalid "no inheritance marker" value; _that_ should never be returned here. + # IOW, if the below "Hawaii-Aleutian Time" changes with e.g. CLDR updates, that's fine. + assert dates.get_timezone_name('Pacific/Honolulu', 'short', locale='en_GB') == "Hawaii-Aleutian Time" + + +@pytest.mark.xfail +def test_issue_1192_fmt(timezone_getter): + """ + There is an issue in how we format the fallback for z/zz in the absence of data + (esp. with the no inheritance marker present). + This test is marked xfail until that's fixed. + """ + # env TEST_TIMEZONES=Pacific/Honolulu TEST_LOCALES=en_US,en_GB TEST_TIME_FORMAT="YYYY-MM-dd H:mm z" bin/icu4c_date_format + # Defaulting TEST_TIME to 2025-03-04T13:53:00Z + # Pacific/Honolulu en_US 2025-03-04 3:53 HST + # Pacific/Honolulu en_GB 2025-03-04 3:53 GMT-10 + # env TEST_TIMEZONES=Pacific/Honolulu TEST_LOCALES=en_US,en_GB TEST_TIME_FORMAT="YYYY-MM-dd H:mm zz" bin/icu4c_date_format + # Pacific/Honolulu en_US 2025-03-04 3:53 HST + # Pacific/Honolulu en_GB 2025-03-04 3:53 GMT-10 + tz = timezone_getter("Pacific/Honolulu") + dt = _localize(tz, datetime(2025, 3, 4, 13, 53, tzinfo=UTC)) + assert dates.format_datetime(dt, "YYYY-MM-dd H:mm z", locale="en_US") == "2025-03-04 3:53 HST" + assert dates.format_datetime(dt, "YYYY-MM-dd H:mm z", locale="en_GB") == "2025-03-04 3:53 GMT-10" + assert dates.format_datetime(dt, "YYYY-MM-dd H:mm zz", locale="en_US") == "2025-03-04 3:53 HST" + assert dates.format_datetime(dt, "YYYY-MM-dd H:mm zz", locale="en_GB") == "2025-03-04 3:53 GMT-10" -- 2.47.2