From c4ceaf0bde210fc1f891eccf0da3db91452ddef7 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 19:44:50 -0700 Subject: [PATCH] Add support for zzzz (and beyond) in format_datetime (#11330) Summary: This diff adds support for JODA's zzzz (or more) patterns (all equivalent) in Presto's format_datetime function. This is used to format long time zone names. Long time zone names are not available from the IANA time zone database, so we can't use the tz library to generate these. Fortunately, unicode provides some utilities to generate these. Reviewed By: pedroerp Differential Revision: D64795407 --- CMake/resolve_dependency_modules/boost.cmake | 29 +++------- CMakeLists.txt | 19 +++++++ scripts/setup-ubuntu.sh | 3 +- velox/functions/lib/CMakeLists.txt | 3 +- velox/functions/lib/DateTimeFormatter.cpp | 15 +++--- .../prestosql/tests/DateTimeFunctionsTest.cpp | 45 ++++++++++++++-- velox/type/tz/CMakeLists.txt | 4 +- velox/type/tz/TimeZoneMap.cpp | 54 +++++++++++++++++++ velox/type/tz/TimeZoneMap.h | 8 +++ velox/type/tz/tests/TimeZoneMapTest.cpp | 24 +++++++++ 10 files changed, 167 insertions(+), 37 deletions(-) diff --git a/CMake/resolve_dependency_modules/boost.cmake b/CMake/resolve_dependency_modules/boost.cmake index bdf6633d0375..842cba6f1334 100644 --- a/CMake/resolve_dependency_modules/boost.cmake +++ b/CMake/resolve_dependency_modules/boost.cmake @@ -13,30 +13,13 @@ # limitations under the License. include_guard(GLOBAL) -if(CMAKE_SYSTEM_NAME MATCHES "Darwin") - if(ON_APPLE_M1) - list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") - else() - list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") - endif() -endif() - -# ICU is only needed with Boost build from source -set_source(ICU) -resolve_dependency( - ICU - COMPONENTS - data - i18n - io - uc - tu - test) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/boost) -if(${ICU_SOURCE} STREQUAL "BUNDLED") - # ensure ICU is built before Boost - add_dependencies(boost_regex ICU ICU::i18n) + +if(ICU_SOURCE) + if(${ICU_SOURCE} STREQUAL "BUNDLED") + # ensure ICU is built before Boost + add_dependencies(boost_regex ICU ICU::i18n) + endif() endif() # This prevents system boost from leaking in diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ba92170c0f..168eee277044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,25 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + if(ON_APPLE_M1) + list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") + else() + list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") + endif() +endif() + +set_source(ICU) +resolve_dependency( + ICU + COMPONENTS + data + i18n + io + uc + tu + test) + set(BOOST_INCLUDE_LIBRARIES atomic context diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index 5874e98e9651..5822d902e848 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -92,7 +92,6 @@ function install_velox_deps_from_apt { libc-ares-dev \ libcurl4-openssl-dev \ libssl-dev \ - libicu-dev \ libdouble-conversion-dev \ libgoogle-glog-dev \ libbz2-dev \ @@ -111,6 +110,7 @@ function install_velox_deps_from_apt { flex \ libfl-dev \ tzdata + ${SUDO} apt remove -y libicu-dev } function install_fmt { @@ -297,4 +297,3 @@ function install_apt_deps { fi fi ) - diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index bdff97bba2d9..5cee3af311d1 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base) velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp DateTimeFormatterBuilder.cpp) -velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz) +velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz + ICU::i18n ICU::uc) velox_add_library( velox_functions_lib diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index ec4f973213ad..5676a63efc8c 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1209,10 +1209,10 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones size += 5; } else { - VELOX_NYI( - "Date format specifier is not yet implemented: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + // The longest time zone long name is 40, Australian Central Western + // Standard Time. + // https://www.timeanddate.com/time/zones/ + size += 50; } break; @@ -1468,8 +1468,11 @@ int32_t DateTimeFormatter::format( std::memcpy(result, abbrev.data(), abbrev.length()); result += abbrev.length(); } else { - // TODO: implement full name time zone - VELOX_NYI("full time zone name is not yet supported"); + std::string longName = timezone->getLongName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, longName.data(), longName.length()); + result += longName.length(); } } break; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index afa4f5a8ac91..30090173d8f8 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3272,6 +3272,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz")); // Test daylight savings. setQueryTimeZone("America/Los_Angeles"); @@ -3281,16 +3287,47 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z")); EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z")); EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-03-10 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-03-10 03:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-11-03 02:00"), "zzzz")); + + // Test ambiguous time. + EXPECT_EQ( + "PDT", formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzzz")); // Test a long abbreviation. setQueryTimeZone("Asia/Colombo"); EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); - setQueryTimeZone("Asia/Kolkata"); - // We don't support more than 3 'z's yet. - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); + // Test a long long name. + setQueryTimeZone("Australia/Eucla"); + EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Australian Central Western Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''")); diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index b7d96feb1262..614b594000a7 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -23,4 +23,6 @@ velox_link_libraries( velox_external_date Boost::regex fmt::fmt - Folly::folly) + Folly::folly + ICU::i18n + ICU::uc) diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 2b7e8968ed04..b68c88c6be04 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -20,6 +20,16 @@ #include #include #include + +#include +#include +#include + +// The ICU libraries define TRUE/FALSE macros which frequently conflict with +// other libraries that use these as enum/variable names. +#undef TRUE +#undef FALSE + #include "velox/common/base/Exceptions.h" #include "velox/common/testutil/TestValue.h" #include "velox/external/date/tz.h" @@ -379,4 +389,48 @@ std::string TimeZone::getShortName( return getZonedTime(tz_, timePoint, choose).get_info().abbrev; } + +std::string TimeZone::getLongName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + static const icu::Locale locale("en", "US"); + + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name. + if (tz_ == nullptr) { + return timeZoneName_; + } + + // Special case for UTC. ICU uses "GMT" for some reason which is an + // abbreviation. + if (timeZoneID_ == 0) { + return "Coordinated Universal Time"; + } + + // Get the ICU TimeZone by name + std::unique_ptr tz(icu::TimeZone::createTimeZone( + icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length()))); + VELOX_USER_CHECK_NOT_NULL(tz); + + // According to the documentation this is how to determine if DST applies to + // a given timestamp in a given time zone. + // https://howardhinnant.github.io/date/tz.html#sys_info + date::local_time timePoint{timestamp}; + bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save != + std::chrono::minutes(0); + + // Construct the long name for the time zone. + // Note that ICU does not have DST information for many time zones prior to + // 1970, so it's important to specify it explicitly. + icu::UnicodeString longName; + tz->getDisplayName( + isDst, icu::TimeZone::EDisplayType::LONG, locale, longName); + + // Convert the UnicodeString back to a string and write it out + std::string longNameStr; + longName.toUTF8String(longNameStr); + + return longNameStr; +} } // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.h b/velox/type/tz/TimeZoneMap.h index c7e5c0f6087e..c04cc308657b 100644 --- a/velox/type/tz/TimeZoneMap.h +++ b/velox/type/tz/TimeZoneMap.h @@ -160,6 +160,14 @@ class TimeZone { milliseconds timestamp, TChoose choose = TChoose::kFail) const; + /// Returns the long name of the time zone for the given timestamp, e.g. + /// Pacific Standard Time. Note that the timestamp is needed for time zones + /// that support daylight savings time as the long name will change depending + /// on the date (e.g. Pacific Standard Time vs Pacific Daylight Time). + std::string getLongName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + private: const date::time_zone* tz_{nullptr}; const std::chrono::minutes offset_{0}; diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index 0d0fa21cb43c..364fd2baf297 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -259,5 +259,29 @@ TEST(TimeZoneMapTest, getShortName) { EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts)); } +TEST(TimeZoneMapTest, getLongName) { + auto toLongName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getLongName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("Coordinated Universal Time", toLongName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toLongName("+00:01", 0)); + EXPECT_EQ("-00:01", toLongName("-00:01", 0)); + EXPECT_EQ("+01:00", toLongName("+01:00", 0)); + EXPECT_EQ("-01:01", toLongName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("Pacific Daylight Time", toLongName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("Pacific Standard Time", toLongName("America/Los_Angeles", ts)); +} } // namespace } // namespace facebook::velox::tz