Skip to content

Commit

Permalink
Add support for zzzz (and beyond) in format_datetime (facebookincubat…
Browse files Browse the repository at this point in the history
…or#11330)

Summary:

This diff adds support for JODA's zzzz (or more) patterns (all equivalent) in Presto's
format_datetime function.

This is used to format long time zone names.

Long time zone names are not available from the IANA time zone database, so we
can't use the tz library to generate these.  Fortunately, unicode provides some
utilities to generate these.

Reviewed By: pedroerp

Differential Revision: D64795407
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Oct 28, 2024
1 parent 12f46cf commit 988ffe7
Show file tree
Hide file tree
Showing 9 changed files with 166 additions and 35 deletions.
29 changes: 6 additions & 23 deletions CMake/resolve_dependency_modules/boost.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,13 @@
# limitations under the License.
include_guard(GLOBAL)

if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
if(ON_APPLE_M1)
list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c")
else()
list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c")
endif()
endif()

# ICU is only needed with Boost build from source
set_source(ICU)
resolve_dependency(
ICU
COMPONENTS
data
i18n
io
uc
tu
test)

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/boost)
if(${ICU_SOURCE} STREQUAL "BUNDLED")
# ensure ICU is built before Boost
add_dependencies(boost_regex ICU ICU::i18n)

if(ICU_SOURCE)
if(${ICU_SOURCE} STREQUAL "BUNDLED")
# ensure ICU is built before Boost
add_dependencies(boost_regex ICU ICU::i18n)
endif()
endif()

# This prevents system boost from leaking in
Expand Down
19 changes: 19 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,25 @@ add_compile_definitions(FOLLY_HAVE_INT128_T=1)
set_source(folly)
resolve_dependency(folly)

if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
if(ON_APPLE_M1)
list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c")
else()
list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c")
endif()
endif()

set_source(ICU)
resolve_dependency(
ICU
COMPONENTS
data
i18n
io
uc
tu
test)

if(${VELOX_BUILD_TESTING})
# Spark qury runner depends on absl, gRPC.
set_source(absl)
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base)
velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp
DateTimeFormatterBuilder.cpp)

velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz)
velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz
ICU::i18n ICU::uc)

velox_add_library(
velox_functions_lib
Expand Down
15 changes: 9 additions & 6 deletions velox/functions/lib/DateTimeFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1209,10 +1209,10 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const {
// https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
size += 5;
} else {
VELOX_NYI(
"Date format specifier is not yet implemented: {} ({})",
getSpecifierName(token.pattern.specifier),
token.pattern.minRepresentDigits);
// The longest time zone long name is 40, Australian Central Western
// Standard Time.
// https://www.timeanddate.com/time/zones/
size += 50;
}

break;
Expand Down Expand Up @@ -1468,8 +1468,11 @@ int32_t DateTimeFormatter::format(
std::memcpy(result, abbrev.data(), abbrev.length());
result += abbrev.length();
} else {
// TODO: implement full name time zone
VELOX_NYI("full time zone name is not yet supported");
std::string longName = timezone->getLongName(
std::chrono::milliseconds(timestamp.toMillis()),
tz::TimeZone::TChoose::kEarliest);
std::memcpy(result, longName.data(), longName.length());
result += longName.length();
}
} break;

Expand Down
45 changes: 41 additions & 4 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3272,6 +3272,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) {
EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz"));
EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz"));
EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z"));
EXPECT_EQ(
"India Standard Time",
formatDatetime(parseTimestamp("1970-01-01"), "zzzz"));
EXPECT_EQ(
"India Standard Time",
formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz"));

// Test daylight savings.
setQueryTimeZone("America/Los_Angeles");
Expand All @@ -3281,16 +3287,47 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) {
EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z"));
EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z"));
EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z"));
EXPECT_EQ(
"Pacific Standard Time",
formatDatetime(parseTimestamp("1970-01-01"), "zzzz"));
EXPECT_EQ(
"Pacific Daylight Time",
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));
EXPECT_EQ(
"Pacific Standard Time",
formatDatetime(parseTimestamp("2024-03-10 01:00"), "zzzz"));
EXPECT_EQ(
"Pacific Daylight Time",
formatDatetime(parseTimestamp("2024-03-10 03:00"), "zzzz"));
EXPECT_EQ(
"Pacific Daylight Time",
formatDatetime(parseTimestamp("2024-11-03 01:00"), "zzzz"));
EXPECT_EQ(
"Pacific Standard Time",
formatDatetime(parseTimestamp("2024-11-03 02:00"), "zzzz"));

// Test ambiguous time.
EXPECT_EQ(
"PDT", formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzz"));
EXPECT_EQ(
"Pacific Daylight Time",
formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzzz"));

// Test a long abbreviation.
setQueryTimeZone("Asia/Colombo");
EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z"));
EXPECT_EQ(
"India Standard Time",
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));

setQueryTimeZone("Asia/Kolkata");
// We don't support more than 3 'z's yet.
EXPECT_THROW(
formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError);
// Test a long long name.
setQueryTimeZone("Australia/Eucla");
EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z"));
EXPECT_EQ(
"Australian Central Western Standard Time",
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));

setQueryTimeZone("Asia/Kolkata");
// Literal test cases.
EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'"));
EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''"));
Expand Down
4 changes: 3 additions & 1 deletion velox/type/tz/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,6 @@ velox_link_libraries(
velox_external_date
Boost::regex
fmt::fmt
Folly::folly)
Folly::folly
ICU::i18n
ICU::uc)
54 changes: 54 additions & 0 deletions velox/type/tz/TimeZoneMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@
#include <fmt/core.h>
#include <folly/container/F14Map.h>
#include <folly/container/F14Set.h>

#include <unicode/locid.h>
#include <unicode/timezone.h>
#include <unicode/unistr.h>

// The ICU libraries define TRUE/FALSE macros which frequently conflict with
// other libraries that use these as enum/variable names.
#undef TRUE
#undef FALSE

#include "velox/common/base/Exceptions.h"
#include "velox/common/testutil/TestValue.h"
#include "velox/external/date/tz.h"
Expand Down Expand Up @@ -379,4 +389,48 @@ std::string TimeZone::getShortName(

return getZonedTime(tz_, timePoint, choose).get_info().abbrev;
}

std::string TimeZone::getLongName(
TimeZone::milliseconds timestamp,
TimeZone::TChoose choose) const {
static const icu::Locale locale("en", "US");

validateRange(date::sys_time<milliseconds>(timestamp));

// Time zone offsets only have one name.
if (tz_ == nullptr) {
return timeZoneName_;
}

// Special case for UTC. ICU uses "GMT" for some reason which is an
// abbreviation.
if (timeZoneID_ == 0) {
return "Coordinated Universal Time";
}

// Get the ICU TimeZone by name
std::unique_ptr<icu::TimeZone> tz(icu::TimeZone::createTimeZone(
icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length())));
VELOX_USER_CHECK_NOT_NULL(tz);

// According to the documentation this is how to determine if DST applies to
// a given timestamp in a given time zone.
// https://howardhinnant.github.io/date/tz.html#sys_info
date::local_time<milliseconds> timePoint{timestamp};
bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save !=
std::chrono::minutes(0);

// Construct the long name for the time zone.
// Note that ICU does not have DST information for many time zones prior to
// 1970, so it's important to specify it explicitly.
icu::UnicodeString longName;
tz->getDisplayName(
isDst, icu::TimeZone::EDisplayType::LONG, locale, longName);

// Convert the UnicodeString back to a string and write it out
std::string longNameStr;
longName.toUTF8String(longNameStr);

return longNameStr;
}
} // namespace facebook::velox::tz
8 changes: 8 additions & 0 deletions velox/type/tz/TimeZoneMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ class TimeZone {
milliseconds timestamp,
TChoose choose = TChoose::kFail) const;

/// Returns the long name of the time zone for the given timestamp, e.g.
/// Pacific Standard Time. Note that the timestamp is needed for time zones
/// that support daylight savings time as the long name will change depending
/// on the date (e.g. Pacific Standard Time vs Pacific Daylight Time).
std::string getLongName(
milliseconds timestamp,
TChoose choose = TChoose::kFail) const;

private:
const date::time_zone* tz_{nullptr};
const std::chrono::minutes offset_{0};
Expand Down
24 changes: 24 additions & 0 deletions velox/type/tz/tests/TimeZoneMapTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,29 @@ TEST(TimeZoneMapTest, getShortName) {
EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts));
}

TEST(TimeZoneMapTest, getLongName) {
auto toLongName = [&](std::string_view name, size_t ts) {
const auto* tz = locateZone(name);
EXPECT_NE(tz, nullptr);
return tz->getLongName(milliseconds{ts});
};

// Test an offset that maps to an actual time zone.
EXPECT_EQ("Coordinated Universal Time", toLongName("+00:00", 0));

// Test offsets that do not map to named time zones.
EXPECT_EQ("+00:01", toLongName("+00:01", 0));
EXPECT_EQ("-00:01", toLongName("-00:01", 0));
EXPECT_EQ("+01:00", toLongName("+01:00", 0));
EXPECT_EQ("-01:01", toLongName("-01:01", 0));

// In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07).
size_t ts = 1721890800000;
EXPECT_EQ("Pacific Daylight Time", toLongName("America/Los_Angeles", ts));

// In "2024-01-01", it was not (UTC-08).
ts = 1704096000000;
EXPECT_EQ("Pacific Standard Time", toLongName("America/Los_Angeles", ts));
}
} // namespace
} // namespace facebook::velox::tz

0 comments on commit 988ffe7

Please sign in to comment.