Skip to content

Commit f4dfd6c

Browse files
wjones127 authored and jonkeane committed
ARROW-13168: [C++][R] Enable runtime timezone database for Windows
This allows for runtime configuration of the timezone database on Windows for C++ and R. Python will be handled later because its available timezone libraries use the binary rather than the text format, which is not yet supported by the vendored date library. For R, Windows will only support the "C" locale, since (as far as I can tell) that's the only locale supported by the MinGW std::locale implementation. I think R itself gets around this by implementing a completely custom version of `strftime()` and friends. Closes apache#12536 from wjones127/ARROW-13168-timezone-database Authored-by: Will Jones <willjones127@gmail.com> Signed-off-by: Jonathan Keane <jkeane@gmail.com>
1 parent 919d113 commit f4dfd6c

22 files changed

Lines changed: 304 additions & 97 deletions

.github/workflows/cpp.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,9 @@ jobs:
239239
with:
240240
fetch-depth: 0
241241
submodules: recursive
242+
- name: Download Timezone Database
243+
shell: bash
244+
run: ci/scripts/download_tz_database.sh
242245
- name: Build
243246
shell: bash
244247
run: ci/scripts/cpp_build.sh $(pwd) $(pwd)/build
@@ -319,6 +322,9 @@ jobs:
319322
run: |
320323
export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS
321324
ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
325+
- name: Download Timezone Database
326+
shell: bash
327+
run: ci/scripts/download_tz_database.sh
322328
- name: Download MinIO
323329
shell: msys2 {0}
324330
run: |

ci/appveyor-cpp-setup.bat

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,17 @@ powershell.exe -Command "Start-Process clcache-server" || exit /B
115115
if "%ARROW_S3%" == "ON" (
116116
appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/minio.exe -FileName C:\Windows\Minio.exe || exit /B
117117
)
118+
119+
120+
@rem
121+
@rem Download IANA Timezone Database for unit tests
122+
@rem
123+
@rem (Doc section: Download timezone database)
124+
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz
125+
mkdir tzdata
126+
tar --extract --file tzdata.tar.gz --directory tzdata
127+
move tzdata %USERPROFILE%\Downloads\tzdata
128+
@rem Also need Windows timezone mapping
129+
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^
130+
--output %USERPROFILE%\Downloads\tzdata\windowsZones.xml
131+
@rem (Doc section: Download timezone database)

ci/scripts/download_tz_database.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env bash
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
# Fetch the IANA tzdata2021e release and the CLDR Windows zone mapping into
# ~/Downloads/tzdata.
# -e aborts on the first failing command; -x echoes each command as it runs.
set -ex
21+
22+
# Download database
# NOTE(review): curl is called without --fail, so an HTTP error response would
# presumably be written to the output file and only surface later when tar
# tries to extract it - confirm whether --fail/--location should be added.
# NOTE(review): this writes into ~/Downloads before any mkdir runs, so it
# assumes that directory already exists on the runner - TODO confirm.
23+
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output ~/Downloads/tzdata2021e.tar.gz
24+
25+
# Extract
26+
mkdir -p ~/Downloads/tzdata
27+
tar --extract --file ~/Downloads/tzdata2021e.tar.gz --directory ~/Downloads/tzdata
28+
29+
# Download Windows timezone mapping
# The mapping is saved next to the extracted tzdata files. Unlike the tzdata
# archive (fixed 2021e release), this URL points at the CLDR master branch, so
# the downloaded content is not pinned to a specific version.
30+
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml --output ~/Downloads/tzdata/windowsZones.xml

cpp/src/arrow/compute/kernels/scalar_cast_string.cc

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,12 +150,6 @@ struct TemporalToStringCastFunctor<O, TimestampType> {
150150
return Status::OK();
151151
}));
152152
} else {
153-
#ifdef _WIN32
154-
// TODO(ARROW-13168):
155-
return Status::NotImplemented(
156-
"Casting a timestamp with time zone to string is not yet supported on "
157-
"Windows.");
158-
#else
159153
switch (ty.unit()) {
160154
case TimeUnit::SECOND:
161155
RETURN_NOT_OK(ConvertZoned<std::chrono::seconds>(input, timezone, &builder));
@@ -176,7 +170,6 @@ struct TemporalToStringCastFunctor<O, TimestampType> {
176170
DCHECK(false);
177171
return Status::NotImplemented("Unimplemented time unit");
178172
}
179-
#endif
180173
}
181174
std::shared_ptr<Array> output_array;
182175
RETURN_NOT_OK(builder.Finish(&output_array));

cpp/src/arrow/compute/kernels/scalar_cast_test.cc

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "arrow/testing/extension_type.h"
3535
#include "arrow/testing/gtest_util.h"
3636
#include "arrow/testing/random.h"
37+
#include "arrow/testing/util.h"
3738
#include "arrow/type.h"
3839
#include "arrow/type_fwd.h"
3940
#include "arrow/type_traits.h"
@@ -1146,6 +1147,16 @@ constexpr char kTimestampSecondsJson[] =
11461147
constexpr char kTimestampExtremeJson[] =
11471148
R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])";
11481149

1150+
// Test fixture for cast tests that operate on timestamps carrying a time zone.
// When compiled for Windows (_WIN32), SetUp() calls InitTestTimezoneDatabase()
// and fails the test if it does not return OK; on other platforms SetUp() does
// nothing.
class CastTimezone : public ::testing::Test {
1151+
protected:
1152+
void SetUp() override {
1153+
#ifdef _WIN32
1154+
// Initialize timezone database on Windows
1155+
ASSERT_OK(InitTestTimezoneDatabase());
1156+
#endif
1157+
}
1158+
};
1159+
11491160
TEST(Cast, TimestampToDate) {
11501161
// See scalar_temporal_test.cc
11511162
auto timestamps = ArrayFromJSON(timestamp(TimeUnit::NANO), kTimestampJson);
@@ -1181,12 +1192,7 @@ TEST(Cast, TimestampToDate) {
11811192
}
11821193
}
11831194

1184-
TEST(Cast, ZonedTimestampToDate) {
1185-
#ifdef _WIN32
1186-
// TODO(ARROW-13168): we lack tzdb on Windows
1187-
GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows";
1188-
#endif
1189-
1195+
TEST_F(CastTimezone, ZonedTimestampToDate) {
11901196
{
11911197
// See TestZoned in scalar_temporal_test.cc
11921198
auto timestamps =
@@ -1377,12 +1383,7 @@ TEST(Cast, TimestampToTime) {
13771383
}
13781384
}
13791385

1380-
TEST(Cast, ZonedTimestampToTime) {
1381-
#ifdef _WIN32
1382-
// TODO(ARROW-13168): we lack tzdb on Windows
1383-
GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows";
1384-
#endif
1385-
1386+
TEST_F(CastTimezone, ZonedTimestampToTime) {
13861387
CheckCast(ArrayFromJSON(timestamp(TimeUnit::NANO, "Pacific/Marquesas"), kTimestampJson),
13871388
ArrayFromJSON(time64(TimeUnit::NANO), R"([
13881389
52259123456789, 50003999999999, 56480001001001, 65000000000000,
@@ -1573,8 +1574,7 @@ TEST(Cast, TimestampToString) {
15731574
}
15741575
}
15751576

1576-
#ifndef _WIN32
1577-
TEST(Cast, TimestampWithZoneToString) {
1577+
TEST_F(CastTimezone, TimestampWithZoneToString) {
15781578
for (auto string_type : {utf8(), large_utf8()}) {
15791579
CheckCast(
15801580
ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"),
@@ -1608,21 +1608,6 @@ TEST(Cast, TimestampWithZoneToString) {
16081608
R"(["1968-11-30 13:30:44.123456789-0700", "2016-02-29 10:42:23.456789246-0700"])"));
16091609
}
16101610
}
1611-
#else
1612-
// TODO(ARROW-13168): we lack tzdb on Windows
1613-
TEST(Cast, TimestampWithZoneToString) {
1614-
for (auto string_type : {utf8(), large_utf8()}) {
1615-
ASSERT_RAISES(NotImplemented, Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"),
1616-
"[-34226955, 1456767743]"),
1617-
CastOptions::Safe(string_type)));
1618-
1619-
ASSERT_RAISES(NotImplemented,
1620-
Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"),
1621-
"[-34226955, 1456767743]"),
1622-
CastOptions::Safe(string_type)));
1623-
}
1624-
}
1625-
#endif
16261611

16271612
TEST(Cast, DateToDate) {
16281613
auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]");

cpp/src/arrow/compute/kernels/scalar_temporal_test.cc

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "arrow/compute/kernels/test_util.h"
2525
#include "arrow/testing/gtest_util.h"
2626
#include "arrow/testing/matchers.h"
27+
#include "arrow/testing/util.h"
2728
#include "arrow/type.h"
2829
#include "arrow/util/checked_cast.h"
2930
#include "arrow/util/formatting.h"
@@ -407,6 +408,14 @@ class ScalarTemporalTest : public ::testing::Test {
407408
RoundTemporalOptions round_to_15_quarters =
408409
RoundTemporalOptions(15, CalendarUnit::QUARTER);
409410
RoundTemporalOptions round_to_15_years = RoundTemporalOptions(15, CalendarUnit::YEAR);
411+
412+
protected:
413+
// Per-test setup hook. Only has an effect when compiled for Windows (_WIN32),
// where it calls InitTestTimezoneDatabase() and asserts that it returns OK.
void SetUp() override {
414+
#ifdef _WIN32
415+
// Initialize timezone database on Windows
416+
ASSERT_OK(InitTestTimezoneDatabase());
417+
#endif
418+
}
410419
};
411420

412421
TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionAllTemporalTypes) {
@@ -564,8 +573,6 @@ TEST_F(ScalarTemporalTest, TestOutsideNanosecondRange) {
564573
CheckScalarUnary("subsecond", unit, times, float64(), subsecond);
565574
}
566575

567-
#ifndef _WIN32
568-
// TODO: We should test on windows once ARROW-13168 is resolved.
569576
TEST_F(ScalarTemporalTest, TestIsLeapYear) {
570577
auto is_leap_year_marquesas =
571578
"[false, true, false, false, false, false, false, false, false, false, false, "
@@ -792,7 +799,6 @@ TEST_F(ScalarTemporalTest, TestNonexistentTimezone) {
792799
ASSERT_RAISES(Invalid, Subsecond(timestamp_array));
793800
}
794801
}
795-
#endif
796802

797803
TEST_F(ScalarTemporalTest, Week) {
798804
auto unit = timestamp(TimeUnit::NANO);
@@ -1611,8 +1617,6 @@ TEST_F(ScalarTemporalTest, TestTemporalDifferenceErrors) {
16111617
CallFunction("weeks_between", {arr1, arr1}, &options));
16121618
}
16131619

1614-
// TODO: We should test on windows once ARROW-13168 is resolved.
1615-
#ifndef _WIN32
16161620
TEST_F(ScalarTemporalTest, TestAssumeTimezone) {
16171621
std::string timezone_utc = "UTC";
16181622
std::string timezone_kolkata = "Asia/Kolkata";
@@ -1879,6 +1883,9 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) {
18791883
}
18801884

18811885
TEST_F(ScalarTemporalTest, StrftimeOtherLocale) {
1886+
#ifdef _WIN32
1887+
GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)";
1888+
#else
18821889
if (!LocaleExists("fr_FR.UTF-8")) {
18831890
GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system";
18841891
}
@@ -1890,6 +1897,7 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) {
18901897
["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])";
18911898
CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(),
18921899
expected, &options);
1900+
#endif
18931901
}
18941902

18951903
TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) {
@@ -2583,7 +2591,6 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalKolkata) {
25832591
CheckScalarUnary("round_temporal", unit, times, unit, round_1_hours, &round_to_1_hours);
25842592
CheckScalarUnary("round_temporal", unit, times, unit, round_2_hours, &round_to_2_hours);
25852593
}
2586-
#endif // !_WIN32
25872594

25882595
} // namespace compute
25892596
} // namespace arrow

cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,7 +1048,6 @@ struct RoundTemporal {
10481048
// ----------------------------------------------------------------------
10491049
// Convert timestamps to a string representation with an arbitrary format
10501050

1051-
#ifndef _WIN32
10521051
Result<std::locale> GetLocale(const std::string& locale) {
10531052
try {
10541053
return std::locale(locale.c_str());
@@ -1132,18 +1131,6 @@ struct Strftime {
11321131
return Status::OK();
11331132
}
11341133
};
1135-
#else
1136-
// TODO(ARROW-13168)
1137-
template <typename Duration, typename InType>
1138-
struct Strftime {
1139-
static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
1140-
return Status::NotImplemented("Strftime not yet implemented on windows.");
1141-
}
1142-
static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
1143-
return Status::NotImplemented("Strftime not yet implemented on windows.");
1144-
}
1145-
};
1146-
#endif
11471134

11481135
// ----------------------------------------------------------------------
11491136
// Convert string representations of timestamps in arbitrary format to timestamps

cpp/src/arrow/config.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include "arrow/util/config.h"
2323
#include "arrow/util/cpu_info.h"
24+
#include "arrow/vendored/datetime.h"
2425

2526
namespace arrow {
2627

@@ -62,6 +63,8 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
6263
}
6364
}
6465

66+
util::optional<std::string> timezone_db_path;
67+
6568
}; // namespace
6669

6770
const BuildInfo& GetBuildInfo() { return kBuildInfo; }
@@ -73,7 +76,32 @@ RuntimeInfo GetRuntimeInfo() {
7376
MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
7477
info.detected_simd_level =
7578
MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
79+
info.using_os_timezone_db = USE_OS_TZDB;
80+
#if !USE_OS_TZDB
81+
info.timezone_db_path = timezone_db_path;
82+
#else
83+
info.timezone_db_path = util::optional<std::string>();
84+
#endif
7685
return info;
7786
}
7887

88+
// Apply the given global options.
//
// If options.timezone_db_path is unset, nothing is changed and OK is returned.
// If it is set and Arrow was built with USE_OS_TZDB disabled, the vendored date
// library is pointed at that path and its timezone database is reloaded; on
// success the path is remembered in the file-local timezone_db_path (which
// GetRuntimeInfo() reports).
//
// Returns:
// - Status::OK() when no path was given or the database was loaded.
// - Status::IOError carrying the exception message when set_install() /
// reload_tzdb() throws std::runtime_error.
// - Status::Invalid when a path was given but USE_OS_TZDB is enabled.
//
// NOTE(review): the function is declared noexcept but only std::runtime_error
// is caught, and the assignment to timezone_db_path sits outside the try block.
// Any other exception type escaping from here would terminate the process -
// confirm that nothing else can be thrown.
Status Initialize(const GlobalOptions& options) noexcept {
89+
if (options.timezone_db_path.has_value()) {
90+
#if !USE_OS_TZDB
91+
try {
92+
// Point the vendored date library at the user-supplied directory, then
// reload the database from it.
arrow_vendored::date::set_install(options.timezone_db_path.value());
93+
arrow_vendored::date::reload_tzdb();
94+
} catch (const std::runtime_error& e) {
95+
return Status::IOError(e.what());
96+
}
97+
// Reached only when no std::runtime_error was thrown above: record the path.
timezone_db_path = options.timezone_db_path.value();
98+
#else
99+
return Status::Invalid(
100+
"Arrow was set to use OS timezone database at compile time, "
101+
"so a downloaded database cannot be provided at runtime.");
102+
#endif // !USE_OS_TZDB
103+
}
104+
return Status::OK();
105+
}
106+
79107
} // namespace arrow

cpp/src/arrow/config.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
#include <string>
2121

22+
#include "arrow/status.h"
2223
#include "arrow/util/config.h" // IWYU pragma: export
24+
#include "arrow/util/optional.h"
2325
#include "arrow/util/visibility.h"
2426

2527
namespace arrow {
@@ -62,6 +64,13 @@ struct RuntimeInfo {
6264

6365
/// The SIMD level available on the OS and CPU
6466
std::string detected_simd_level;
67+
68+
/// Whether using the OS-based timezone database
69+
/// This is set at compile-time.
70+
bool using_os_timezone_db;
71+
72+
/// The path to the timezone database; by default None.
73+
util::optional<std::string> timezone_db_path;
6574
};
6675

6776
/// \brief Get runtime build info.
@@ -77,4 +86,13 @@ const BuildInfo& GetBuildInfo();
7786
ARROW_EXPORT
7887
RuntimeInfo GetRuntimeInfo();
7988

89+
/// \brief Options accepted by Initialize().
///
/// Currently holds a single optional setting; leaving it unset requests no
/// change.
struct GlobalOptions {
90+
/// Path to text timezone database. This is only configurable on Windows,
91+
/// which does not have a compatible OS timezone database.
92+
util::optional<std::string> timezone_db_path;
93+
};
94+
95+
/// \brief Apply global options such as the timezone database path.
///
/// \param[in] options the options to apply
/// \return Status; the function is declared noexcept and reports failures
/// through the returned Status
ARROW_EXPORT
96+
Status Initialize(const GlobalOptions& options) noexcept;
97+
8098
} // namespace arrow

0 commit comments

Comments
 (0)