Skip to content

Commit 61d7cd3

Browse files
westonpace authored and lidavidm committed
ARROW-11883: [C++] Add ConcatMap, MergeMap, and an async-reentrant version of Map
These items can all stand on their own, and they are used by the async datasets conversion. MergeMap - Given AsyncGenerator<AsyncGenerator<T>>, return AsyncGenerator<T>. This method flattens a generator of generators into a generator of items. It may reorder the items. ConcatMap - Same as MergeMap, but it will only pull items from one inner subscription at a time. This reduced parallelism allows items to be returned in order. Async-reentrant Map - In some cases the map function is slow. Even if the source is not async-reentrant, this map can still be async-reentrant by allowing multiple instances of the map function to run at once. The resulting mapped generator is async-reentrant, but it will not pull reentrantly from the source. Vector utilities - In order to make migrating from Iterator code to vector code easier, I added some map-style utilities. These copy the vectors (where an iterator wouldn't), so some care should be taken, but they can still be useful. Moved Future/AsyncGenerator into top-level type_fwd. This is needed for the RecordBatchGenerator alias in the same way Iterator is needed at the top level. Added `IsEnd` to `IterationTraits`. This allows non-comparable types to be iterated on. It allows us to create an AsyncGenerator<AsyncGenerator<T>>, since AsyncGenerator is std::function and we can use an empty instance as an end token even though std::function is not comparable. Closes apache#9643 from westonpace/feature/arrow-11883 Authored-by: Weston Pace <weston.pace@gmail.com> Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent 8e43f23 commit 61d7cd3

18 files changed

Lines changed: 1835 additions & 694 deletions

cpp/src/arrow/csv/reader.cc

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -154,19 +154,12 @@ struct CSVBlock {
154154
template <>
155155
struct IterationTraits<csv::CSVBlock> {
156156
static csv::CSVBlock End() { return csv::CSVBlock{{}, {}, {}, -1, true, {}}; }
157+
static bool IsEnd(const csv::CSVBlock& val) { return val.block_index < 0; }
157158
};
158159

159160
namespace csv {
160161
namespace {
161162

162-
// The == operator must be defined to be used as T in Iterator<T>
163-
bool operator==(const CSVBlock& left, const CSVBlock& right) {
164-
return left.block_index == right.block_index;
165-
}
166-
bool operator!=(const CSVBlock& left, const CSVBlock& right) {
167-
return left.block_index != right.block_index;
168-
}
169-
170163
// This is a callable that can be used to transform an iterator. The source iterator
171164
// will contain buffers of data and the output iterator will contain delimited CSV
172165
// blocks. util::optional is used so that there is an end token (required by the
@@ -731,7 +724,7 @@ class SerialStreamingReader : public BaseStreamingReader {
731724

732725
if (!source_eof_) {
733726
ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator_.Next());
734-
if (maybe_block != IterationTraits<CSVBlock>::End()) {
727+
if (!IsIterationEnd(maybe_block)) {
735728
last_block_index_ = maybe_block.block_index;
736729
auto maybe_parsed = ParseAndInsert(maybe_block.partial, maybe_block.completion,
737730
maybe_block.buffer, maybe_block.block_index,
@@ -813,7 +806,7 @@ class SerialTableReader : public BaseTableReader {
813806
RETURN_NOT_OK(stop_token_.Poll());
814807

815808
ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator.Next());
816-
if (maybe_block == IterationTraits<CSVBlock>::End()) {
809+
if (IsIterationEnd(maybe_block)) {
817810
// EOF
818811
break;
819812
}
@@ -865,7 +858,7 @@ class AsyncThreadedTableReader
865858

866859
auto transferred_it = MakeTransferredGenerator(bg_it, cpu_executor_);
867860

868-
int32_t block_queue_size = std::max(2, cpu_executor_->GetCapacity());
861+
int32_t block_queue_size = cpu_executor_->GetCapacity();
869862
auto rh_it =
870863
MakeSerialReadaheadGenerator(std::move(transferred_it), block_queue_size);
871864
buffer_generator_ = CSVBufferIterator::MakeAsync(std::move(rh_it));

cpp/src/arrow/testing/future_util.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
// unit test anyways.
2727
#define ASSERT_FINISHES_IMPL(fut) \
2828
do { \
29-
ASSERT_TRUE(fut.Wait(10)); \
29+
ASSERT_TRUE(fut.Wait(300)); \
3030
if (!fut.is_finished()) { \
3131
FAIL() << "Future did not finish in a timely fashion"; \
3232
} \
@@ -35,11 +35,11 @@
3535
#define ASSERT_FINISHES_OK(expr) \
3636
do { \
3737
auto&& _fut = (expr); \
38-
ASSERT_TRUE(_fut.Wait(10)); \
38+
ASSERT_TRUE(_fut.Wait(300)); \
3939
if (!_fut.is_finished()) { \
4040
FAIL() << "Future did not finish in a timely fashion"; \
4141
} \
42-
auto _st = _fut.status(); \
42+
auto& _st = _fut.status(); \
4343
if (!_st.ok()) { \
4444
FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \
4545
} \

cpp/src/arrow/testing/gtest_util.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@
4949
#include "arrow/table.h"
5050
#include "arrow/type.h"
5151
#include "arrow/util/checked_cast.h"
52+
#include "arrow/util/future.h"
5253
#include "arrow/util/io_util.h"
5354
#include "arrow/util/logging.h"
55+
#include "arrow/util/windows_compatibility.h"
5456

5557
namespace arrow {
5658

@@ -597,13 +599,58 @@ void SleepFor(double seconds) {
597599
std::chrono::nanoseconds(static_cast<int64_t>(seconds * 1e9)));
598600
}
599601

602+
#ifdef _WIN32
603+
void SleepABit() {
604+
LARGE_INTEGER freq, start, now;
605+
QueryPerformanceFrequency(&freq);
606+
// 1 ms
607+
auto desired = freq.QuadPart / 1000;
608+
if (desired <= 0) {
609+
// Fallback to STL sleep if high resolution clock not available, tests may fail,
610+
// shouldn't really happen
611+
SleepFor(1e-3);
612+
return;
613+
}
614+
QueryPerformanceCounter(&start);
615+
while (true) {
616+
std::this_thread::yield();
617+
QueryPerformanceCounter(&now);
618+
auto elapsed = now.QuadPart - start.QuadPart;
619+
if (elapsed > desired) {
620+
break;
621+
}
622+
}
623+
}
624+
#else
625+
// std::this_thread::sleep_for should be high enough resolution on non-Windows systems
626+
void SleepABit() { SleepFor(1e-3); }
627+
#endif
628+
600629
void BusyWait(double seconds, std::function<bool()> predicate) {
601630
const double period = 0.001;
602631
for (int i = 0; !predicate() && i * period < seconds; ++i) {
603632
SleepFor(period);
604633
}
605634
}
606635

636+
Future<> SleepAsync(double seconds) {
637+
auto out = Future<>::Make();
638+
std::thread([out, seconds]() mutable {
639+
SleepFor(seconds);
640+
out.MarkFinished(Status::OK());
641+
}).detach();
642+
return out;
643+
}
644+
645+
Future<> SleepABitAsync() {
646+
auto out = Future<>::Make();
647+
std::thread([out]() mutable {
648+
SleepABit();
649+
out.MarkFinished(Status::OK());
650+
}).detach();
651+
return out;
652+
}
653+
607654
///////////////////////////////////////////////////////////////////////////
608655
// Extension types
609656

cpp/src/arrow/testing/gtest_util.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,10 +436,24 @@ inline void BitmapFromVector(const std::vector<T>& is_valid,
436436
ARROW_TESTING_EXPORT
437437
void SleepFor(double seconds);
438438

439+
// Sleeps for a very small amount of time. The thread will be yielded
440+
// at least once ensuring that context switches could happen. It is intended
441+
// to be used for stress testing parallel code and shouldn't be assumed to do any
442+
// reliable timing.
443+
ARROW_TESTING_EXPORT
444+
void SleepABit();
445+
439446
// Wait until predicate is true or timeout in seconds expires.
440447
ARROW_TESTING_EXPORT
441448
void BusyWait(double seconds, std::function<bool()> predicate);
442449

450+
ARROW_TESTING_EXPORT
451+
Future<> SleepAsync(double seconds);
452+
453+
// \see SleepABit
454+
ARROW_TESTING_EXPORT
455+
Future<> SleepABitAsync();
456+
443457
template <typename T>
444458
std::vector<T> IteratorToVector(Iterator<T> iterator) {
445459
EXPECT_OK_AND_ASSIGN(auto out, iterator.ToVector());

cpp/src/arrow/type_fwd.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ class Result;
3535

3636
class Status;
3737

38+
namespace detail {
39+
struct Empty;
40+
}
41+
template <typename T = detail::Empty>
42+
class Future;
43+
3844
namespace util {
3945
class Codec;
4046
} // namespace util

cpp/src/arrow/util/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ endif()
4141
add_arrow_test(utility-test
4242
SOURCES
4343
align_util_test.cc
44+
async_generator_test.cc
4445
bit_block_counter_test.cc
4546
bit_util_test.cc
4647
cache_test.cc
@@ -60,6 +61,7 @@ add_arrow_test(utility-test
6061
stl_util_test.cc
6162
string_test.cc
6263
tdigest_test.cc
64+
test_common.cc
6365
time_test.cc
6466
trie_test.cc
6567
uri_test.cc

cpp/src/arrow/util/algorithm.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include "arrow/result.h"
21+
22+
namespace arrow {
23+
24+
template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
25+
Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
26+
UnaryOperation unary_op) {
27+
for (; first != last; ++first, (void)++out) {
28+
ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
29+
}
30+
return Status::OK();
31+
}
32+
33+
} // namespace arrow

0 commit comments

Comments
 (0)